In [3]:
###########################
### MODULES NECESSAIRES ###
###########################


# Import the necessary libraries
import numpy as np
import pandas as pd
import math
import fonctionsSupervisedLearning1 as fsl

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

from auxFonctions import AminoAcid

In [4]:
# Read data from a file into a list of entries
with open('data/SIG_13.red', 'r') as file:
    entries = file.read().split('\n   ')



In [5]:
##############################
## RECUPERATION DES DONNÉES ##
##############################


# Process each entry
processed_entries = [fsl.process_entry(entry) for entry in entries]

# Create a DataFrame
df = pd.DataFrame(processed_entries)

# Get the position of the cleavage site
df['position'] = df['Annotation'].apply(lambda x: x.find('C'))

# Split the primary structure into a list of amino acids
amino_acid_seq = df['Primary Structure'].apply(lambda x: list(x))

## Traitement des données

Put words in vector and vice versa

In [6]:
df_exploitable = fsl.convert_df_to_vectors(df)
df_exploitable.to_csv('data/df_exploitable.csv', index=False)

In [7]:
def create_model_pq(df, kernel = 'rbf', C = 10, p = 2, q = 13, random_state = 42):
    gamma = 0.1
    
    X_train, X_test, pos_train, pos_test = fsl.test_train_split_random_pos(df, p+q, random_state=random_state)
    pos_train = np.array(pos_train==p)
    print(X_train[pos_train==1])
    pos_test = np.array(pos_test==p)
    svm_model = svm.SVC(kernel=kernel, C=C,gamma=gamma, random_state=random_state)
    svm_model.fit(X_train, pos_train)
    pos_predict = svm_model.predict(X_test)
    accuracy = accuracy_score(pos_test,pos_predict)
    # accuracy = 0
    # svm_model = 0
    return svm_model,accuracy

def find_cleavage_pq (X, svm_model, p:int = 2, q:int = 13, nb_letters = 26):
    '''
    find the position of the cleavage site in the primary structure using two SVM models
    /!\ the models must be trained before using this function with the same n and nb_letters as the ones used in this function
    ### Parameters:
    - X: the primary structure as a vector
    - svm_model_in: the SVM model that predicts if the subsequence contains the cleavage site
    - svm_model_pos: the SVM model that predicts the position of the cleavage site in the subsequence
    - threshold: the threshold for the confidence of the prediction
    ### Returns:
    - the position of the cleavage site if the prediction is confident enough, otherwise Nan
    '''
    positions = []
    for i in range(p*nb_letters, len(X)- q*nb_letters, nb_letters):
        test_sub = X[i-p*nb_letters :i + q*nb_letters]
        
        if svm_model.predict([test_sub]):
            position = i//nb_letters
            # positions.append(position.item())
            return position
    return math.nan



Train

In [8]:
#extract neighboorhood p+q of cleavage site
def extract_neighboorhood(df, p:int = 2, q:int = 13, nb_letters = 26):
    '''
    extract the neighborhood of the cleavage site in the primary structure
    ### Parameters:
    - df: the dataframe containing the primary structure
    - p: the number of amino acids before the cleavage site
    - q: the number of amino acids after the cleavage site
    - nb_letters: the number of amino acids in the alphabet
    ### Returns:
    - the dataframe with the neighborhood of the cleavage site
    '''
    df_neigh = df.copy()
    for i in range(df.shape[0]):
        X = df.iloc[i]['Primary Structure']
        position = df.iloc[i]['position']
        if position >= p and position <= len(X)-q:
            neigh = X[(position-p):(position+q)]

            df_neigh.at[i,'Primary Structure'] = neigh
            df_neigh.at[i,'complete'] = True

        elif position >= p:
        # complete with Xs if the neighborhood is not complete
            neigh = X[position-p:] + ['X']*(p+q-len(X[position-p:]))
            df_neigh.at[i,'Primary Structure'] = neigh
            df_neigh.at[i,'complete'] = False
            
        else:
            neigh = ['X']*(p-len(X[:position])) + X[:position+q]
            df_neigh.at[i,'Primary Structure'] = neigh
            df_neigh.at[i,'complete'] = False
    # set cleavage to true to all the lines
    df_neigh['cleavage'] = True
    #delete Protein ID and  Annotation position columns
    df_neigh = df_neigh.drop(columns=['Protein ID','position','Annotation'])

   

    df_neigh = df_neigh[df_neigh['complete'] == True].drop(columns=['complete'])
    return df_neigh


df_neigh = extract_neighboorhood(df, p=2, q=13, nb_letters=26)
df_neigh = fsl.convert_df_to_vectors(df_neigh)
# df_neigh = df_neigh[df_neigh['complete'] == True]
df_neigh


Unnamed: 0,Primary Structure,cleavage,P_Structure_vector
0,IARHQQRQQQQNQCQ,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
1,LSQIEQQSPWEFQGS,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,WAGSHSMRYFYTSVS,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,SAAPANQFIKTSCTL,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,NASIYRTVVEFEEDD,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
1403,FAQDFCSNAQHSGQK,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
1404,AACTYTIDSEWSTGF,True,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1405,FAEEPEDGNDGIPRL,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
1406,VAATSTVTGGYAQSD,True,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
def extract_random_from_row(row, p:int, q, nb_letters:int=26):
    '''
    Extract a random subsequence of length n from the primary structure and the annotation
    ### Parameters:
    - row: a row of the dataframe
    - p+q: the length of the subsequence
    - nb_letters: the number of letters in the alphabet
    ### Returns:
    - a pandas series containing the subsequence of the primary structure'''
    n = p+q
    max_start_index = max(0, len(row['Primary Structure']) - n)  # Calculate the maximum possible start index
    if max_start_index == 0:
        start_index = 0  # if chain is too short, start at the beginning
    else:
        start_index = np.random.randint(0, max_start_index)  # Randomly select a start index
    end_index = start_index + n  # Calculer l'indice de fin

 # Calculate the position of the cleavage site in the subsequence
    cleavage = False  # Initialize the cleavage variable
    if row['position'] == p :  # If the cleavage site is in the right place in the subsequence
        cleavage = True
         # If the cleavage site is not in the subsequence, set it to Nan
        

    return pd.Series([row['Primary Structure'][start_index:end_index], cleavage], index=['Primary Structure','cleavage'])

def extract_random_sequence(df, p, q):
    '''
    extract random sequences of length p+q from the primary structure
    ### Parameters:
    - df: the dataframe containing the primary structure
    - p: the number of amino acids before the cleavage site
    - q: the number of amino acids after the cleavage site
    ### Returns:
    - the dataframe with random sequences of length p+q
    '''
    df_random = df.apply(lambda x: extract_random_from_row(x, p,q), axis=1)
    return df_random

df_random = extract_random_sequence(df, p=2, q=13)
df_random = fsl.convert_df_to_vectors(df_random)
df_random



Unnamed: 0,Primary Structure,cleavage,P_Structure_vector
0,KATLLLAFTLLFATC,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,FLCLAVFINGCLSQI,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
2,RYFYTSVSRPGRGEP,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,SALLVILAAASAAPA,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,DDASNPVGPRQRCQK,False,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
1403,AFAQDFCSNAQHSGQ,False,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1404,STGAALAILSQAASA,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1405,PYAFAEEPEDGNDGI,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1406,LAFTAGTSVAATSTV,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
def train_test_split2 (df_neigh, df_random, test_size = 0.2, random_state = 42):
    '''
    split the dataframe into a training and a testing set and convert it to vectors
    ### Parameters:
    - df_neigh: the dataframe containing the neighborhood of the cleavage site
    - df_random: the dataframe containing random sequences
    - test_size: the proportion of the testing set
    - random_state: the random state
    ### Returns:
    - the training and testing sets
    '''
    #concatenate the two dataframes
    df = pd.concat([df_neigh, df_random])
    #split the dataframe into a training and a testing set
    df_vectors = fsl.convert_df_to_vectors(df)

    X = np.array(df_vectors['P_Structure_vector'].tolist())
    y = np.array(df_vectors['cleavage'].tolist())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

def train_test_split_full(df, p, q, test_size = 0.2, random_state = 42, nb_neg = 2):
    '''
    split the dataframe into a training and a testing set
    ### Parameters:
    - df: the dataframe containing the neighborhood of the cleavage site
    - test_size: the proportion of the testing set
    - random_state: the random state
    ### Returns:
    - the training and testing sets
    '''
    df_neigh = extract_neighboorhood(df, p, q)
    df_random = extract_random_sequence(df, p, q)
    for i in range(nb_neg-1):
        df_random = pd.concat([df_random, extract_random_sequence(df, p, q)])
    return train_test_split2(df_neigh, df_random, test_size, random_state)

In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'kernel': ['rbf', 'poly', 'sigmoid']  # Different types of kernels
}
X_train, X_test, y_train, y_test = train_test_split_full(df, p=2, q=13, test_size=0.2, random_state=42)

# Initialize the classifier
svm_model = svm.SVC(random_state=42, class_weight='balanced')  # Using balanced class weights

# Setup the GridSearchCV
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)

# Assuming X_train and in_train have been defined as your features and target variable respectively
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print("Best parameters for inclusion model:", grid_search.best_params_)
print("Best score for inclusion model:", grid_search.best_score_)


Fitting 5 folds for each of 60 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
# Parcourir la chaine de caractère pour trouver le cleavage site avec le modèle et le max de séparation
def find_cleavage (X, svm_model, p:int = 2, q:int = 13, nb_letters = 26):
    '''
    find the position of the cleavage site in the primary structure using two SVM models
    /!\ the models must be trained before using this function with the same n and nb_letters as the ones used in this function
    ### Parameters:
    - X: the primary structure as a vector
    - svm_model_in: the SVM model that predicts if the subsequence contains the cleavage site
    - svm_model_pos: the SVM model that predicts the position of the cleavage site in the subsequence
    - threshold: the threshold for the confidence of the prediction
    ### Returns:
    - the position of the cleavage site if the prediction is confident enough, otherwise Nan
    '''
    max_dist = 0
    max_position = None
    for i in range(p*nb_letters, len(X)- q*nb_letters, nb_letters):
        test_sub = X[i-p*nb_letters :i + q*nb_letters]
        
        dist = svm_model.decision_function([test_sub])
        if abs(dist) > max_dist:
            max_dist = abs(dist)
            max_position = i//nb_letters

    return max_position if max_dist > 0  else None  # return None if no prediction is confident enough

#test on all the dataframe
def test_all(df, svm_model, p:int = 2, q:int = 13, nb_letters = 26):
    '''
    test the model on all the dataframe
    ### Parameters:
    - df: the dataframe containing the primary structure
    - svm_model: the SVM model
    - p: the number of amino acids before the cleavage site
    - q: the number of amino acids after the cleavage site
    - nb_letters: the number of amino acids in the alphabet
    ### Returns:
    - the dataframe with the predictions
    '''
    df_test = fsl.convert_df_to_vectors(df)
    df_test['prediction'] = df_test['P_Structure_vector'].apply(lambda x: find_cleavage(x, svm_model, p, q, nb_letters))
    accuracy_score = sum(df_test['position'] == df_test['prediction'])/df_test.shape[0]
    return df_test, accuracy_score

df_test, accuracy_score = test_all(df, best_model, p=2, q=13, nb_letters=26)
print(accuracy_score)

0.006392045454545455


In [None]:
X_train, X_test, y_train, y_test = train_test_split_full(df, p=2, q=13, test_size=0.2, random_state=42,nb_neg=5)
best_model.fit(X_train, y_train)
#confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, best_model.predict(X_test))

print("score" , best_model.score(X_test, y_test))





score 0.8633136094674556


In [None]:
df_neigh = extract_neighboorhood(df, p=2, q=13, nb_letters=26)
df_neigh = fsl.convert_df_to_vectors(df_neigh)

predict = best_model.predict(df_neigh['P_Structure_vector'].tolist())
print("Frequency of predicted cleavage sites in the neighborhood:", sum(predict)/len(predict))

df_random = extract_random_sequence(df, p=2, q=13)
df_random = fsl.convert_df_to_vectors(df_random)

predict_random = best_model.predict(df_random['P_Structure_vector'].tolist())
print("Frequency of predicted cleavage sites in random sequences:", sum(predict_random)/len(predict_random))

Frequency of predicted cleavage sites in the neighborhood: 0.9360795454545454
Frequency of predicted cleavage sites in random sequences: 0.10582386363636363
