In [1]:
###########################
### MODULES NECESSAIRES ###
###########################


# Import the necessary libraries
import numpy as np
import pandas as pd
import math
import fonctionsSupervisedLearning as fsl

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

from auxFonctions import AminoAcid

In [2]:
# Read data from a file into a list of entries
with open('data/SIG_13.red', 'r') as file:
    entries = file.read().split('\n   ')



In [3]:
##############################
## RECUPERATION DES DONNÉES ##
##############################


# Process each entry
processed_entries = [fsl.process_entry(entry) for entry in entries]

# Create a DataFrame
df = pd.DataFrame(processed_entries)

# Get the position of the cleavage site
cleavage_site_position = df['Annotation'].apply(lambda x: x.find('C'))

# Split the primary structure into a list of amino acids
amino_acid_seq = df['Primary Structure'].apply(lambda x: list(x))

## Traitement des données

Put words in vector and vice versa

In [4]:
df_exploitable = fsl.convert_df_to_vectors(df)

In [24]:
def create_model_pq(df, kernel = 'linear', C = 1, p = 2, q = 13, random_state = 42):
    
    X_train, X_test, pos_train, pos_test = fsl.test_train_split_random_pos(df, p+q, random_state=random_state)
    pos_train = np.array(pos_train==p)
    print(X_train[pos_train==1])
    pos_test = np.array(pos_test==p)
    svm_model = svm.SVC(kernel=kernel, C=C, random_state=random_state)
    svm_model.fit(X_train, pos_train)
    pos_predict = svm_model.predict(X_test)
    accuracy = accuracy_score(pos_test,pos_predict)
    # accuracy = 0
    # svm_model = 0
    return svm_model,accuracy

def find_cleavage_pq (X, svm_model, p:int = 2, q:int = 13, nb_letters = 26):
    '''
    find the position of the cleavage site in the primary structure using two SVM models
    /!\ the models must be trained before using this function with the same n and nb_letters as the ones used in this function
    ### Parameters:
    - X: the primary structure as a vector
    - svm_model_in: the SVM model that predicts if the subsequence contains the cleavage site
    - svm_model_pos: the SVM model that predicts the position of the cleavage site in the subsequence
    - threshold: the threshold for the confidence of the prediction
    ### Returns:
    - the position of the cleavage site if the prediction is confident enough, otherwise Nan
    '''
    positions = []
    for i in range(p*nb_letters, len(X)- q*nb_letters, nb_letters):
        test_sub = X[i-p*nb_letters :i + q*nb_letters]
        
        if svm_model.predict([test_sub]):
            position = i//nb_letters
            # positions.append(position.item())
            return position
    return math.nan



Train

In [25]:
# svm_model_in, svm_model_pos, accuracy_in, accuracy_pos = fsl.create_model(12, df_exploitable, random_state=42, nb_letters = 26, kernel_in = 'rbf', kernel_pos = 'linear', C_in = 1, C_pos = 1)

# df_exploitable_100 = df_exploitable.iloc[:,1:100]

model, accuracy = create_model_pq(df_exploitable)
print(accuracy)


[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0.9468085106382979


In [22]:
# print(fsl.find_cleavage(test, threshold = 0.1))
pos_pred = np.array([find_cleavage_pq(X,model) for X in df_exploitable['P_Structure_vector']])

in_pred = 1-np.isnan(pos_pred)

pos_pred[in_pred == 0] = 0

print(accuracy_score(df_exploitable["Annotation_pos"], pos_pred))

0.11860795454545454
