In [39]:
# Imports
from strkernel.mismatch_kernel import MismatchKernel
from strkernel.mismatch_kernel import MismatchTrie
from strkernel.mismatch_kernel import preprocess

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

In [3]:
# Functions
#Parse a chirality string into its two integer components
def toInt(chirality_string):
    """Return the two integers embedded in a chirality string as a list.

    The string is split on the characters '(', ')' and ';'. The second and
    third pieces of that split are converted with int() and returned as
    [first, second]. For example "(10;2)" gives [10, 2].

    Raises IndexError if the split yields fewer than three pieces and
    ValueError if a selected piece is not a valid integer literal.
    """
    pieces = re.split(r"[();]", chirality_string)
    # Pieces 1 and 2 hold the numbers; piece 0 is whatever precedes the '('.
    return [int(pieces[position]) for position in (1, 2)]

#Convert a collection of chirality strings into a NumPy array
def getChirality(chirality_list):
    """Parse every chirality string with toInt and stack the results.

    chirality_list is any iterable of chirality strings. Each element is
    passed to toInt, and the collected results are handed to np.array,
    which is returned.
    """
    parsed_pairs = []
    for chirality_string in chirality_list:
        parsed_pairs.append(toInt(chirality_string))
    return np.array(parsed_pairs)

In [48]:
# Read the training data from its CSV file (path is relative to the notebook's
# working directory) into a DataFrame.
training_set = pd.read_csv(filepath_or_buffer='../Data/training_set.csv')

0      TTTCTCCTCTCT
1      TCTTTCCTCTCT
2      TCTCTTCTCTCT
3      TCTCTCTTCTCT
4      TCTCTCCTTTCT
           ...     
97     TTTCTCCCCCCT
98     CCCCCCCCCCCT
99     CCCCCCCCCTTC
100    TTTTTCCCCCCC
101    TTCTCCCCCCCT
Name: Sequence, Length: 102, dtype: object

In [5]:
# Encode the labels as integers: 'Y' -> 1, anything else -> 0.
# A value that is already 1 stays 1, so re-running this cell without
# reloading the CSV no longer turns every positive label into 0.
training_set['Label'] = training_set['Label'].apply(lambda x: 1 if x in ('Y', 1) else 0)
# Parse the chirality strings into an array with one pair of integers per row
chirality = getChirality(training_set['Chirality'])
# Dataframe with the chirality, one column per parsed integer
chirality_df = pd.DataFrame(chirality, columns=['m', 'n'])

In [75]:
# Run strkernel's preprocess step on the sequence column
x = preprocess(training_set['Sequence'])
# Build the mismatch kernel for the preprocessed sequences and keep its
# `kernel` attribute.
# NOTE(review): l, k and m are presumably the alphabet size, the k-mer length
# and the number of allowed mismatches -- confirm against the strkernel docs.
kernels = MismatchKernel(l=4, k=5, m=1).get_kernel(x).kernel

In [121]:
# Wrap the kernel values in a DataFrame and keep every 12th column
# (labels 0, 12, 24, ...). The strided slice selects by column position, so it
# does not depend on the number of rows matching the number of columns, and it
# builds the reduced frame in one step instead of dropping columns one at a
# time in place. `.copy()` makes the result independent of the full frame.
kernels_df = pd.DataFrame(kernels).iloc[:, ::12].copy()
kernels_df.head(10)

Unnamed: 0,0,12,24,36,48,60,72,84,96,108,...,852,864,876,888,900,912,924,936,948,960
0,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
1,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
2,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
3,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
4,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
5,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
6,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
7,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
8,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777
9,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.491354,0.312622,0.338531,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777


In [122]:
# Place the kernel columns, the chirality columns and the label column side by
# side in a single frame (rows are aligned on the index).
data = pd.concat(
    objs=[kernels_df, chirality_df, training_set['Label']],
    axis='columns',
)
data.head(n=10)

Unnamed: 0,0,12,24,36,48,60,72,84,96,108,...,888,900,912,924,936,948,960,m,n,Label
0,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,10,2,N
1,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,10,3,N
2,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,11,1,N
3,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,8,5,N
4,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,9,6,N
5,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,9,9,N
6,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,7,3,N
7,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,8,5,N
8,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,11,2,N
9,1.0,0.758947,0.715055,0.715055,0.769488,0.86,0.890871,0.79,0.709039,0.709039,...,0.353089,0.680738,0.322126,0.41,0.484934,0.362392,0.176777,8,8,N


In [123]:
# Split the data: 80% train / 20% test, with a fixed seed for reproducibility.
# The feature frame mixes integer column labels (kernel columns) with string
# labels ('m', 'n'); scikit-learn rejects mixed-type feature names, so every
# feature label is cast to str before splitting.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(columns=['Label']).rename(columns=str),
    data['Label'],
    test_size=0.2,
    random_state=42,
)
# Model: support vector classifier with a linear kernel
model = SVC(kernel='linear')
# Train (fit returns the estimator, which is displayed as the cell output)
model.fit(X_train, Y_train)



SVC(kernel='linear')