In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.cluster import KMeans
#import shogun as sg
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from strkernel.mismatch_kernel import preprocess, MismatchKernel
from strkernel.motifkernel import motifKernel

In [3]:
df = pd.read_csv('../data/raw/exercise_data/C_elegans_acc_seq.csv',header=None,names=['labels','sequences'])

In [4]:
df_human = pd.read_csv('../data/raw/exercise_data/human_dna_train_split.csv')

In [5]:
df_human.head()

Unnamed: 0,sequences,labels
0,TTGTGTCCTACTTTTGTCCATTTGGAAAAATAATTGCATGACTACA...,-1
1,CTTTCCTTTATTTCTTCGTCAACTTAATATCCTTAGCAAAACAGGA...,-1
2,TACTTAAGAGGGGTAAGAAATATATAAACTAGTGCAACATTTTTCA...,-1
3,TAGGTTTCCAAGCAGCCCATTCCTGCCTGGCACCACAGGGATCCAT...,-1
4,GCATGAGCCACTGCGCCTGGCCTGGTTCATTGCTTCTTAGTGATGC...,-1


In [6]:
df.head()

Unnamed: 0,labels,sequences
0,1.0,ACTGGGATAATTTGAAACAATAAATTTTTTTTTGAATTGTAGGTGT...
1,1.0,ATTGATTGAATATTAATTGTTATTTGACGTTATTTTTTAAAGAACT...
2,1.0,TTTAAACTTCGATTTTTTTCAAATAAAACATATTTTTTTCAGCCAG...
3,1.0,TAGCCAGATTTTTAGCAGGTTTTAGCAGAAAAACGTTTTCAGACGA...
4,1.0,TAAACCGCCGATTCTTAAAATTAATTTTTCTTTCTTTTTCAGATGA...


In [7]:
len(df_human), len(df)

(500000, 2200)

In [8]:
def get_train_test_data(df, test_size = 0.2):
    labels = df.labels.values
    seq = df.sequences.values
    train, test, y_train, y_test = train_test_split(seq,labels,stratify=labels,random_state=42,test_size=test_size)
    return train, test, y_train, y_test

In [9]:
train, test, y_train, y_test = get_train_test_data(df, test_size=0.2)

In [29]:
kernel = motifKernel(train.tolist()).compute_matrix(train.tolist())

In [30]:
clf = SVC()
y_test[y_test == -1] = 0
clf.fit(kernel, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
print(metrics.classification_report(y_train,clf.predict(kernel)))

              precision    recall  f1-score   support

         0.0       0.91      1.00      0.95      1600
         1.0       0.00      0.00      0.00       160

    accuracy                           0.91      1760
   macro avg       0.45      0.50      0.48      1760
weighted avg       0.83      0.91      0.87      1760



  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
features_train = sg.StringCharFeatures(train.tolist(), sg.DNA)
WDSK = sg.WeightedDegreeStringKernel(5)
GMSK = sg.GaussianMatchStringKernel()
combined_kernel = sg.CombinedKernel()
combined_kernel.append_kernel(WDSK)
combined_kernel.append_kernel(GMSK)
combined_kernel.init(features_train, features_train)
features_test = sg.StringCharFeatures(test.tolist(),sg.DNA)

In [22]:
labels = sg.BinaryLabels(y_train)
libsvm = sg.LibSVM()
svm = sg.MKLClassification()
svm.set_interleaved_optimization_enabled(False)
svm.set_kernel(combined_kernel)
svm.set_labels(labels)
svm.train()

SystemError: [1;31m[ERROR][0m In file /home/conda/feedstock_root/build_artifacts/shogun-cpp_1540034967896/work/src/shogun/classifier/mkl/MKL.cpp line 896: Assertion R >= 0 failed!


In [15]:
preds_train = svm.apply(features_train).get_labels()
preds_train[preds_train == -1] = 0
y_train[y_train==-1] = 0
preds_test = svm.apply(features_test).get_labels()
preds_test[preds_test == -1] = 0
y_test[y_test==-1] = 0

SystemError: [1;31m[ERROR][0m In file /home/conda/feedstock_root/build_artifacts/shogun-cpp_1540034967896/work/src/shogun/kernel/Kernel.cpp line 118: Right hand side of features (CombinedFeatures) must be compatible with left hand side features (StringFeatures)


In [24]:
print(metrics.classification_report(y_test,preds_test,digits=3))

              precision    recall  f1-score   support

         0.0      0.980     0.998     0.989       400
         1.0      0.970     0.800     0.877        40

    accuracy                          0.980       440
   macro avg      0.975     0.899     0.933       440
weighted avg      0.979     0.980     0.979       440



In [25]:
print(metrics.classification_report(y_train,preds_train,digits=3))

              precision    recall  f1-score   support

         0.0      0.993     0.999     0.996      1600
         1.0      0.993     0.931     0.961       160

    accuracy                          0.993      1760
   macro avg      0.993     0.965     0.979      1760
weighted avg      0.993     0.993     0.993      1760



In [10]:
seq_single = [[s_ for s_ in s] for s in seq]

In [11]:
data = np.array(seq_single)
data.shape

(2200, 82)

In [12]:
vectorizer = CountVectorizer(analyzer='char',lowercase=False,ngram_range=(1,1))
X = vectorizer.fit_transform(seq)

In [13]:
X.shape

(2200, 4)

In [14]:
X[0,:].todense()

matrix([[24, 10, 19, 29]])

In [15]:
X = X/X.sum(axis=1)

In [16]:
X[0,:]

matrix([[0.29268293, 0.12195122, 0.23170732, 0.35365854]])

In [17]:
train, test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=42,test_size=0.2)

In [18]:
counts = np.unique(y_test,return_counts=True)[1]
NIR = counts.max()/counts.sum()
NIR

0.9090909090909091

In [45]:
def my_kernel(x,y):
    d_ij = rel_entr(x+1e-6,y+1e-6)
    d_ji = rel_entr(y+1e-6,x+1e-6)
    return np.dot(d_ij, d_ji.T)

In [46]:
model = SVC(kernel='linear',random_state=42)

In [51]:
model.fit(train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale',
    kernel=<function my_kernel at 0x7fe54ac05430>, max_iter=-1,
    probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [49]:
cross_val_score(model,train,y_train,cv=5,n_jobs=-1)

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.