In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import sklearn.pipeline as pipe
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.svm import SVC
import nltk



In [3]:
#using pandas to import the original training_text and traning_variant
sms = pd.read_table('training_rare_text.txt', encoding= 'UTF8', sep="\\|\\|",header=None, engine='python', names=['cli_data'])
sms2= pd.read_table('training_variants.txt', encoding= 'UTF8', header=None,delimiter=',', engine='python', names=['gene','mutation','classification'])

In [303]:
sms.shape

(3321, 1)

In [304]:
#a glimpse of the type, doesn't matter
type(sms)

pandas.core.frame.DataFrame

In [305]:
sms2.shape

(3321, 3)

In [306]:
#the first 10 lines of the text file
sms.head(10)

Unnamed: 0,cli_data
0,Cyclin dependent kinases CDKs regulate variety...
1,Abstract Background Non small lung NSCLC heter...
2,Abstract Background Non small lung NSCLC heter...
3,Recent evidence demonstrated acquired uniparen...
4,Oncogenic monomeric Casitas B lineage lymphoma...
5,Oncogenic monomeric Casitas B lineage lymphoma...
6,Oncogenic monomeric Casitas B lineage lymphoma...
7,CBL negative regulator activated receptor tyro...
8,Abstract Juvenile myelomonocytic leukemia JMML...
9,Abstract Juvenile myelomonocytic leukemia JMML...


In [307]:
#first 10 lines of the variants file
sms2.head(10)

Unnamed: 0,gene,mutation,classification
0,FAM58A,Truncating Mutations,1
1,CBL,W802*,2
2,CBL,Q249E,2
3,CBL,N454D,3
4,CBL,L399V,4
5,CBL,V391I,4
6,CBL,V430M,5
7,CBL,Deletion,1
8,CBL,Y371H,4
9,CBL,C384R,4


In [308]:
#the classification is very imbalanced use this dictinaty as the "class-weight" parameter when do model fitting
weighted=sms2.classification.value_counts()
weight_dict=dict()
for key in weighted.keys():
    weight_dict[key]=weighted[key]/3311
print(weight_dict)

{7: 0.2878284506191483, 4: 0.20718816067653276, 1: 0.17154938085170643, 2: 0.13651464814255512, 6: 0.083056478405315617, 5: 0.073089700996677748, 3: 0.026880096647538508, 9: 0.01117487163998792, 8: 0.0057384475989127152}


In [5]:
#define the X and y to be feed to the model
X = sms.cli_data
y = sms2.classification

In [315]:
#split the training data into "train" and "test" when run the model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2490,)
(831,)
(2490,)
(831,)


In [6]:
#using kfold to split the data
kf=KFold(n_splits=3)
array_nb=[]
array_lr=[]
array_svc=[]
score_nb=0
score_lr=0
score_svc=0
i=0
for train_indices,test_indices in kf.split(X,y):
    X_train=X[train_indices]
    y_train=y[train_indices] 
    X_test=X[test_indices] 
    y_test=y[test_indices] 
    print(len(X_train))
    
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    
    weighted=y_train.value_counts()
    weight_dict=dict()
    for key in weighted.keys():
        weight_dict[key]=weighted[key]/len(X_train)
    
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class_nb = nb.predict(X_test_dtm)
    score_nb+=metrics.accuracy_score(y_test, y_pred_class_nb)
    array_nb.append(metrics.accuracy_score(y_test, y_pred_class_nb))
    
    logreg = LogisticRegression(class_weight=weight_dict)
    logreg.fit(X_train_dtm, y_train)
    y_pred_class_lr = logreg.predict(X_test_dtm)
    score_lr+=metrics.accuracy_score(y_test, y_pred_class_lr)
    array_lr.append(metrics.accuracy_score(y_test, y_pred_class_lr))
    
    i=i+1



    

2214
2214
2214


In [7]:

print(score_nb)
print(score_lr)
print(score_svc)
print(i)
print(array_nb)
print(array_lr)
print(array_svc)

1.05962059621
1.10840108401
0
3
[0.32971996386630531, 0.4065040650406504, 0.32339656729900634]
[0.34869015356820232, 0.35953026196928634, 0.40018066847335138]
[]


In [9]:
accuracy_nb=score_nb/i
accuracy_lr=score_lr/i
print(accuracy_nb)
print(accuracy_lr)


0.353206865402
0.369467028004


In [None]:
#clf = pipe.make_pipeline(TfidfVectorizer(), MultinomialNB())
#clf.fit_tansform(X,y)
#score=cross_val_score(clf, X, y, cv=10, scoring='accuracy')
#print(score)

In [316]:
#create the vectorized instance
vect = CountVectorizer()

In [317]:
#fit--learn the train vocabulary, as "features";transform is to create a "document term matrix"
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<2490x137273 sparse matrix of type '<class 'numpy.int64'>'
	with 4129644 stored elements in Compressed Sparse Row format>

In [318]:
#do the same thing to test file 
X_test_dtm = vect.transform(X_test)
X_test_dtm 

<831x137273 sparse matrix of type '<class 'numpy.int64'>'
	with 1290716 stored elements in Compressed Sparse Row format>

In [319]:
#use naive bayes model to test
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [320]:
#accuracy using cross-validation
y_pred_class_nb = nb.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred_class_nb)

0.58604091456077012

In [None]:
#the first 10 lines of  prediction
print(y_pred_class_nb[:10])

In [None]:
metrics.confusion_matrix(y_test, y_pred_class_nb)
#class 2&7 1&4 are easy to get confused

In [None]:
#first 10 lines of classification prediction probality 
y_pred_prob = nb.predict_proba(X_test_dtm)[:10, 0]
y_pred_prob

In [321]:
#use logistic regression model to test
logreg = LogisticRegression(class_weight=weight_dict)
logreg.fit(X_train_dtm, y_train)

LogisticRegression(C=1.0,
          class_weight={7: 0.2878284506191483, 4: 0.20718816067653276, 1: 0.17154938085170643, 2: 0.13651464814255512, 6: 0.083056478405315617, 5: 0.073089700996677748, 3: 0.026880096647538508, 9: 0.01117487163998792, 8: 0.0057384475989127152},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
          random_state=None, solver='liblinear', tol=0.0001, verbose=0,
          warm_start=False)

In [322]:
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
#predicted probiblity of each class for the first 10 lines
y_pred_prob_lr = logreg.predict_proba(X_test_dtm)[:10]
print(y_pred_prob_lr)

In [323]:
#the accuracy using log_regression model
metrics.accuracy_score(y_test, y_pred_class)

0.63297232250300839

In [324]:
#the inaccuray mainly happened between class 1&4, 2&7
metrics.confusion_matrix(y_test, y_pred_class)

array([[ 82,   3,   0,  36,   6,   7,   8,   0,   0],
       [  5,  57,   0,   4,   0,   4,  47,   0,   0],
       [  1,   0,   1,  10,   1,   0,  14,   0,   0],
       [ 24,   1,   1, 125,  12,   2,   5,   0,   0],
       [ 15,   0,   0,   9,  10,   5,  11,   0,   0],
       [  7,   5,   1,   2,   3,  40,   3,   0,   0],
       [  5,  31,   0,   5,   5,   2, 202,   0,   0],
       [  0,   0,   0,   0,   0,   0,   2,   3,   0],
       [  0,   0,   0,   2,   0,   0,   1,   0,   6]])

In [None]:
svc=SVC(class_weight=weight_dict)
svc.fit(X_train_dtm,y_train)

In [None]:
y_pred_class_svc=svc.predict(X_test_dtm)

In [None]:
#the accuary of svc model is the lowest
metrics.accuracy_score(y_test, y_pred_class_svc)

In [None]:
#use KNN model to test

In [None]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_dtm, y_train)

In [None]:
y_pred_class_knn= knn.predict(X_test_dtm)

In [None]:
#get the accuracy using knn model
metrics.accuracy_score(y_test, y_pred_class_knn)