In [1]:
# Third-party imports for the whole notebook, kept together in the first cell.
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [3]:
# Load the two raw training files with pandas: `training_text` goes into a
# single text column, `training_variants` into gene / mutation / classification.
# NOTE(review): the head() output of `sms` shows every text still prefixed with
# "<row id>||", so the id ends up inside `cli_data` -- consider stripping it.
sms = pd.read_table(
    'training_text',
    header=None,
    names=['cli_data'],
)
sms2 = pd.read_table(
    'training_variants',
    header=None,
    delimiter=',',
    names=['gene', 'mutation', 'classification'],
)

In [4]:
# (rows, columns) of the text frame.
sms.shape

(3321, 1)

In [5]:
# Sanity check of the object type returned by read_table; not needed for the analysis.
type(sms)

pandas.core.frame.DataFrame

In [6]:
# (rows, columns) of the variants frame; the row count should match the text frame.
sms2.shape

(3321, 3)

In [7]:
# Preview the first 10 rows of the text frame.
sms.head(10)

Unnamed: 0,cli_data
0,0||Cyclin-dependent kinases (CDKs) regulate a ...
1,1|| Abstract Background Non-small cell lung c...
2,2|| Abstract Background Non-small cell lung c...
3,3||Recent evidence has demonstrated that acqui...
4,4||Oncogenic mutations in the monomeric Casita...
5,5||Oncogenic mutations in the monomeric Casita...
6,6||Oncogenic mutations in the monomeric Casita...
7,7||CBL is a negative regulator of activated re...
8,8|| Abstract Juvenile myelomonocytic leukemia ...
9,9|| Abstract Juvenile myelomonocytic leukemia ...


In [8]:
# Preview the first 10 rows of the variants frame.
sms2.head(10)

Unnamed: 0,gene,mutation,classification
0,FAM58A,Truncating Mutations,1
1,CBL,W802*,2
2,CBL,Q249E,2
3,CBL,N454D,3
4,CBL,L399V,4
5,CBL,V391I,4
6,CBL,V430M,5
7,CBL,Deletion,1
8,CBL,Y371H,4
9,CBL,C384R,4


In [9]:
# Number of rows per class label in the variants frame (exploratory only).
sms2.classification.value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: classification, dtype: int64

In [10]:
# Define the model inputs: X is the text column, y is the class label column.
X = sms['cli_data']
y = sms2['classification']
# Both shapes are printed, one per line, to confirm X and y have the same length.
print(X.shape, y.shape, sep='\n')

(3321,)
(3321,)


In [11]:
# Split the labelled data into a training part and a held-out test part;
# random_state=1 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Print the four shapes, one per line, to check the split sizes.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(2490,)
(831,)
(2490,)
(831,)


In [12]:
# Create the CountVectorizer instance with its default settings.
vect = CountVectorizer()

In [13]:
# fit: learn the vocabulary (the "features") from the training texts only;
# transform: build the document-term matrix for those same texts.
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<2490x118452 sparse matrix of type '<class 'numpy.int64'>'
	with 3737377 stored elements in Compressed Sparse Row format>

In [14]:
# Transform the test texts with the already-fitted vectorizer (no re-fit),
# so the test matrix has the same columns as the training matrix.
X_test_dtm = vect.transform(X_test)
X_test_dtm 

<831x118452 sparse matrix of type '<class 'numpy.int64'>'
	with 1204606 stored elements in Compressed Sparse Row format>

In [15]:
# Fit a multinomial Naive Bayes classifier on the training document-term matrix.
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
# Predict classes for the test set with Naive Bayes and report the accuracy.
y_pred_class_nb = nb.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred_class_nb)

0.58844765342960292

In [17]:
# Confusion matrix for the Naive Bayes predictions (rows: true class,
# columns: predicted class). Uses `y_pred_class_nb`; the bare `y_pred_class`
# name is only assigned later, by the logistic-regression cell.
metrics.confusion_matrix(y_test, y_pred_class_nb)

array([[ 88,   9,   0,  12,  18,   5,  10,   0,   0],
       [  4,  59,   0,   5,   1,   1,  47,   0,   0],
       [  4,   1,  10,   2,   3,   1,   6,   0,   0],
       [ 57,   1,   1,  86,  16,   3,   5,   0,   1],
       [  9,   2,   0,   3,  24,   3,   8,   0,   1],
       [ 10,   3,   1,   0,   6,  36,   5,   0,   0],
       [  6,  58,   3,   1,   4,   1, 177,   0,   0],
       [  1,   1,   0,   0,   0,   0,   1,   1,   1],
       [  0,   0,   0,   0,   0,   0,   1,   0,   8]])

In [21]:
# Predicted probabilities from the Naive Bayes model for the test set.
# NOTE(review): `[:, 1]` keeps only the second probability column. With the nine
# class labels seen in the value_counts output this is presumably the column for
# class 2 -- confirm against `nb.classes_`. The slice looks carried over from a
# binary-classification example; drop it if the full probability matrix is wanted.
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([  0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         6.60548934e-095,   0.00000000e+000,   1.27072817e-094,
         0.00000000e+000,   1.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   3.19557943e-007,   0.00000000e+000,
         0.00000000e+000,   2.48625739e-039,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   1.00000000e+000,
         1.02078755e-150,   8.00694260e-222,   2.23828874e-091,
         3.62100539e-074,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   6.11464884e-001,
         1.20962243e-111,   0.00000000e+000,   0.00000000e+000,
         1.11142998e-217,   0.00000000e+000,   0.00000000e+000,
         0.00000000e+000,   0.00000000e+000,   5.42273348e-205,
         0.00000000e+000,   8.14726214e-074,   1.00000000e+000,
         1.00000000e+000,   5.56764140e-294,   8.87809204e-134,
         0.00000000e+000,   0.00000000e+000,   0.00000000e+000,
         1.02005087e-146,   1.00000000e+

In [22]:
# Fit a logistic regression classifier (default settings) on the same training matrix.
logreg = LogisticRegression()
logreg.fit(X_train_dtm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
# Predicted classes for the test set from the logistic regression model.
y_pred_class = logreg.predict(X_test_dtm)

In [31]:
# Test-set accuracy of the logistic regression model.
metrics.accuracy_score(y_test, y_pred_class)

0.62334536702767751

In [None]:
# Use a k-nearest-neighbors (KNN) model as a further comparison.

In [26]:
# Fit a k-nearest-neighbors classifier that uses only the single closest
# training document (n_neighbors=1) to decide the class.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_dtm, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [32]:
# Predicted classes for the test set from the KNN model.
y_pred_class_knn= knn.predict(X_test_dtm)

In [33]:
# Test-set accuracy of the KNN model.
metrics.accuracy_score(y_test, y_pred_class_knn)

0.5703971119133574