
# Imports

In [102]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, \
HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout

In [103]:
# The text files delimit fields with a literal "||". pandas treats a separator
# longer than one character as a regular expression, so the pipes are escaped and
# the pattern is written as a raw string: in a plain string "\|" is an invalid
# escape sequence that newer Python versions warn about. Regex separators need
# the python parsing engine. The first line of each text file is skipped and the
# column names are supplied explicitly.
train_var = pd.read_csv('Data/training_variants.zip')
train_txt = pd.read_csv('Data/training_text.zip', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
test_var = pd.read_csv('Data/test_variants.zip')
test_txt = pd.read_csv('Data/test_text.zip', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])

# Preprocessing


### Train data

Join the variant and text training dataframes, drop rows with null values, and then split the result into features `X` and target `y`.

In [104]:
# Inner-join the variant table with the text table on their shared 'ID' column.
df = pd.merge(train_var, train_txt, on='ID')

In [105]:
# Discard, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [106]:
# Feature matrix: the gene and variation columns plus the free-text column.
X = df.loc[:, ['Gene', 'Variation', 'Text']]

In [107]:
# Target vector: the 'Class' label of each row.
y = df.loc[:, 'Class']

Train test split

In [108]:
# Hold out 33% of the rows; stratifying on y keeps the class proportions alike in
# both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

### Test data

EDA on test set


In [109]:
# Inner-join the test variants with the test text on 'ID'
df_test = pd.merge(test_var, test_txt, on='ID')

In [110]:
# Drop, in place, the rows that contain a null value in any column
df_test.dropna(axis=0, how='any', inplace=True)

In [111]:
# Load the stage-1 solution file. Judging by the merged preview shown further down,
# it holds an ID column plus one indicator column per class (class1..class9).
df_test_target = pd.read_csv('Data/stage1_solution_filtered.csv')

In [112]:
# Inner-join the solution table with the merged test data on 'ID', which keeps only
# the test rows that appear in the solution file.
df_test_reduced = pd.merge(df_test_target, df_test, on='ID')

In [113]:
# Sanity check: print the dimensions of the labelled test set, then display its
# first rows.
print(df_test_reduced.shape)
df_test_reduced.head()

(367, 13)


Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9,Gene,Variation,Text
0,12,1,0,0,0,0,0,0,0,0,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...
1,19,0,1,0,0,0,0,0,0,0,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...
2,21,0,1,0,0,0,0,0,0,0,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...
3,55,0,0,0,1,0,0,0,0,0,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds..."
4,64,0,0,0,1,0,0,0,0,0,KEAP1,C23Y,Keap1 is the substrate recognition module of a...


In [114]:
# Relabel the indicator columns 'class1'..'class9' as the strings '1'..'9' so that a
# column label can be read directly as a class label.
class_labels = {'class{}'.format(k): str(k) for k in range(1, 10)}
df_test_reduced.rename(columns=class_labels, inplace=True)
df_test_reduced.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,Gene,Variation,Text
0,12,1,0,0,0,0,0,0,0,0,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...
1,19,0,1,0,0,0,0,0,0,0,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...
2,21,0,1,0,0,0,0,0,0,0,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...
3,55,0,0,0,1,0,0,0,0,0,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds..."
4,64,0,0,0,1,0,0,0,0,0,KEAP1,C23Y,Keap1 is the substrate recognition module of a...


In [115]:
# Once the identifier and feature columns are removed, only the class indicator
# columns remain; the label of the column holding each row's maximum is that
# row's class.
class_indicators = df_test_reduced.drop(['ID', 'Gene', 'Variation', 'Text'], axis=1)
test_set_y = class_indicators.idxmax(axis=1)
print(test_set_y.shape)
test_set_y.head()

(367,)


0    1
1    2
2    2
3    4
4    4
dtype: object

In [116]:
# Features only: remove the ID and the nine class indicator columns '1'..'9'.
non_feature_cols = ['ID'] + [str(k) for k in range(1, 10)]
test_set_X = df_test_reduced.drop(non_feature_cols, axis=1)
print(test_set_X.shape)
test_set_X.head()

(367, 3)


Unnamed: 0,Gene,Variation,Text
0,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...
1,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...
2,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...
3,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds..."
4,KEAP1,C23Y,Keap1 is the substrate recognition module of a...


# Modeling


#### Defining classes

In [117]:
class CategoricalExtractor(BaseEstimator, TransformerMixin):
    """
    Integer-encodes a categorical (string) column of a DataFrame.

    ``fit`` assigns every distinct value of ``column`` an integer code starting
    at 1, in the order in which ``value_counts()`` lists the values.
    ``transform`` replaces each value by its code and returns the result as a
    single-column 2-D array; values that were not seen during ``fit`` get the
    code 0.

    Note that this class does not one-hot-encode by itself: it only produces the
    integer codes. The pipelines in this notebook chain it with
    ``OneHotEncoder`` to obtain the one-hot columns.

    Parameters
    ----------
    column : label
        Name of the column of ``X`` to encode.

    Attributes
    ----------
    values : dict or None
        Mapping from category value to integer code; ``None`` until ``fit``
        has been called.
    """
    def __init__(self, column):
        self.column = column
        self.values = None

    def _create_values(self, indices):
        """Map each entry of ``indices`` to its 1-based position."""
        return {ind: code for code, ind in enumerate(indices, start=1)}

    def _apply_values(self, row_val):
        """Return the code learned for ``row_val``, or 0 if it was never seen."""
        return self.values.get(row_val, 0)

    def fit(self, X, y=None):
        """Learn the value-to-code mapping from ``X[self.column]``; returns ``self``."""
        self.values = self._create_values(X[self.column].value_counts().index)
        return self

    def transform(self, X, y=None):
        """
        Encode ``X[self.column]`` with the mapping learned in ``fit``.

        Returns an array of shape ``(len(X), 1)``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. The exception derives from both
            ``ValueError`` and ``AttributeError``, so code that caught the
            ``AttributeError`` raised previously keeps working.
        """
        if self.values is None:
            # Imported locally so the block does not depend on a new top-level import.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CategoricalExtractor instance is not fitted yet; "
                "call 'fit' before 'transform'."
            )
        col = X[self.column].apply(self._apply_values)
        return col.values.reshape(-1, 1)

## Feed-Forward Neural Net with Keras

In [118]:
# Setting up pipelines

# Gene: turn the column into integer codes, then expand the codes into one-hot
# columns. Codes that the encoder did not see while fitting are ignored.
gene_pipe = make_pipeline(
    CategoricalExtractor('Gene'),
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)

# Variation: same treatment as Gene.
var_pipe = make_pipeline(
    CategoricalExtractor('Variation'),
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)

# Text: select the raw text column, build token counts with English stop words
# removed, and reduce the dimensionality with truncated SVD.
text_pipe_k = make_pipeline(
    FunctionTransformer(lambda df: df['Text'], validate=False),
    CountVectorizer(stop_words='english'),
    # random_state pins the randomized SVD solver so that repeated runs yield the
    # same components, in line with the seeded train/test split and dropout layer.
    TruncatedSVD(n_components=1000, random_state=42) #change this to 5000 for the final run
)

# Concatenate the text, gene and variation features side by side.
fu = make_union(text_pipe_k, gene_pipe, var_pipe)

In [119]:
# Function to create model, required for KerasClassifier
def create_model(input_dim=30, hidden_units=100, dropout_rate=0.5, n_classes=9, seed=42):
    """
    Build and compile a feed-forward classifier with one hidden layer.

    All arguments have defaults that reproduce the original fixed architecture,
    so existing calls (including ``KerasClassifier(build_fn=create_model,
    input_dim=...)``) behave as before; the extra arguments make the layer
    sizes tunable.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_units : int
        Number of units in the ReLU hidden layer.
    dropout_rate : float
        Fraction of the hidden layer's outputs dropped during training.
    n_classes : int
        Number of units in the softmax output layer (one per target class).
    seed : int
        Seed passed to the dropout layer.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, activation='relu'))
    model.add(Dropout(dropout_rate, seed=seed))
    # The activation of the final layer needs to be softmax to accommodate the
    # multiple target classes: it yields one probability per class.
    model.add(Dense(n_classes, activation='softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [120]:
# Fit the feature union on the first 200 training rows and transform them. The
# width of the resulting matrix is what later supplies the network's input size.
X_train_ap = fu.fit_transform(X_train.iloc[:200]) #Change to full data set for final run

In [121]:
# Wrap the model builder in the scikit-learn style classifier; input_dim is taken
# from the number of columns of the transformed training matrix.
model_keras = KerasClassifier(
    build_fn=create_model,
    input_dim=X_train_ap.shape[1],
    epochs=10,
    batch_size=10,
    verbose=1,
)

In [122]:
# Train on the labels of the same first 200 rows; .values hands the pandas Series
# over as a NumPy array.
model_keras.fit(X_train_ap, y_train.iloc[:200].values)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15fb5edd8>

---
Score on the held-out part of the train/test split

In [123]:
# predicted probability of each class for the first 200 held-out rows
model_keras.predict_proba(fu.transform(X_test.iloc[:200]))



array([[  7.09283575e-02,   9.38346907e-02,   8.63999128e-02, ...,
          1.38175830e-01,   5.19023091e-02,   5.57374097e-02],
       [  9.49104488e-01,   1.78958612e-10,   1.13720833e-15, ...,
          5.08947894e-02,   3.81989709e-15,   2.00518872e-11],
       [  1.52919203e-01,   1.54788882e-04,   2.34568972e-07, ...,
          4.50022370e-01,   6.28459748e-05,   1.34967926e-09],
       ..., 
       [  1.25648575e-27,   8.82177824e-22,   1.00000000e+00, ...,
          9.23321023e-19,   3.45585221e-25,   3.08149080e-27],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   3.88637486e-27,   0.00000000e+00, ...,
          3.00592511e-16,   0.00000000e+00,   0.00000000e+00]], dtype=float32)

In [124]:
# Score the classifier on the first 200 held-out rows and their labels.
model_keras.score(fu.transform(X_test.iloc[:200]), y_test.iloc[:200])



0.45000000707805154

---
Test set score

In [125]:
# predicted probability of each class for every row of the labelled test set
test_set_features = fu.transform(test_set_X)
model_keras.predict_proba(test_set_features)



array([[  3.75872800e-09,   1.63378287e-11,   7.51725342e-14, ...,
          6.86953645e-05,   8.04831340e-11,   6.55075495e-17],
       [  7.66402028e-16,   1.60524871e-10,   5.29244373e-20, ...,
          1.00000000e+00,   2.89131411e-21,   5.34308310e-25],
       [  6.09350650e-07,   1.43918282e-08,   3.85051470e-11, ...,
          9.99999166e-01,   2.82634027e-09,   5.57696792e-13],
       ..., 
       [  1.58270846e-18,   7.96266158e-06,   2.04881303e-15, ...,
          9.99956369e-01,   6.96272456e-19,   3.35329692e-15],
       [  4.70687603e-14,   8.85654755e-27,   4.77108314e-27, ...,
          9.99823868e-01,   8.57990454e-19,   1.06183180e-25],
       [  1.25097982e-17,   4.04524877e-14,   3.66126478e-05, ...,
          8.68599376e-12,   3.20282839e-21,   7.98559081e-14]], dtype=float32)

In [138]:
# The test labels are column names, i.e. strings, so they are cast to integers
# before being compared with the classifier's predictions.
model_keras.score(fu.transform(test_set_X), test_set_y.astype(int).values)



0.43324251282117671

---
Baseline accuracy based on the training data set (share of the most frequent class)

In [101]:
# Relative frequency of the most common class, i.e. the accuracy obtained by always
# predicting that class.
y.value_counts(normalize=True).max()

0.28709288299155611