
# Imports

In [55]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, \
HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout

In [142]:
# Trainging data
train_var = pd.read_csv('training_variants.zip')
train_txt = pd.read_csv('training_text.zip', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])

# Test set 1 of 2
test_var = pd.read_csv('test_variants.zip')
test_txt = pd.read_csv('test_text.zip', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])
df_test_target = pd.read_csv('stage1_solution_filtered.csv')

# Test set 2 of 2
test2_var = pd.read_csv('stage2_test_variants.csv')
test2_txt = pd.read_csv('stage2_test_text.csv', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])
df_test2_target = pd.read_csv('stage_2_private_solution.csv')

# Preprocessing


### Train data

Joining variant and text training dataframes, drop null rows, and then split into X and y.

In [20]:
df = train_var.merge(train_txt, on='ID')

In [21]:
df.dropna(inplace=True)

In [22]:
X = df[['Gene', 'Variation', 'Text']]

In [23]:
y = df['Class']

Train test split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=42)

### Test data set 1 of 2

EDA


In [25]:
# Merge Test text and Test var dataframes
df_test = test_var.merge(test_txt, on='ID')

In [26]:
# Drop null columns
df_test.dropna(inplace=True)

In [66]:
# Merge target with X
df_test_reduced = df_test_target.merge(df_test, on='ID')

In [29]:
print(df_test_reduced.shape)
df_test_reduced.head()

(368, 13)


Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9,Gene,Variation,Text
0,12,1,0,0,0,0,0,0,0,0,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...
1,19,0,1,0,0,0,0,0,0,0,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...
2,21,0,1,0,0,0,0,0,0,0,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...
3,55,0,0,0,1,0,0,0,0,0,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds..."
4,64,0,0,0,1,0,0,0,0,0,KEAP1,C23Y,Keap1 is the substrate recognition module of a...


In [30]:
# Changing column names to numbers only so that they can be made into a single target column for model.
df_test_reduced.rename(columns={'class1': "1", 'class2': "2", 'class3': "3", 'class4': "4", 'class5': "5", 'class6': '6', 'class7': "7", 'class8': "8", 'class9': "9"}, inplace=True)
df_test_reduced.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,Gene,Variation,Text
0,12,1,0,0,0,0,0,0,0,0,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...
1,19,0,1,0,0,0,0,0,0,0,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...
2,21,0,1,0,0,0,0,0,0,0,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...
3,55,0,0,0,1,0,0,0,0,0,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds..."
4,64,0,0,0,1,0,0,0,0,0,KEAP1,C23Y,Keap1 is the substrate recognition module of a...


In [78]:
# Creating y
test_set_y = df_test_reduced.drop(['ID','Gene','Variation','Text'], axis=1).idxmax(axis=1)
print(test_set_y.shape)
print(test2_set_y.value_counts())

(368,)
2    53
8    32
4    20
1     5
7     5
9     4
5     3
3     2
6     1
dtype: int64


In [32]:
# Creating X
test_set_X = df_test_reduced.drop(['ID','1','2','3','4','5','6','7','8','9'], axis=1)
print(test_set_X.shape)
test_set_X.head()

(368, 3)


Unnamed: 0,Gene,Variation,Text
0,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...
1,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...
2,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...
3,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds..."
4,KEAP1,C23Y,Keap1 is the substrate recognition module of a...


### Test data set 2 of 2

EDA on second test set


In [144]:
# Merge Test text and Test var dataframes
df_test2 = test2_var.merge(test2_txt, on='ID')

In [147]:
df_test2.head()

Unnamed: 0,ID,Gene,Variation,Text
0,1,CHEK2,H371Y,The incidence of breast cancer is increasing i...
1,2,AXIN2,Truncating Mutations,An unselected series of 310 colorectal carcino...
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
4,5,BRAF,T599insTT,Pilocytic astrocytoma (PA) is emerging as a tu...


In [128]:
# No null values
df_test2.isnull().sum()

ID           0
Gene         0
Variation    0
Text         0
dtype: int64

In [148]:
# df_test2_wo_ID will be used to get Kaggle submission
df_test2_wo_ID = df_test2.drop(['ID'], axis=1)

In [130]:
# Merge target with X
df_test2_with_target = df_test2_target.merge(df_test2, on='ID')

In [134]:
# The rows in df_test2_with_target drop to 125, as opposed to the 986 in the df_test2 due to them only releasing the target for the cases that were manually assigned and not assigned by a machine. 
df_test2_with_target.shape

(125, 13)

In [69]:
print(df_test2_with_target.shape)
df_test2_with_target.head()

(125, 13)


Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9,Gene,Variation,Text
0,8,0,0,0,1,0,0,0,0,0,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
1,15,0,0,0,0,0,0,0,0,1,ERBB2,G746S,The protein-kinase family is the most frequent...
2,16,0,0,0,0,0,0,0,1,0,TP53,Y234S,Among the best-studied therapeutic targets in ...
3,18,0,1,0,0,0,0,0,0,0,EGFR,P546S,Head and neck squamous cell carcinoma (HNSCC) ...
4,19,0,1,0,0,0,0,0,0,0,ERBB2,G279E,Functional characterization of cancer-associat...


In [73]:
# Changing column names to numbers only so that they can be made into a single target column for model.
df_test2_with_target.rename(columns={'class1': "1", 'class2': "2", 'class3': "3", 'class4': "4", 'class5': "5", 'class6': '6', 'class7': "7", 'class8': "8", 'class9': "9"}, inplace=True)
df_test2_with_target.head()

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,Gene,Variation,Text
0,8,0,0,0,1,0,0,0,0,0,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
1,15,0,0,0,0,0,0,0,0,1,ERBB2,G746S,The protein-kinase family is the most frequent...
2,16,0,0,0,0,0,0,0,1,0,TP53,Y234S,Among the best-studied therapeutic targets in ...
3,18,0,1,0,0,0,0,0,0,0,EGFR,P546S,Head and neck squamous cell carcinoma (HNSCC) ...
4,19,0,1,0,0,0,0,0,0,0,ERBB2,G279E,Functional characterization of cancer-associat...


In [79]:
# Creating y
test2_set_y = df_test2_with_target.drop(['ID','Gene','Variation','Text'], axis=1).idxmax(axis=1)
print(test2_set_y.shape)
print(test2_set_y.value_counts())

(125,)
2    53
8    32
4    20
1     5
7     5
9     4
5     3
3     2
6     1
dtype: int64


In [80]:
# Creating X
test2_set_X = df_test2_with_target.drop(['ID','1','2','3','4','5','6','7','8','9'], axis=1)
print(test2_set_X.shape)
test2_set_X.head()

(125, 3)


Unnamed: 0,Gene,Variation,Text
0,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
1,ERBB2,G746S,The protein-kinase family is the most frequent...
2,TP53,Y234S,Among the best-studied therapeutic targets in ...
3,EGFR,P546S,Head and neck squamous cell carcinoma (HNSCC) ...
4,ERBB2,G279E,Functional characterization of cancer-associat...


# Modeling


#### Defining classes

In [33]:
class CategoricalExtractor(BaseEstimator, TransformerMixin):
    """
    One-hot-encodes a categorical (string) column.
    """
    def __init__(self, column):
        self.column = column
        self.values = None
        
    def _create_values(self, indices):
        return {ind: i+1 for i, ind in enumerate(indices)}
    
    def _apply_values(self, row_val):
        return self.values.get(row_val, 0)
        
    def fit(self, X, y=None):
        self.values = self._create_values(X[self.column].value_counts().index)
        return self 
    
    def transform(self, X, y=None):
        col = X[self.column].apply(self._apply_values)
        return col.values.reshape(-1, 1)

## Feed Forward Neral Net with Keras

In [44]:
# Setting up pipelines

gene_pipe = make_pipeline(
CategoricalExtractor('Gene'),
OneHotEncoder(sparse=False, handle_unknown='ignore')
)

var_pipe = make_pipeline(
CategoricalExtractor('Variation'),
OneHotEncoder(sparse=False, handle_unknown='ignore')
)

text_pipe_k = make_pipeline(
FunctionTransformer(lambda df: df['Text'], validate=False), 
CountVectorizer(stop_words='english'), # There were 41028 features output from the CountVectorizer for the training set.
TruncatedSVD(n_components=5000) 
)

fu = make_union(text_pipe_k, gene_pipe, var_pipe)

In [45]:
# Function to create model, required for KerasClassifier
def create_model(input_dim=30):
    # create model
    model = Sequential()
    model.add(Dense(100, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5, seed=42)) # 50% dropout with 42 random state
    model.add(Dense(9, activation='softmax')) # Activation function for the final output layer 
                                              # needs to be softmax to accomidate the nine 
                                              # different classes.

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [46]:
# Allows the number of features to be input automatically.
X_train_ap = fu.fit_transform(X_train) 

In [47]:
model_keras = KerasClassifier(build_fn=create_model, input_dim=X_train_ap.shape[1], epochs=20, batch_size=10, verbose=1)

In [48]:
model_keras.fit(X_train_ap, y_train.values) # .values: pd df to numpy matrix

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1cfd0ad470>

---
#### Score of test of test train split

In [54]:
# percent likelyhood for each target
model_keras.predict_proba(fu.transform(X_test))



In [50]:
model_keras.score(fu.transform(X_test), y_test)



0.62043796099015391

---
#### Test set 1 of 2 score

In [51]:
# percent likelyhood for each target
model_keras.predict_proba(fu.transform(test_set_X))



array([[  2.74480164e-01,   7.70533644e-03,   1.50055215e-02, ...,
          1.81569681e-02,   7.29050674e-03,   1.60380993e-02],
       [  3.20837971e-05,   2.14422983e-03,   1.81607783e-01, ...,
          8.16051602e-01,   2.06294104e-07,   1.05418136e-08],
       [  5.65940900e-05,   1.07384279e-01,   1.44713209e-04, ...,
          8.91611099e-01,   5.65205411e-08,   6.46995986e-08],
       ..., 
       [  3.70602311e-06,   6.81689755e-07,   1.83542193e-09, ...,
          3.56124691e-03,   4.96119042e-13,   2.97155522e-10],
       [  9.27962661e-01,   3.20937388e-10,   9.94537135e-13, ...,
          1.18111210e-08,   2.05919603e-18,   1.38246585e-14],
       [  6.74188195e-05,   1.58621907e-10,   3.78650583e-10, ...,
          1.37839451e-09,   1.49683295e-16,   3.10283852e-12]], dtype=float32)

In [52]:
model_keras.score(fu.transform(test_set_X), test_set_y.values.astype(int))



0.6277173895226873

---
#### Test set 2 of 2 score 
#### I got a score of 2.57476 on Kaggle, which would have put me in top 27th percentile of submissions with the top score being 2.03026

Part I: Getting a Kaggle submission

In [149]:
# percent likelyhood for each target
pred_test2 = model_keras.predict_proba(fu.transform(df_test2_wo_ID))

# merging the id column with predicted outputs
pred_test2 = df_test2[['ID']].merge(pd.DataFrame(pred_test2), left_index=True, right_index=True)
pred_test2.rename(columns={0: "class1", 1: "class2", 2: "class3", 3: "class4", 4: "class5", 5: 'class6', 6: "class7", 7: "class8", 8: "class9"}, inplace=True)

# making id the index
pred_test2.index = pred_test2["ID"]

#removing repetitive id column
pred_test2.drop(['ID'], axis=1, inplace=True)

print(pred_test2.shape)
pred_test2.head()

(986, 9)


Unnamed: 0_level_0,class1,class2,class3,class4,class5,class6,class7,class8,class9
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.927428,0.001128,4.416669e-08,0.016553,0.005171,0.006269,0.043451,1.255197e-09,1.514747e-09
2,0.819531,0.004508,6.535748e-06,0.069764,0.087924,0.013109,0.005157,3.803157e-08,1.115752e-08
3,0.064044,0.404088,0.003162329,0.035775,0.092211,0.04517,0.354692,0.0006945976,0.0001624007
4,0.000386,0.433296,5.450063e-05,0.000153,0.000307,0.021203,0.544259,4.240455e-05,0.00029963
5,0.014029,0.036471,0.002506539,0.000416,0.007476,0.000663,0.938367,3.011443e-05,4.214666e-05


In [150]:
# Exporting to CSV to submit to Kaggle
pred_test2.to_csv('pred_test2.csv')

Part 2: Getting an R^2 score

Score on a subset of the test set 2 of 2 (They only released the targets that were manually assigned and not machine generated.)

In [104]:
model_keras.score(fu.transform(test2_set_X), test2_set_y.values.astype(int))



0.13600000202655793

---
Baseline accuracy based on traning data set

In [53]:
y.value_counts().max()/y.value_counts().sum()

0.28696175850647393