
# Imports

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, \
HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout

Using TensorFlow backend.


In [6]:
# Load the variant tables and the accompanying text tables.
# The text files are split on a literal "||". With engine="python" the separator
# is treated as a regular expression, so it is written as a raw string: in a
# normal string "\|" is an invalid escape sequence and triggers a warning.
# The first row of each text file is skipped and column names are supplied explicitly.
train_var = pd.read_csv('Data/training_variants.zip')
train_txt = pd.read_csv('Data/training_text.zip', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
test_var = pd.read_csv('Data/test_variants.zip')
test_txt = pd.read_csv('Data/test_text.zip', sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])

# Preprocessing


### Train data

Join the variant and text training dataframes, drop rows with null values, and then split into X and y.

In [9]:
# Join each training variant with its text on the shared ID column (default inner join).
df = pd.merge(train_var, train_txt, on='ID')

In [10]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [11]:
# Feature frame: the two categorical columns plus the free text.
X = df.loc[:, ['Gene', 'Variation', 'Text']]

In [12]:
# Target: the class label column.
y = df.loc[:, 'Class']

Train test split

In [13]:
# Hold out 33% of the rows, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

### Test data

EDA on test set


In [None]:
# Join the test variants with the test text on the shared ID column
df_test = pd.merge(test_var, test_txt, on='ID')

In [None]:
# Drop rows that contain at least one null value
df_test = df_test.dropna()

In [None]:
# Load the stage-1 solution table; it is joined to the test data on ID in the next cell.
solution_path = 'Data/stage1_solution_filtered.csv'
df_test_target = pd.read_csv(solution_path)

In [None]:
# Keep only the test rows whose ID also appears in the solution table (default inner join).
df_test_reduced = pd.merge(df_test_target, df_test, on='ID')

In [None]:
# Check the dimensions of the merged frame, then preview its first rows.
print(df_test_reduced.shape)
df_test_reduced.head()

In [None]:
# Relabel the solution columns 'class1'..'class9' as the bare digits '1'..'9'.
class_column_names = {'class%d' % k: str(k) for k in range(1, 10)}
df_test_reduced.rename(columns=class_column_names, inplace=True)
df_test_reduced.head()

In [None]:
# After dropping the non-class columns, idxmax returns, per row, the label of the
# column holding the largest value. Those labels are the strings '1'..'9', whereas
# the model is fit on labels taken from the 'Class' column, so the result is cast
# to int to make the two comparable when scoring.
# NOTE(review): assumes 'Class' in the training data is integer-typed — confirm.
test_set_y = (
    df_test_reduced
    .drop(['ID', 'Gene', 'Variation', 'Text'], axis=1)
    .idxmax(axis=1)
    .astype(int)
)
print(test_set_y.shape)
test_set_y.head()

In [None]:
# Feature frame for the external test set: everything except ID and the nine class columns.
non_feature_columns = ['ID'] + [str(k) for k in range(1, 10)]
test_set_X = df_test_reduced.drop(non_feature_columns, axis=1)
print(test_set_X.shape)
test_set_X.head()

# Modeling


#### Defining classes

In [17]:
class CategoricalExtractor(BaseEstimator, TransformerMixin):
    """
    Integer-encodes a categorical (string) column of a DataFrame.

    This transformer does not one-hot encode by itself. ``fit`` assigns every
    distinct value of ``column`` a positive integer code (1, 2, ... following the
    order of ``value_counts().index``), and ``transform`` returns those codes as
    a single-column 2-D array. Values that were not seen during ``fit`` are
    encoded as 0.

    Parameters
    ----------
    column : str
        Name of the DataFrame column to encode.

    Attributes
    ----------
    values : dict or None
        Mapping from category value to its integer code; ``None`` until
        ``fit`` has been called.
    """
    def __init__(self, column):
        self.column = column
        self.values = None

    def _create_values(self, indices):
        """Map each entry of ``indices`` to its 1-based position."""
        return {ind: i for i, ind in enumerate(indices, start=1)}

    def _apply_values(self, row_val):
        """Return the code learned for ``row_val``, or 0 if it was not seen in ``fit``."""
        return self.values.get(row_val, 0)

    def fit(self, X, y=None):
        """Learn the value -> code mapping from ``X[self.column]``; returns ``self``."""
        self.values = self._create_values(X[self.column].value_counts().index)
        return self

    def transform(self, X, y=None):
        """
        Encode ``X[self.column]`` with the mapping learned in ``fit``.

        Returns an array of shape (n_rows, 1).

        Raises
        ------
        NotFittedError
            If called before ``fit``. NotFittedError subclasses both ValueError
            and AttributeError, so handlers for the AttributeError previously
            raised here keep working.
        """
        if self.values is None:
            # Local import keeps this block self-contained.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CategoricalExtractor instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        col = X[self.column].apply(self._apply_values)
        return col.values.reshape(-1, 1)

## Neural Nets with Keras

In [88]:
# Setting up pipelines

# Gene: integer-code the category, then expand the codes into one-hot columns.
gene_pipe = make_pipeline(
    CategoricalExtractor('Gene'),
    OneHotEncoder(sparse=False, handle_unknown='ignore'),
)

# Variation: same treatment as Gene.
var_pipe = make_pipeline(
    CategoricalExtractor('Variation'),
    OneHotEncoder(sparse=False, handle_unknown='ignore'),
)

# Text: select the column, build bag-of-words counts, then reduce dimensionality.
text_pipe_k = make_pipeline(
    FunctionTransformer(lambda df: df['Text'], validate=False),
    CountVectorizer(stop_words='english'),
    # TruncatedSVD's default solver is randomized; fixing random_state makes the
    # reduced features (and therefore the reported scores) reproducible.
    TruncatedSVD(n_components=5000, random_state=42),
)

# Concatenate the three feature groups side by side.
fu = make_union(text_pipe_k, gene_pipe, var_pipe)

In [89]:
# Function to create model, required for KerasClassifier
def create_model(input_dim=30, hidden_units=100, dropout_rate=0.5, n_classes=9, seed=42):
    """
    Build and compile a one-hidden-layer feed-forward classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_units : int
        Width of the hidden ReLU layer.
    dropout_rate : float
        Fraction of hidden activations dropped during training.
    n_classes : int
        Number of output classes (size of the softmax layer).
    seed : int
        Seed passed to the dropout layer.

    Returns
    -------
    A compiled ``Sequential`` model using categorical cross-entropy loss, the
    Adam optimizer and accuracy as the reported metric.

    The defaults reproduce the original fixed architecture (100 hidden units,
    50% dropout, 9 classes, seed 42); exposing them as keyword arguments lets
    them be overridden without editing this function.
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, activation='relu'))
    model.add(Dropout(dropout_rate, seed=seed))
    # The output layer uses softmax to accommodate the multi-class target:
    # one probability per class.
    model.add(Dense(n_classes, activation='softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Fit the feature union on the training split and keep the transformed matrix.
# Its column count is what gets passed to the network as input_dim, so the
# number of features never has to be entered by hand.
X_train_ap = fu.fit_transform(X_train)

In [None]:
# Wrap the Keras network as a scikit-learn style classifier; the input width is
# read from the transformed training matrix.
n_features = X_train_ap.shape[1]
model_keras = KerasClassifier(
    build_fn=create_model,
    input_dim=n_features,
    epochs=10,
    batch_size=10,
    verbose=1,
)

In [None]:
# Train on the transformed features; .values hands the target Series over as a NumPy array.
train_labels = y_train.values
model_keras.fit(X_train_ap, train_labels)

---
Score on the held-out test portion of the train/test split

In [80]:
# Predicted probability of each class for every row of the held-out split
X_test_ap = fu.transform(X_test)
model_keras.predict_proba(X_test_ap)



array([[  1.59491614e-01,   2.19694793e-01,   5.76639809e-02, ...,
          1.25669599e-01,   2.95009073e-02,   8.57762843e-02],
       [  2.23048404e-02,   1.77942496e-03,   7.50496387e-01, ...,
          1.23288624e-01,   3.65310395e-03,   5.86872120e-05],
       [  9.32567709e-05,   1.82996877e-02,   1.65729113e-02, ...,
          9.58231330e-01,   1.29706677e-04,   5.35025247e-05],
       ..., 
       [  2.31455397e-02,   5.56576587e-02,   1.42178787e-05, ...,
          9.10925865e-01,   6.26178226e-03,   3.79859994e-05],
       [  3.11045554e-08,   6.62461556e-16,   4.79166290e-11, ...,
          2.34269646e-16,   4.88801662e-18,   4.04895186e-27],
       [  4.17267710e-01,   1.83150098e-02,   1.79649413e-01, ...,
          5.88812083e-02,   4.82242496e-04,   6.10550679e-03]], dtype=float32)

In [81]:
# Score the fitted classifier on the held-out split.
X_test_features = fu.transform(X_test)
model_keras.score(X_test_features, y_test)



0.59086758333798417

---
Test set score

In [None]:
# Predicted probability of each class for every row of the external test set
test_set_X_ap = fu.transform(test_set_X)
model_keras.predict_proba(test_set_X_ap)

In [None]:
# Score the fitted classifier on the external test set.
test_set_features = fu.transform(test_set_X)
model_keras.score(test_set_features, test_set_y)

---
Baseline accuracy based on the training data set (share of the most frequent class)

In [101]:
# Share of the most frequent class, i.e. the accuracy of always predicting it.
y.value_counts(normalize=True).max()

0.28709288299155611