In [1]:
import pandas as pd
import numpy as np
import keras
import gensim
import pickle
import gc
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
%matplotlib inline

%load_ext autoreload
%autoreload 2
import helper

Using TensorFlow backend.


In [2]:
# Load the pre-computed training features and labels from disk.
X_train, y_train = (
    np.load("data/word_vectors/pubmed_wiki_X_train_seeds42.npy"),
    np.load("data/word_vectors/pubmed_wiki_y_train_seeds42.npy"),
)

In [3]:
# Load the pre-computed held-out features and labels from disk.
X_test, y_test = (
    np.load("data/word_vectors/pubmed_wiki_X_test_seeds42.npy"),
    np.load("data/word_vectors/pubmed_wiki_y_test_seeds42.npy"),
)

In [6]:
# Display the dimensions of the loaded training array as a sanity check.
X_train.shape

(45587, 100, 200)

In [12]:
# Per-sample input shape handed to the first layer of the sequence models.
# Matches the trailing two dimensions of the X_train shape displayed above
# (presumably (tokens per document, embedding size) -- TODO confirm).
input_shape = (100, 200)

# Basic FeedForward Net (BoW)

In [21]:
# Fetch the raw documents and their targets through the project helper module.
# NOTE(review): the helper is not visible here; target_train is later sliced and
# read via `.values`, so it is presumably a pandas object -- confirm.
documents_train, target_train = helper.load_multiclass_data()

In [22]:
# Build TF-IDF features from the raw documents.
# Terms appearing in more than 90% of the documents, or in fewer than 3 of them,
# are excluded from the vocabulary.
# NOTE(review): vocabulary and IDF weights are learned on all documents before
# the train/test split performed later, so the held-out rows influence them.
count_vect = CountVectorizer(
    binary=False,
    max_df=0.9,
    min_df=3,
    lowercase=True,
    strip_accents="unicode",
)
tfidf_transformer = TfidfTransformer()

# Learn the vocabulary and count terms in one pass, then re-weight the counts.
X_train_counts = count_vect.fit_transform(documents_train)
X = tfidf_transformer.fit_transform(X_train_counts)
X.shape

(45587, 33714)

In [23]:
# Number of feature columns in the TF-IDF matrix; used as the input width of
# the feed-forward network.
input_dim = X.shape[1]

In [24]:
# Work on the first `max_len` rows only (presumably to keep the dense matrix
# small enough for memory -- TODO confirm).
# NOTE(review): this cell rebinds X and target_train in place, so it can only be
# run once per kernel session.
max_len = 20000

# .toarray() yields a regular numpy.ndarray. The previous .todense() returned the
# legacy numpy.matrix type, whose always-2-D indexing semantics are a common
# source of surprises in downstream array code.
X = X[:max_len].toarray()
target_train = target_train[:max_len].values
print(X.shape)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

(20000, 33714)


115

In [25]:
# Hold out 10% of the rows for evaluation. The split is seeded so that the
# reported scores can be reproduced; 42 mirrors the "seeds42" suffix of the
# word-vector files loaded at the top of the notebook.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which were
# previously loaded from the word-vector .npy files. The sequence models further
# down expect those (100, 200)-shaped samples, so the .npy files must be
# reloaded before that section is run.
X_train, X_test, y_train, y_test = train_test_split(
    X, target_train, test_size=0.1, random_state=42
)
# Drop the full dense matrix; only the two splits are needed from here on.
del X

In [30]:
# Feed-forward classifier on the TF-IDF features: two hidden layers with
# dropout, followed by a 23-way softmax output.
model = keras.models.Sequential()
# Hidden layers need an explicit non-linearity: Dense defaults to a linear
# activation, and stacked linear layers collapse into a single linear map.
model.add(keras.layers.Dense(32, activation='relu', input_shape=(input_dim,)))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(23, activation='softmax'))

In [31]:
# Compile, train and evaluate the feed-forward model.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# A 10% slice of the training rows is set aside by Keras for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=15, batch_size=256)
yhat = model.predict(X_test)
# score_prediction lives in the project helper module; its return value is
# deliberately discarded here.
_ = helper.score_prediction(y_test, yhat, binary=False)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 32)                1078880   
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 23)                391       
Total params: 1,079,799
Trainable params: 1,079,799
Non-trainable params: 0
_________________________________________________________________
None
Train on 16200 samples, validate on 1800 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
E

  'precision', 'predicted', average, warn_for)


103

# Basic RNN (LSTM)

In [29]:
# Single-layer LSTM classifier: 100 recurrent units, dropout, 23-way softmax.
model = keras.models.Sequential()
# LSTM is referenced through the public keras.layers namespace (as Dense and
# Dropout already are) rather than the keras.layers.recurrent submodule path.
model.add(keras.layers.LSTM(100, input_shape=input_shape))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(23, activation='softmax'))

In [30]:
# Compile, train and evaluate the LSTM model.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# A 10% slice of the training rows is set aside by Keras for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=15, batch_size=256)
yhat = model.predict(X_test)
# score_prediction lives in the project helper module; its return value is
# deliberately discarded here.
_ = helper.score_prediction(y_test, yhat, binary=False)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 100)               120400    
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 23)                2323      
Total params: 122,723
Trainable params: 122,723
Non-trainable params: 0
_________________________________________________________________
None
Train on 36925 samples, validate on 4103 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        36
          1       0.33      0.57      0.42       182
          2    

  'precision', 'predicted', average, warn_for)


313

# Basic RNN (GRU)

In [47]:
# Single-layer GRU classifier: 100 recurrent units, dropout, 23-way softmax.
model = keras.models.Sequential()
# GRU is referenced through the public keras.layers namespace (as Dense and
# Dropout already are) rather than the keras.layers.recurrent submodule path.
model.add(keras.layers.GRU(100, input_shape=input_shape))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(23, activation='softmax'))

In [48]:
# Compile, train and evaluate the GRU model.
# The metric is spelled 'categorical_accuracy', as in the other training cells
# of this notebook, so the logged metric names are comparable across models.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# A 10% slice of the training rows is set aside by Keras for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=15, batch_size=256)
yhat = model.predict(X_test)
# score_prediction lives in the project helper module; its return value is
# deliberately discarded here.
_ = helper.score_prediction(y_test, yhat, binary=False)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_14 (GRU)                 (None, 100)               90300     
_________________________________________________________________
dropout_23 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 23)                2323      
Total params: 92,623
Trainable params: 92,623
Non-trainable params: 0
_________________________________________________________________
None
Train on 36925 samples, validate on 4103 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        36
          1       0.43      0.52      0.47       182
          2      

102

# Deep RNN (GRU)

In [45]:
# Two stacked GRU layers (64 then 32 units) with dropout, then 23-way softmax.
model = keras.models.Sequential()
# GRU is referenced through the public keras.layers namespace (as Dense and
# Dropout already are) rather than the keras.layers.recurrent submodule path.
# return_sequences=True makes the first GRU emit one vector per time step, which
# the second GRU needs as its sequence input.
model.add(keras.layers.GRU(64, input_shape=input_shape, return_sequences=True))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.GRU(32))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(23, activation='softmax'))

In [46]:
# Compile, train and evaluate the stacked GRU model.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# A 10% slice of the training rows is set aside by Keras for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=15, batch_size=256)
yhat = model.predict(X_test)
# score_prediction lives in the project helper module; its return value is
# deliberately discarded here.
_ = helper.score_prediction(y_test, yhat, binary=False)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_12 (GRU)                 (None, 100, 64)           50880     
_________________________________________________________________
dropout_21 (Dropout)         (None, 100, 64)           0         
_________________________________________________________________
gru_13 (GRU)                 (None, 32)                9312      
_________________________________________________________________
dropout_22 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 23)                759       
Total params: 60,951
Trainable params: 60,951
Non-trainable params: 0
_________________________________________________________________
None
Train on 36925 samples, validate on 4103 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8

  'precision', 'predicted', average, warn_for)


1135

# Basic CNN

In [108]:
# 1-D convolution over the sequence, max-pooled over all positions, then dropout
# and a 23-way softmax.
filters = 100

model = keras.models.Sequential()
# kernel_size=1: every filter looks at one sequence position at a time.
model.add(keras.layers.Conv1D(filters,
             kernel_size=1,
             padding='valid',
             activation='relu',
             strides=1, input_shape=input_shape))
# GlobalMaxPooling1D is referenced through the public keras.layers namespace
# (as Conv1D already is) rather than the keras.layers.pooling submodule path.
model.add(keras.layers.GlobalMaxPooling1D())
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(23, activation='softmax'))

In [109]:
# Compile, train and evaluate the CNN model.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# A 10% slice of the training rows is set aside by Keras for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=15, batch_size=256)
yhat = model.predict(X_test)
# score_prediction lives in the project helper module; its return value is
# deliberately discarded here.
_ = helper.score_prediction(y_test, yhat, binary=False)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_19 (Conv1D)           (None, 100, 100)          20100     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 100)               0         
_________________________________________________________________
dropout_51 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 23)                2323      
Total params: 22,423
Trainable params: 22,423
Non-trainable params: 0
_________________________________________________________________
None
Train on 36925 samples, validate on 4103 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
             precision    recall  f1-score   

  'precision', 'predicted', average, warn_for)


102

# Basic Recurrent CNN

In [19]:
# Convolution + max-pooling front end feeding a GRU, then a 23-way softmax.
input_shape = (100, 200)
filters = 128
kernel_size = 3
model = keras.models.Sequential()
model.add(keras.layers.Conv1D(filters,
             kernel_size,
             padding='valid',
             activation='relu',
             strides=1, input_shape=input_shape))
# Pooling shortens the sequence before it reaches the recurrent layer.
model.add(keras.layers.MaxPool1D(pool_size=16))
model.add(keras.layers.Dropout(0.5))
# GRU is referenced through the public keras.layers namespace (as Conv1D and
# MaxPool1D already are) rather than the keras.layers.recurrent submodule path.
model.add(keras.layers.GRU(64))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(23, activation='softmax'))

In [20]:
# Compile, train and evaluate the recurrent CNN model.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# A 10% slice of the training rows is set aside by Keras for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=15, batch_size=256)
yhat = model.predict(X_test)
# score_prediction lives in the project helper module; its return value is
# deliberately discarded here.
_ = helper.score_prediction(y_test, yhat, binary=False)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_12 (Conv1D)           (None, 98, 128)           76928     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 6, 128)            0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 6, 128)            0         
_________________________________________________________________
gru_7 (GRU)                  (None, 64)                37056     
_________________________________________________________________
dropout_18 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 23)                1495      
Total params: 115,479
Trainable params: 115,479
Non-trainable params: 0
_________________________________________________________________
None

  'precision', 'predicted', average, warn_for)


108

# Deep Recurrent CNN

In [13]:
# Two convolution + max-pooling stages feeding a GRU, then a 23-way softmax.
input_shape = (100, 200)
filters = 128
model = keras.models.Sequential()
model.add(keras.layers.Conv1D(filters,
             kernel_size=2,
             padding='valid',
             activation='relu',
             strides=1, input_shape=input_shape))
model.add(keras.layers.MaxPool1D(pool_size=16))
model.add(keras.layers.Dropout(0.5))
# Only the first layer of a Sequential model declares the input shape. The
# second convolution receives the pooled output of the previous stage, so the
# input_shape argument it used to carry was wrong for this position and has
# been removed.
model.add(keras.layers.Conv1D(filters,
             kernel_size=3,
             padding='valid',
             activation='relu',
             strides=1))
model.add(keras.layers.MaxPool1D(pool_size=2))
model.add(keras.layers.Dropout(0.5))
# GRU is referenced through the public keras.layers namespace (as Conv1D and
# MaxPool1D already are) rather than the keras.layers.recurrent submodule path.
model.add(keras.layers.GRU(64))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(23, activation='softmax'))

In [14]:
# Compile, train and evaluate the deep recurrent CNN model.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# A 10% slice of the training rows is set aside by Keras for validation.
model.fit(X_train, y_train, validation_split=0.1, epochs=15, batch_size=256)
yhat = model.predict(X_test)
# score_prediction lives in the project helper module; its return value is
# deliberately discarded here.
_ = helper.score_prediction(y_test, yhat, binary=False)
gc.collect();  # trailing ';' keeps the collected-object count out of the output

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_8 (Conv1D)            (None, 99, 128)           51328     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 6, 128)            0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 6, 128)            0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 4, 128)            49280     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 2, 128)            0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 2, 128)            0         
_________________________________________________________________
gru_4 (GRU)                  (None, 64)                37056     
__________

  'precision', 'predicted', average, warn_for)


752