In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "input" directory (visible in the listing below).
# Running this cell (by clicking run or pressing Shift+Enter) lists the contents of the current working directory.

from subprocess import check_output
# Capture the raw `ls` output for the working directory, then decode and show it.
listing_bytes = check_output(["ls", "."])
print(listing_bytes.decode("utf8"))

basemodel.ipynb
comment-classification
comment-classification.zip
gru_cnn_model.ipynb
input
models
preprocess.ipynb
processed



In [34]:
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, MaxPool1D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.normalization import BatchNormalization

In [3]:
# Read the four preprocessed splits in a fixed order: train features, train
# labels, validation features, validation labels.
train_x_processed, train_y_processed, val_x_processed, val_y_processed = (
    pd.read_csv(csv_path)
    for csv_path in (
        './processed/train_x_processed',
        './processed/train_y_processed',
        './processed/val_x_processed',
        './processed/val_y_processed',
    )
)

In [4]:
# Vocabulary cap handed to the tokenizer (`num_words`) and reused as the
# Embedding input dimension further down.
max_features = 10000
# Fixed sequence length every comment is padded/truncated to.
maxlen = 200

# `pd.read_csv` turns empty text fields into NaN (a float). The Keras tokenizer
# expects strings, so replace missing comments with '' and force str before
# tokenising. Rows that already hold strings are unaffected.
train_texts = train_x_processed.content.fillna('').astype(str).tolist()
val_texts = val_x_processed.content.fillna('').astype(str).tolist()

# The vocabulary is fitted on the training split only; the validation split is
# encoded with that same vocabulary.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_texts)

train_x_token = tokenizer.texts_to_sequences(train_texts)
val_x_token = tokenizer.texts_to_sequences(val_texts)

train_X = sequence.pad_sequences(train_x_token, maxlen=maxlen)
val_X = sequence.pad_sequences(val_x_token, maxlen=maxlen)

In [46]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score built from rounded precision and recall.

    Only a per-batch value is computed, not a dataset-wide one.

    Predictions and targets are clipped to [0, 1] and rounded, so each entry
    counts as either selected or not selected:

    * recall    = true positives / positives present in ``y_true``
    * precision = true positives / positives present in ``y_pred``

    ``K.epsilon()`` is added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    # Entries that are positive in both the target and the rounded prediction.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    selected_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_score = hits / (selected_positives + eps)
    recall_score = hits / (actual_positives + eps)

    harmonic_core = (precision_score * recall_score) / (precision_score + recall_score + eps)
    return 2 * harmonic_core

In [10]:
# GRU + CNN classifier: embedding -> two stacked bidirectional GRUs ->
# 1D convolution -> global max pooling -> dense head with 80 sigmoid outputs.
model = Sequential([
    Embedding(max_features, 128, input_length=maxlen),
    Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.1)),
    Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.1)),
    Conv1D(64, kernel_size=3, kernel_initializer='glorot_uniform', padding='valid'),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(50, activation='relu'),
    Dropout(0.2),
    Dense(80, activation="sigmoid"),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', f1])

In [52]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the current `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 128)          1280000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200, 256)          197376    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 198, 64)           49216     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                3250      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
__________

In [13]:
# Training hyper-parameters for this run.
batch_size = 32
epochs = 2

# `validation_data` is passed as a tuple `(x_val, y_val)`, the form the Keras
# `fit` documentation specifies; a list is not guaranteed to be unpacked the
# same way across Keras versions.
model.fit(train_X, train_y_processed,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(val_X, val_y_processed))

Train on 105000 samples, validate on 15000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x142265eb8>

In [44]:
from keras.models import load_model

In [48]:
# Rebind `model` to a checkpoint restored from disk. `custom_objects` maps the
# metric name 'f1' to the `f1` function defined in this notebook.
# NOTE(review): no cell visible here writes gru_cnn_6epochs.h5 — confirm where
# this checkpoint comes from.
model = load_model('./models/gru_cnn_6epochs.h5', custom_objects={'f1':f1})

In [51]:
# Training hyper-parameters for this continuation run.
batch_size = 32
epochs = 2

# `validation_data` is passed as a tuple `(x_val, y_val)`, the form the Keras
# `fit` documentation specifies; a list is not guaranteed to be unpacked the
# same way across Keras versions.
model.fit(train_X, train_y_processed,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(val_X, val_y_processed))

Train on 105000 samples, validate on 15000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x14c064e48>

In [50]:
# Save the current `model` to ./models/gru_cnn_8epochs.h5.
model.save('./models/gru_cnn_8epochs.h5')

In [18]:
# Run the current model over the padded validation sequences and keep the raw outputs.
val_prediction = model.predict(val_X)

In [19]:
# Inspect the raw output vector for the validation row at index 1.
val_prediction[1]

array([9.9687237e-01, 7.8557758e-04, 4.8008442e-04, 2.4861668e-03,
       9.9025118e-01, 1.2394754e-04, 4.1677093e-04, 9.1625769e-03,
       9.9789268e-01, 9.9889957e-04, 9.6633163e-04, 1.4306314e-03,
       8.8452309e-01, 2.6132138e-02, 3.6940087e-02, 3.4728110e-02,
       9.9391705e-01, 1.2250890e-03, 4.4313348e-03, 3.2886972e-03,
       9.9962080e-01, 1.1678530e-04, 2.7501522e-04, 2.6853738e-04,
       9.8723340e-01, 4.9626878e-03, 6.5971520e-03, 5.7990672e-03,
       1.1054547e-01, 5.5491991e-02, 2.8927964e-01, 6.1916935e-01,
       8.5887027e-01, 1.3877025e-02, 2.1185970e-02, 8.4489748e-02,
       9.9730790e-01, 6.4987963e-04, 3.5331638e-03, 1.7852376e-03,
       9.9775946e-01, 2.8790708e-03, 3.0096381e-03, 9.0758991e-04,
       9.9729532e-01, 2.1226762e-03, 3.2921514e-04, 7.2395697e-04,
       9.9449509e-01, 3.7263811e-03, 1.6867676e-03, 2.1181754e-03,
       9.9088895e-01, 1.1975194e-02, 1.9541115e-03, 1.9001622e-03,
       4.8325518e-01, 4.6592545e-02, 6.5133207e-02, 3.3793047e

In [24]:
def get_prediction(row_index, label_index, predictions=None, group_size=4):
    """Return the block of scores belonging to one label group of one row.

    Each prediction row is treated as consecutive groups of ``group_size``
    scores; ``label_index`` selects a group using 1-based numbering.

    Args:
        row_index: Index of the row in ``predictions``.
        label_index: 1-based index of the label group to extract.
        predictions: Row-indexable collection of prediction vectors. Defaults
            to the notebook-level ``val_prediction``.
        group_size: Number of consecutive scores per label group (default 4).

    Returns:
        The slice of ``group_size`` scores for the requested group.

    Raises:
        ValueError: If ``label_index`` is smaller than 1. Such values would
            otherwise silently yield an empty or unrelated slice.
    """
    if label_index < 1:
        raise ValueError(
            "label_index is 1-based and must be >= 1, got %r" % (label_index,)
        )
    if predictions is None:
        # Fall back to the notebook global, as the original helper did.
        predictions = val_prediction
    start = (label_index - 1) * group_size
    return predictions[row_index][start:start + group_size]

In [30]:
# Show the first group of four scores (label_index=1) for validation row 23.
get_prediction(23, 1)

array([0.00528253, 0.01328158, 0.01953338, 0.98854136], dtype=float32)

In [35]:
# Deeper variant: embedding -> one bidirectional GRU -> conv / max-pool / conv
# -> global max pooling -> two batch-normalised dense blocks -> 80 sigmoid outputs.
model = Sequential([
    Embedding(max_features, 128, input_length=maxlen),
    Bidirectional(GRU(128, return_sequences=True, recurrent_dropout=0.1)),
    Conv1D(64, kernel_size=3, kernel_initializer='glorot_uniform', padding='valid'),
    MaxPool1D(pool_size=3),
    Conv1D(64, kernel_size=3, kernel_initializer='glorot_uniform', padding='valid'),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(80, activation="sigmoid"),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', f1])

In [36]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the current `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 128)          1280000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 200, 256)          197376    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 198, 64)           49216     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 66, 64)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 64, 64)            12352     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 64)                0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
__________

In [37]:
# Training hyper-parameters for this run.
batch_size = 32
epochs = 4

# `validation_data` is passed as a tuple `(x_val, y_val)`, the form the Keras
# `fit` documentation specifies; a list is not guaranteed to be unpacked the
# same way across Keras versions.
model.fit(train_X, train_y_processed,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(val_X, val_y_processed))

Train on 105000 samples, validate on 15000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1479e25c0>

In [38]:
# Save the current `model` to ./models/gru_cnn_deeper_4epochs.h5.
model.save('./models/gru_cnn_deeper_4epochs.h5')

In [40]:
# Training hyper-parameters for this continuation run.
# NOTE(review): the recorded output under this cell shows "Epoch 1/2" and
# "Epoch 2/2", which does not match epochs = 4 — confirm how many epochs ran.
batch_size = 32
epochs = 4

# `validation_data` is passed as a tuple `(x_val, y_val)`, the form the Keras
# `fit` documentation specifies; a list is not guaranteed to be unpacked the
# same way across Keras versions.
model.fit(train_X, train_y_processed,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(val_X, val_y_processed))

Train on 105000 samples, validate on 15000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x149fcc4a8>

In [41]:
# Save the current `model` to ./models/gru_cnn_deeper_8epochs.h5.
model.save('./models/gru_cnn_deeper_8epochs.h5')

In [42]:
# Training hyper-parameters for this continuation run.
batch_size = 32
epochs = 4

# `validation_data` is passed as a tuple `(x_val, y_val)`, the form the Keras
# `fit` documentation specifies; a list is not guaranteed to be unpacked the
# same way across Keras versions.
model.fit(train_X, train_y_processed,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(val_X, val_y_processed))

Train on 105000 samples, validate on 15000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x149fccf28>

In [43]:
# Save the current `model` to ./models/gru_cnn_deeper_12epochs.h5.
model.save('./models/gru_cnn_deeper_12epochs.h5')