In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the
# Embedding layer's input dimension.
max_features = 20000
# Fixed sequence length: passed to pad_sequences and used as the model's
# Input shape.
maxlen = 100
# Seed for the row shuffle below, so repeated runs see the same row order.
shuffle_seed = 42

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
subm = pd.read_csv('sample_submission.csv')
# Shuffle the training rows. Without a fixed random_state the order changes on
# every run, which also changes which rows the later
# model.fit(..., validation_split=0.1) call holds out for validation.
train = train.sample(frac=1, random_state=shuffle_seed)


In [3]:
# Placeholder substituted for missing comment text in both splits.
missing_comment_token = "CVxTz"

# Target columns; `y` holds one column per entry, in this order.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y = train[list_classes].values

# Raw comment strings as arrays, with missing entries replaced by the placeholder.
list_sentences_train = train["comment_text"].fillna(missing_comment_token).values
list_sentences_test = test["comment_text"].fillna(missing_comment_token).values


In [4]:
# Fit the vocabulary on the training comments only, then convert both splits
# to sequences of integer token ids using that same fitted tokenizer.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)


In [5]:
# Preview only the first few tokenized comments. Echoing the whole list prints
# every token id of every training comment on its own line.
list_tokenized_train[:3]

[[1637,
  1069,
  173,
  5,
  3809,
  15,
  28,
  4,
  7,
  856,
  173,
  5,
  3809,
  15,
  1,
  868,
  3,
  1,
  257,
  17,
  100,
  20,
  2654,
  44,
  33,
  204,
  175,
  6],
 [1,
  702,
  7,
  39,
  8,
  9016,
  1,
  3652,
  702,
  17,
  10,
  6206,
  397,
  15,
  13,
  8,
  1,
  138,
  1097,
  17,
  1,
  702,
  10,
  270,
  1554,
  25,
  801,
  8638,
  4,
  22,
  6,
  209,
  35,
  112,
  1160,
  19,
  10,
  46,
  110,
  616,
  2,
  1,
  3652,
  1097,
  2,
  35,
  1,
  3652,
  702,
  8,
  298,
  1,
  138,
  10,
  891,
  17,
  1,
  250,
  7,
  311,
  11,
  56,
  16,
  7,
  13,
  8,
  14,
  1,
  113,
  84,
  8153,
  451,
  13,
  4,
  4305,
  2017,
  2,
  69,
  128,
  11,
  826,
  31,
  7,
  214,
  11,
  8,
  30,
  746,
  965,
  4,
  328,
  11,
  1737,
  15,
  90,
  1962,
  10,
  750,
  3014],
 [435,
  126,
  130,
  539,
  1167,
  571,
  625,
  182,
  127,
  6,
  12,
  1638,
  21,
  28,
  20,
  733,
  807,
  4,
  11,
  43,
  55,
  331,
  25,
  185,
  45,
  81,
  1,
  588,
  12,
  54,

In [1]:
# Bring every tokenized comment to exactly `maxlen` ids. The padding and
# truncation sides are spelled out here; "pre" is what pad_sequences uses
# when they are omitted.
X_t = sequence.pad_sequences(
    list_tokenized_train,
    maxlen=maxlen,
    padding="pre",
    truncating="pre",
)
X_te = sequence.pad_sequences(
    list_tokenized_test,
    maxlen=maxlen,
    padding="pre",
    truncating="pre",
)

NameError: name 'sequence' is not defined

In [15]:
def get_model(embed_size=128, lstm_units=50, dense_units=50, dropout_rate=0.1, num_classes=6):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    The architecture is: Embedding -> Bidirectional LSTM (returning the full
    sequence) -> global max pooling over time -> Dropout -> Dense(relu) ->
    Dropout -> Dense(sigmoid). Input length and vocabulary size come from the
    module-level ``maxlen`` and ``max_features``.

    Calling ``get_model()`` with no arguments reproduces the original
    hard-coded configuration.

    Args:
        embed_size: Dimensionality of the learned token embeddings.
        lstm_units: Units per direction in the bidirectional LSTM.
        dense_units: Width of the hidden fully connected layer.
        dropout_rate: Rate used for the LSTM input dropout and for both
            Dropout layers.
        num_classes: Number of sigmoid outputs; should equal the number of
            label columns in the target array.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer, and the accuracy metric.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps every timestep so the pooling layer can
    # take the max over the whole sequence.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout_rate))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # One independent sigmoid per label, paired with binary cross-entropy.
    x = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


# Training hyper-parameters passed to model.fit.
batch_size, epochs = 32, 3

# Fresh, compiled network built with get_model's default configuration.
model = get_model()

In [16]:
# Save weights to disk only when validation accuracy improves on the best
# value seen so far. To track validation loss instead, use
# monitor='val_loss' with mode='min'.
file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [17]:
# Stop training once validation accuracy has failed to improve for `patience`
# consecutive epochs. To watch validation loss instead, use
# monitor="val_loss" with mode="min".
# NOTE(review): patience=20 is larger than the 3 epochs configured for this
# run, so this callback looks unable to trigger as written; confirm intent.
early = EarlyStopping(
    monitor="val_acc",
    mode="max",
    patience=20,
)

In [18]:
# Train with 10% of the training data held out for validation; the callbacks
# checkpoint the best weights and apply the early-stopping rule.
callbacks_list = [checkpoint, early]
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Reload the best checkpointed weights (not necessarily the last epoch's)
# before scoring the test set.
model.load_weights(file_path)
y_test = model.predict(X_te)

Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.98148, saving model to weights_base.best.hdf5
Epoch 2/3

Epoch 00002: val_acc improved from 0.98148 to 0.98256, saving model to weights_base.best.hdf5
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98256
