https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:
import time
import os
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import constants as ct
from kaggletoxicity.keras_utils import ExtraValMetric, KaggleToxicityValMetric
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import GRU, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Vocabulary cap (used by the tokenizer and the embedding layer) and the
# fixed length every token sequence is padded/truncated to.
max_features = 20000  # original trailing note: 20000
maxlen = 500  # original trailing note: 200

# Read the train and test CSVs, in that order, from the project's data folder.
train, test = (
    pd.read_csv(os.path.join(ct.DATA_FOLDER, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)
# Optional quick-iteration subsample (disabled):
# train = train.sample(frac=0.05, random_state=0)

In [None]:
# Label columns; `y` holds one binary column per label, in this order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment strings; missing comments are replaced by a placeholder token.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# The vocabulary is fitted on the training comments only.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))


def _tokenize_and_pad(sentences):
    """Return (token id sequences, array pre-padded to `maxlen`) for `sentences`."""
    token_ids = tokenizer.texts_to_sequences(sentences)
    padded = sequence.pad_sequences(token_ids, padding='pre', maxlen=maxlen)
    return token_ids, padded


list_tokenized_train, X_t = _tokenize_and_pad(list_sentences_train)
list_tokenized_test, X_te = _tokenize_and_pad(list_sentences_test)

In [None]:
def get_bidirectional_model(embed_size,
                            input_shape,
                            n_neurons,
                            dropout_rate=0.1,
                            opt_alg='nadam',
                            n_classes=6,
                            vocab_size=None):
    """Build and compile a bidirectional-GRU multi-label text classifier.

    Architecture: Embedding -> Bidirectional(GRU, return_sequences=True)
    -> GlobalMaxPool1D -> Dropout -> Dense(relu) -> Dropout -> Dense(sigmoid).

    Parameters
    ----------
    embed_size : int
        Output dimension of the embedding layer.
    input_shape : int
        Length of each input token sequence.
    n_neurons : int
        Units of the GRU wrapped by ``Bidirectional`` and of the hidden
        dense layer.
    dropout_rate : float, default 0.1
        Rate used by both dropout layers.
    opt_alg : default 'nadam'
        Passed unchanged as ``optimizer`` to ``model.compile``.
    n_classes : int, default 6
        Number of sigmoid output units (one per label).
    vocab_size : int or None, default None
        Input dimension of the embedding layer. ``None`` falls back to the
        module-level ``max_features`` (the value the tokenizer is created
        with), which is the behaviour callers had before this parameter
        existed.

    Returns
    -------
    keras.models.Model
        The model, compiled with binary cross-entropy loss.
    """
    if vocab_size is None:
        # Backward-compatible default: read the notebook-level vocabulary cap.
        vocab_size = max_features

    inp = Input(shape=(input_shape,))
    x = Embedding(vocab_size, embed_size)(inp)
    # return_sequences=True keeps every timestep so the pooling layer can
    # take the maximum over the sequence axis.
    x = Bidirectional(GRU(n_neurons, return_sequences=True))(x)  # recurrent_dropout=0.1
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(n_neurons, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Independent sigmoid per label: the labels are not mutually exclusive.
    x = Dense(n_classes, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=opt_alg)

    return model

In [None]:
embedding_size = 128

model = get_bidirectional_model(embed_size=embedding_size,
                                input_shape=maxlen,
                                n_neurons=50,
                                dropout_rate=0.1)

# Training schedule.
batch_size = 1024
epochs = 50
val_prop = 0.2    # passed to fit() as validation_split
es_patience = 5   # epochs without improvement before early stopping
rlr_patience = 2  # epochs without improvement before the learning rate is halved
rlr_cooldown = 4  # epochs to wait after a reduction before monitoring resumes

file_path = os.path.join(ct.MODELS_FOLDER, "weights_base_best.hdf5")

# NOTE(review): presumably this project callback computes ROC AUC on the
# validation data and publishes it in the epoch logs as 'val_roc_auc' —
# confirm in kaggletoxicity.keras_utils. It is kept first in callbacks_list
# so that the callbacks below, which all monitor that key, run after it.
extraval = KaggleToxicityValMetric()

# All three monitors track an AUC, i.e. higher is better, so each one must
# use mode='max'.
early_stop = EarlyStopping(monitor='val_roc_auc', patience=es_patience, mode='max', verbose=0)
checkpoint = ModelCheckpoint(file_path, monitor='val_roc_auc', verbose=0, mode='max', save_best_only=True)
# mode must be explicit here: the default 'auto' only selects "max" when the
# monitored name contains 'acc'. For 'val_roc_auc' it would fall back to
# "min" and cut the learning rate while the AUC is still rising.
reduce_lr = ReduceLROnPlateau(monitor='val_roc_auc',
                              mode='max',
                              factor=0.5,
                              patience=rlr_patience,
                              cooldown=rlr_cooldown,
                              min_lr=1e-4)

callbacks_list = [extraval, checkpoint, early_stop, reduce_lr]
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=val_prop, callbacks=callbacks_list)

In [None]:
# Restore the weights checkpointed at `file_path` during training
# (ModelCheckpoint was configured with save_best_only=True), then score
# the padded test sequences.
model.load_weights(file_path)

y_test = model.predict(X_te)

In [2]:
# Load the sample submission; it serves as the template whose label columns
# are later overwritten with the model's predictions.
sample_submission = pd.read_csv(os.path.join(ct.DATA_FOLDER, 'sample_submission.csv'))

In [4]:
# Preview the first rows of the submission template.
sample_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [None]:
# Overwrite the template's label columns with the predictions. The columns of
# y_test are taken to follow list_classes, the order `y` was built with.
# NOTE(review): the recorded previews of sample_submission still show the 0.5
# placeholders and this cell has no execution count, so it appears not to have
# been run in the saved session — run it before writing the CSV.
sample_submission[list_classes] = y_test

In [10]:
# Minute-resolution local timestamp used to tag this run's output file.
moment = time.strftime("%Y_%m_%d_%H_%M", time.localtime())
moment

'2018_02_17_20_30'

In [11]:
# Write the submission frame to the results folder under a name tagged with
# the timestamp; the DataFrame index is not written.
file_name = 'results_%s.csv' % moment
sample_submission.to_csv(
    path_or_buf=os.path.join(ct.RESULTS_FOLDER, file_name),
    index=False,
)

In [12]:
# Sanity check: (rows, columns) of the submission frame.
sample_submission.shape

(153164, 7)

In [34]:
# Reload a previously written results file to compare it with the in-memory
# submission frame.
# NOTE(review): the file name is hard-coded ('..._20_25') and differs from the
# timestamp recorded for this run ('2018_02_17_20_30') — confirm this is the
# intended file.
results_check = pd.read_csv(os.path.join(ct.RESULTS_FOLDER, 'results_2018_02_17_20_25.csv'))
results_check.shape

(153164, 7)

In [35]:
# Count the rows whose `id` matches between the two frames (compared row for
# row, aligned on the index). A count equal to the number of rows means both
# list the same ids in the same order.
results_check.id.eq(sample_submission.id).sum()

153164

In [36]:
# Preview the in-memory submission frame.
# NOTE(review): the recorded output below still shows 0.5 in every label
# column, i.e. the template values rather than model predictions.
sample_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [37]:
# Preview the first rows of the reloaded results file.
results_check.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997685,0.2578337,0.991582,0.063447,0.913571,0.1139697
1,0000247867823ef7,8.5e-05,2.062972e-07,2.7e-05,3e-06,4e-06,9.025246e-07
2,00013b17ad220c46,0.00051,3.04314e-06,0.000113,4e-05,3.4e-05,7.326921e-06
3,00017563c3f7919a,0.000203,1.29092e-06,7e-05,1e-05,1.4e-05,3.779153e-06
4,00017695ad8997eb,2e-05,9.277559e-07,1e-05,8e-06,2e-06,3.007853e-06
