In [16]:
#Import the packages
import numpy as np
import pandas as pd
import os
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [17]:
# Pre-trained word-vector file that the embedding index is later built from
EMBEDDING_FILE = 'glove.840B.300d.txt'

# Load the training and testing sets (train.csv first, then test.csv)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [18]:
# Label columns read from the training set, in the order used for y_train
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Lower-cased comment text for both splits; no label is given in the testing dataset
X_train, X_test = (frame["comment_text"].str.lower() for frame in (train, test))

# Training labels as a NumPy array with one column per entry of label_columns
y_train = train[label_columns].values

In [19]:
# Sizes shared by the tokenizer, the padding step and the embedding matrix
max_features = 100000  # vocabulary cap handed to the tokenizer as num_words
maxlen = 150           # fixed length of every padded comment sequence
embed_size = 300       # number of dimensions of each word vector

In [20]:
#Define a class for model evaluation
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair: the inputs passed to ``model.predict`` and the
        true labels passed to ``roc_auc_score``.
    interval : int
        The score is computed at the end of every epoch whose 0-based index
        is a multiple of ``interval``.

    Raises
    ------
    ValueError
        If ``validation_data`` cannot be unpacked into exactly two items
        (this includes the empty default).
    """

    def __init__(self, validation_data=(), interval=1):
        # super() must name this class: super(Callback, self) starts the MRO
        # lookup *after* Callback and therefore skips Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print the ROC-AUC for this epoch.

        ``logs`` is accepted to match the callback signature and is not used;
        its default is ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is 0-based; report it 1-based to line up with Keras' own log
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [21]:
# Fit a single tokenizer on the training and testing comments together
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts([*X_train, *X_test])

# Turn each comment into its token sequence (X_train / X_test are rebound here)
X_train, X_test = (tok.texts_to_sequences(comments) for comments in (X_train, X_test))

# Bring every sequence to the same length, i.e. maxlen
x_train, x_test = (sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test))

In [22]:
#Create the complete embedding index from the embedding file: each word maps to its embed_size-dim vector.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right at most embed_size times so that the last
        # embed_size fields are the coefficients and everything before them is
        # the token. Some pre-trained files (glove.840B is known for this)
        # contain tokens that themselves include spaces; splitting on every
        # space would push token fragments into the float conversion below.
        values = line.rstrip().rsplit(' ', embed_size)
        if len(values) != embed_size + 1:
            # Blank, header or otherwise malformed line: no usable vector
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [23]:
# Word -> integer index mapping learned by the tokenizer
word_index = tok.word_index

# Prepare the embedding matrix for this dataset: one row per index, all zeros to start
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for word, i in word_index.items():
    # Indices at or beyond max_features are out of range and get no row
    if i < max_features:
        vector = embeddings_index.get(word)
        # Words not found in the embedding index keep their all-zero row
        if vector is not None:
            embedding_matrix[i] = vector

In [10]:
#Define a function to use RandomSearchCV for Keras models to tune parameters
def create_model(learn_rate = 0.01, momentum = 0, init_mode='uniform', dropout_rate=0.1, weight_constraint=0):
    """Build and compile the model: frozen embeddings -> BiGRU -> Conv1D -> pooled -> 6 sigmoids.

    Parameters
    ----------
    learn_rate : float
        Learning rate passed to the Adam optimizer.
    momentum : float
        Accepted so existing search grids keep working, but not used: Adam
        takes no ``momentum`` argument and passing one raises TypeError.
        NOTE(review): map this to ``beta_1`` or switch to SGD if momentum is
        really meant to be tuned.
    init_mode : str
        Kernel initializer of the Conv1D layer.
    dropout_rate : float
        Input and recurrent dropout of the GRU.
    weight_constraint : int or float
        Max-norm limit for the Conv1D kernel. 0 (the default) means "no
        constraint", because a max-norm of 0 would force the kernel to zero.

    Returns
    -------
    The compiled Keras ``Model`` (binary cross-entropy loss, accuracy metric).

    Reads the notebook-level ``maxlen`` and ``embedding_matrix``.
    """
    # Size the embedding layer from the matrix itself so the layer and its
    # weights always agree, even when the vocabulary is smaller than max_features.
    vocab_size, vector_size = embedding_matrix.shape

    # `constraints` is already imported from keras; the bare `maxnorm` name was not.
    kernel_constraint = constraints.max_norm(weight_constraint) if weight_constraint else None

    sequence_input = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix], trainable=False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))(x)
    x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer=init_mode, kernel_constraint=kernel_constraint)(x)
    # Summarise the convolved sequence two ways and join the results
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)

    # Callbacks are deliberately not created here: they were never returned or
    # used, and building them referenced `filepath` before it was defined.
    #Compile model
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=learn_rate, decay=1e-6), metrics=['accuracy'])
    return model

# Seed NumPy's global generator (the module is imported as `np`, not `numpy`)
seed = 666
np.random.seed(seed)

batch_size = 64
epochs = 5
# Hold out 10% of the training data for validation during fitting
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)

#Create model
model = KerasClassifier(build_fn=create_model, verbose=0, epochs=epochs, batch_size=batch_size)

#Define the search space; each key matches a keyword argument of create_model
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
weight_constraint = [0, 1, 2, 3, 4, 5]
dropout_rate = [0.1, 0.2, 0.3]

param_grid = dict(learn_rate=learn_rate, momentum=momentum, init_mode=init_mode, weight_constraint=weight_constraint, dropout_rate=dropout_rate)

# RandomizedSearchCV names its search space `param_distributions` (`param_grid`
# belongs to GridSearchCV). random_state pins which n_iter candidates are drawn.
rand = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=3, n_jobs=-1, random_state=seed)



In [12]:
#Train the model
#Training the model takes a lot of time, so the best weights are checkpointed to disk and loaded back before predicting.
#The model weights can be downloaded via: https://drive.google.com/open?id=1EACiAZMv1PcQKDUwEKoWLAaMPKulrGcL
filepath="Weights.hdf5"

# The callbacks have to exist at notebook scope (and `filepath` before them)
# because they are handed to fit() below.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)
callbacks_list = [ra_val,checkpoint, early]

# Extra keyword arguments are forwarded by the search to each estimator's fit()
rand.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks = callbacks_list,verbose=1)

# The search object has no load_weights(), and its predict() accepts only X;
# use the Keras model held by the refit best estimator instead. Calling the
# Keras model directly also returns the six sigmoid outputs per sample.
best_model = rand.best_estimator_.model
best_model.load_weights(filepath)
print('Predicting....')
y_pred = best_model.predict(x_test,batch_size=1024,verbose=1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
 ROC-AUC - epoch: 1 - score: 0.986515
Epoch 00001: val_acc improved from -inf to 0.98279, saving model to best.hdf5
Epoch 2/4
 ROC-AUC - epoch: 2 - score: 0.988919
Epoch 00002: val_acc improved from 0.98279 to 0.98395, saving model to best.hdf5
Epoch 3/4
 ROC-AUC - epoch: 3 - score: 0.988741
Epoch 00003: val_acc did not improve
Epoch 4/4
 ROC-AUC - epoch: 4 - score: 0.988571
Epoch 00004: val_acc did not improve
Predicting....


In [13]:
# Write the predictions into the sample submission's label columns and save the result
target_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submission = pd.read_csv('sample_submission.csv')
submission[target_columns] = y_pred
submission.to_csv('submission.csv', index=False)