# This notebook is heavily inspired by: *https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout-lb-0-048*

In [25]:
import sys 
import os
import re
import codecs

import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import initializers, regularizers, constraints, optimizers, layers

In [31]:
#basic config parameters
embed_size = 50
max_features = 20000
maxlen = 100

In [26]:
#Read data and replace missing values

train = pd.read_csv("train.csv")
train_de = pd.read_csv("train_de.csv") 
train_fr = pd.read_csv("train_fr.csv")
train_es = pd.read_csv("train_es.csv")
test = pd.read_csv("test.csv")

train = train.append(train_de, ignore_index=True)
train = train.append(train_fr, ignore_index=True)
train = train.append(train_es, ignore_index=True)

train = train.fillna("_na_")
test = test.fillna("_na_")

list_sentences_train = train["comment_text"].values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes]
list_sentences_test = test["comment_text"].values

def translation(df, dest, column="comment_text" ):
    """
    functions gets one df and translates 
    the specified column into the destined language and back
    returns extended dataframe
    """
    print("Start translation...")
    translator = Translator()
    temp = []
    for i in range(len(df)): 
        if i%10000 == 0:
            print("Translating entry {} of {} entries.".format(i, len(df)))
        intermediate = translator.translate(df[column].iloc[i], src='en', dest=dest)
        temp.append(translator.translate(str(intermediate), src=dest, dest='en'))
        
    return pd.Series(temp)
    
def build_and_append(df, list_classes, to_append):
    """
    function gets the df to which to append, the list of classes
    and the translated Series which shall be appended
    """
    
    to_append=pd.DataFrame(to_append)
    for cl in list_classes:
        to_append[cl] = df[cl].copy()
        
    df = df.append(to_append, ignore_index=True)

from googletrans import Translator
x = translation(train, 'ja')

<p>Standard Keras preprocessing, each comment is turned into a list of words indexes of equal length (including truncation/padding)</p>

In [32]:
tokenizer = Tokenizer(num_words=max_features, filters='"()+,.;=[\\]^`{}\t\n',)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [34]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(0.5, 0.25, (nb_words, embed_size))

<p>Simple bidirectional LSTM with two fully connected layers</p>

In [36]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(GRU(100, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))(x)
x = Dropout(0.2)(x)
x = Bidirectional(GRU(50, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(6, activation='sigmoid')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
filepath = r"C:\WM\ToxicComments"
save = r"C:\WM\ToxicComments\save.hd5f"
tb = r"C:\WM\ToxicComments\tensorboard"

callbacks = [
    EarlyStopping(monitor = 'loss', min_delta=0, patience=1, verbose=10, mode='min'),
    ModelCheckpoint(filepath=save, monitor = 'loss', verbose=1, save_best_only=True),
    TensorBoard(log_dir=tb, histogram_freq=1)
]

model.fit(X_t, y, batch_size=250, epochs=20, callbacks=callbacks, validation_split=0.1)

Train on 574455 samples, validate on 63829 samples
Epoch 1/20
Epoch 00001: loss improved from inf to 0.08497, saving model to C:\WM\ToxicComments\save.hd5f
Epoch 2/20
Epoch 00002: loss improved from 0.08497 to 0.05264, saving model to C:\WM\ToxicComments\save.hd5f
Epoch 3/20
Epoch 00003: loss improved from 0.05264 to 0.04851, saving model to C:\WM\ToxicComments\save.hd5f
Epoch 4/20

In [38]:
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("submission_LSTM5.csv", index=False)

