# This notebook is heavily inspired by: *https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout-lb-0-048*

In [49]:
import sys 
import os
import re
import codecs

import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import initializers, regularizers, constraints, optimizers, layers

In [10]:
#basic config parameters
embed_size = 50
max_features = 20000
maxlen = 100

In [35]:
#Read data and replace missing values

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train = train.fillna("_na_")
test = test.fillna("_na_")
train["processed"] = text_preprocessing(train["comment_text"])
test["processed"] = text_preprocessing(test["comment_text"])

list_sentences_train = train["processed"].values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes]
list_sentences_test = test["proceessec"].values

Start preprocessing: 
0 of 159571
yehaa we making progress
10000 of 159571
20000 of 159571
30000 of 159571
40000 of 159571
50000 of 159571
60000 of 159571
70000 of 159571
80000 of 159571
90000 of 159571
100000 of 159571
110000 of 159571
120000 of 159571
130000 of 159571
140000 of 159571
150000 of 159571
Start preprocessing: 
0 of 153164
yehaa we making progress
10000 of 153164
20000 of 153164
30000 of 153164
40000 of 153164
50000 of 153164
60000 of 153164
70000 of 153164
80000 of 153164
90000 of 153164
100000 of 153164
110000 of 153164
120000 of 153164
130000 of 153164
140000 of 153164
150000 of 153164


KeyError: 'proceessec'

In [52]:
list_sentences_train2 = train["comment_text"].values
list_sentences_test2 = test["comment_text"].values

In [36]:
def text_preprocessing(text):
    from nltk.corpus import stopwords
    from nltk.tokenize import RegexpTokenizer
    from nltk.stem import WordNetLemmatizer
    
    tokenizer = RegexpTokenizer(r'\w+')
    stop = set(stopwords.words('english')) 
    lemmatizer = WordNetLemmatizer()
     
    processed = []   
    print("Start preprocessing: ")
    for i in range(len(text)):
        if i%10000 == 0:
            print("{} of {}".format(i, len(text)))
        temp = tokenizer.tokenize(text.iloc[i])
        temp = [lemmatizer.lemmatize(i) for i in temp if i not in stop]
        temp = (" ").join(temp)
        processed.append(temp)
        
    return processed

<p>Standard Keras preprocessing, each comment is turned into a list of words indexes of equal length (including truncation/padding)</p>

In [37]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [54]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train2))
list_tokenized_train2 = tokenizer.texts_to_sequences(list_sentences_train2)
list_tokenized_test2 = tokenizer.texts_to_sequences(list_sentences_test2)
X_t2 = pad_sequences(list_tokenized_train2, maxlen=maxlen)
X_te2 = pad_sequences(list_tokenized_test2, maxlen=maxlen)

In [38]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(0,1,(nb_words, embed_size))

<p>Simple bidirectional LSTM with two fully connected layers</p>

In [64]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(x)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(6, activation='sigmoid')(x)
model2 = Model(inputs=inp, outputs=x)
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [59]:
filepath = r"C:\Users\Wignand\Desktop\python\ML#\ToxicCommentClassifier"

callbacks = [
    EarlyStopping(monitor = 'binary_crossentropy', min_delta=0, patience=1, verbose=10, mode='auto'),
    ModelCheckpoint(filepath=filepath, monitor = 'binary_crossentropy', verbose=1, save_best_only=True),
    TensorBoard(log_dir=filepath)
]

model.fit(X_t, y, batch_size=32, epochs=10, callbacks=callbacks, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x220cb1c668>

In [None]:
filepath = r"C:\Users\Wignand\Desktop\python\ML#\ToxicCommentClassifier"

callbacks = [
    EarlyStopping(monitor = 'binary_crossentropy', min_delta=0, patience=1, verbose=10, mode='auto'),
    ModelCheckpoint(filepath=filepath, monitor = 'binary_crossentropy', verbose=1, save_best_only=True),
    TensorBoard(log_dir=filepath)
]

model2.fit(X_t2, y, batch_size=32, epochs=10, callbacks=callbacks, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

In [61]:
y_test = model.predict([X_te], batch_size=1024, verbose=1)
#y_test2 = model2.predict([X_te], batch_size=1024, verbose=1)
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("submission_LSTM3.csv", index=False)
#sample_submission2 = pd.read_csv("sample_submission.csv")
#sample_submission2[list_classes] = y_test2
#sample_submission2.to_csv("submission_LSTM4.csv", index=False)

