# Improved LSTM baseline

This kernel is a somewhat improved version of [Keras - Bidirectional LSTM baseline](https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-051) along with some additional documentation of the steps. (NB: this notebook has been re-run on the new test set.)

In [33]:
import sys, os, re, csv, codecs, time, numpy as np, pandas as pd

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from kaggletoxicity.keras_utils import KaggleToxicityValMetric
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import constants as ct
from tqdm import tqdm

In [71]:
# the initial block is copied from creating_word_vectors_with_word2vec.ipynb
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure

# new!
import string
from nltk.corpus import stopwords
from nltk.stem.porter import *
from gensim.models.phrases import Phraser, Phrases
from keras.preprocessing.text import one_hot
from gensim.models import FastText

%matplotlib inline

In [7]:
# Locations of the raw train/test CSV files inside the project's data folder.
TRAIN_DATA_FILE, TEST_DATA_FILE = (
    os.path.join(ct.DATA_FOLDER, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [35]:
# Read the raw train and test tables.
train, test = pd.read_csv(TRAIN_DATA_FILE), pd.read_csv(TEST_DATA_FILE)

# Target label columns; `y` holds one list of six labels per training row.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values.tolist()

# Comment texts, with missing values replaced by the "_na_" placeholder.
# The train texts are a Python list; the test texts stay a NumPy array.
list_sentences_train = train["comment_text"].fillna("_na_").values.tolist()
list_sentences_test = test["comment_text"].fillna("_na_").values

## Preprocessing

In [36]:
# NOTE(review): embed_size and max_features are not referenced by any other cell
# visible in this notebook (the FastText models below are trained with size=64) --
# confirm whether they are still needed.
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
# maxlen is the length every index sequence is padded/truncated to by pad_sequences
# and the input length of the Keras models below.
maxlen = 500 # max number of words in a comment to use

In [37]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer models and the
# 'stopwords' corpus. The value of the last call is displayed as the cell output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/infinitemonkeys/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/infinitemonkeys/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# English stop words followed by every single ASCII punctuation character.
STOPWORDS = stopwords.words('english')
STOPWORDS = STOPWORDS + list(string.punctuation)

In [63]:
def tokenized_text_preprocesser(text, stem=False, stopwords=True):
    """Lower-case a tokenized sentence, optionally dropping stop words and stemming.

    Args:
        text: iterable of string tokens (one sentence / comment).
        stem: when True, every kept token is reduced with NLTK's Porter stemmer.
        stopwords: when True, tokens whose lower-cased form appears in the
            module-level ``STOPWORDS`` collection are removed. (Inside this
            function the name shadows the ``nltk.corpus.stopwords`` module.)

    Returns:
        A new list with the processed tokens; the input is never modified.
    """
    # Lower-case first: STOPWORDS is matched against the lower-cased token, so
    # capitalised stop words ("The", "And") are filtered as well. Lower-casing
    # is applied for every flag combination, including stem=False/stopwords=False.
    tokens = [w.lower() for w in text]

    if stopwords:
        # Set membership is O(1) per token instead of scanning a list.
        blocked = set(STOPWORDS)
        tokens = [w for w in tokens if w not in blocked]

    if stem:
        # The stemmer is only instantiated when it is actually needed.
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(w) for w in tokens]

    return tokens

### Tokenization

In [42]:
# Split every training comment into tokens with NLTK; tqdm reports progress.
list_sentences_train_tokenized = [
    word_tokenize(comment) for comment in tqdm(list_sentences_train)
]

100%|██████████| 159571/159571 [01:23<00:00, 1912.36it/s]


In [43]:
# Split every test comment into tokens with NLTK; tqdm reports progress.
list_sentences_test_tokenized = [
    word_tokenize(comment) for comment in tqdm(list_sentences_test)
]

100%|██████████| 153164/153164 [01:15<00:00, 2040.88it/s]


In [44]:
# Lower-case the training tokens and drop tokens that are a single punctuation
# character. The punctuation lookup is built once (as a set) instead of
# re-creating a list for every token.
punctuation_chars = set(string.punctuation)
lower_sents_train = [
    [w.lower() for w in s if w not in punctuation_chars]
    for s in tqdm(list_sentences_train_tokenized)
]

100%|██████████| 159571/159571 [00:12<00:00, 13127.61it/s]


In [45]:
# Lower-case the test tokens and drop tokens that are a single punctuation
# character. The punctuation lookup is built once (as a set) instead of
# re-creating a list for every token.
punctuation_chars = set(string.punctuation)
lower_sents_test = [
    [w.lower() for w in s if w not in punctuation_chars]
    for s in tqdm(list_sentences_test_tokenized)
]

100%|██████████| 153164/153164 [00:10<00:00, 15143.35it/s]


In [47]:
# Learn bigram collocations once, from the training sentences, and reuse the same
# phrase model for the test set. Fitting a second, independent Phrases model on the
# test sentences can merge different word pairs in train and test, so identical text
# would be tokenized differently at training and at prediction time.
lower_bigram_train = Phraser(Phrases(lower_sents_train, min_count=32, threshold=64))
lower_bigram_test = lower_bigram_train

In [48]:
# Merge the detected bigrams (e.g. 'new', 'york' -> 'new_york') in each training sentence.
clean_sents_train = [lower_bigram_train[sentence] for sentence in tqdm(lower_sents_train)]

100%|██████████| 159571/159571 [00:28<00:00, 5665.06it/s]


In [49]:
# Merge the detected bigrams in each test sentence.
clean_sents_test = [lower_bigram_test[sentence] for sentence in tqdm(lower_sents_test)]

100%|██████████| 153164/153164 [00:23<00:00, 6454.31it/s]


In [51]:
# Peek at the first training comment right after NLTK tokenization
# (original casing and punctuation tokens are still present).
list_sentences_train_tokenized[0]

['Explanation',
 'Why',
 'the',
 'edits',
 'made',
 'under',
 'my',
 'username',
 'Hardcore',
 'Metallica',
 'Fan',
 'were',
 'reverted',
 '?',
 'They',
 'were',
 "n't",
 'vandalisms',
 ',',
 'just',
 'closure',
 'on',
 'some',
 'GAs',
 'after',
 'I',
 'voted',
 'at',
 'New',
 'York',
 'Dolls',
 'FAC',
 '.',
 'And',
 'please',
 'do',
 "n't",
 'remove',
 'the',
 'template',
 'from',
 'the',
 'talk',
 'page',
 'since',
 'I',
 "'m",
 'retired',
 'now.89.205.38.27']

In [50]:
# The same comment after lower-casing, punctuation removal and bigram merging.
clean_sents_train[0]

['explanation',
 'why',
 'the',
 'edits',
 'made',
 'under',
 'my',
 'username',
 'hardcore',
 'metallica',
 'fan',
 'were',
 'reverted',
 'they',
 'were',
 "n't",
 'vandalisms',
 'just',
 'closure',
 'on',
 'some',
 'gas',
 'after',
 'i',
 'voted',
 'at',
 'new_york',
 'dolls',
 'fac',
 'and',
 'please',
 'do',
 "n't",
 'remove',
 'the',
 'template',
 'from',
 'the',
 'talk',
 'page',
 'since',
 'i',
 "'m",
 'retired',
 'now.89.205.38.27']

In [64]:
def _drop_stopwords(sentences):
    """Apply tokenized_text_preprocesser (no stemming, stop words removed) to each sentence."""
    return [tokenized_text_preprocesser(text, stem=False, stopwords=True) for text in sentences]


# Stop-word-free variants of the cleaned train/test sentences.
clean_sents_train_stop = _drop_stopwords(clean_sents_train)
clean_sents_test_stop = _drop_stopwords(clean_sents_test)

In [67]:
# clean_sents_train_stop_stem = [tokenized_text_preprocesser(text, stem=True, stopwords=True) for text in clean_sents_train]
# clean_sents_test_stop_stem = [tokenized_text_preprocesser(text, stem=True, stopwords=True) for text in clean_sents_test]

In [69]:
# clean_sents_train_stem = [tokenized_text_preprocesser(text, stem=True, stopwords=False) for text in clean_sents_train]
# clean_sents_test_stem = [tokenized_text_preprocesser(text, stem=True, stopwords=False) for text in clean_sents_test]

In [70]:
def _as_comment_frame(sentences, index):
    """Wrap tokenized sentences in a one-column ('comment_text') DataFrame with the given index."""
    return pd.DataFrame({'comment_text': sentences}, index=index)


# Cleaned sentences, aligned with the original train/test row indices.
train_df = _as_comment_frame(clean_sents_train, train.index)
test_df = _as_comment_frame(clean_sents_test, test.index)

# Same, with stop words removed.
train_stop_df = _as_comment_frame(clean_sents_train_stop, train.index)
test_stop_df = _as_comment_frame(clean_sents_test_stop, test.index)

# Stemmed variants are currently disabled (their inputs are commented out above).
# train_stop_stem_df = pd.DataFrame({'comment_text': clean_sents_train_stop_stem}, index=train.index)
# test_stop_stem_df = pd.DataFrame({'comment_text': clean_sents_test_stop_stem}, index=test.index)

# train_stem_df = pd.DataFrame({'comment_text': clean_sents_train_stem}, index=train.index)
# test_stem_df = pd.DataFrame({'comment_text': clean_sents_test_stem}, index=test.index)

In [77]:
# Persist the processed frames in the project's HDF5 store, one key per frame.
# train_df.to_csv(os.path.join(ct.DATA_FOLDER, 'processed_train_df.csv' ), index=False)
train_df.to_hdf(ct.STORE_PATH, 'processed_train_df')
# Key fixed from the misspelled 'processed_testn_df' so it matches the naming of the
# other three keys. NOTE(review): any reader that used the old key must be updated.
test_df.to_hdf(ct.STORE_PATH, 'processed_test_df')
train_stop_df.to_hdf(ct.STORE_PATH, 'processed_train_stop_df')
test_stop_df.to_hdf(ct.STORE_PATH, 'processed_test_stop_df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['comment_text']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


### Generating the *embeddings*
https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb

In [78]:
# Skip-gram (sg=1) FastText vectors trained on the cleaned train + test sentences.
emb = FastText(
    sentences=clean_sents_train + clean_sents_test,
    size=64,
    window=10,
    min_count=10,
    workers=8,
    sg=1,
    seed=0,
)

In [79]:
# Same FastText configuration, trained on the stop-word-free sentences.
emb_stop = FastText(
    sentences=clean_sents_train_stop + clean_sents_test_stop,
    size=64,
    window=10,
    min_count=10,
    workers=8,
    sg=1,
    seed=0,
)

In [81]:
# Store both FastText models in the project's tools folder.
for fasttext_model, emb_file in ((emb, 'emb.w2v'), (emb_stop, 'emb_stop.w2v')):
    fasttext_model.save(os.path.join(ct.DATA_TOOLS_FOLDER, emb_file))

- https://codekansas.github.io/blog/2016/gensim.html
- http://adventuresinmachinelearning.com/gensim-word2vec-tutorial/

## Algorithm

In [82]:
from keras.layers import Embedding
from keras.engine import Input

def word2vec_embedding_layer(embeddings_path):
    """Create a Keras Embedding layer initialised from a saved ``.npy`` weight matrix.

    Args:
        embeddings_path: path to a NumPy ``.npy`` file holding a 2-D array with
            one embedding vector per row.

    Returns:
        An ``Embedding`` layer whose ``input_dim``/``output_dim`` match the array's
        shape and whose initial weights are the loaded array.
    """
    # Pass the path straight to np.load so NumPy opens and closes the file itself;
    # the previous ``np.load(open(path, 'rb'))`` never closed the file handle.
    weights = np.load(embeddings_path)
    layer = Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights])
    return layer

In [86]:
def _embedding_from_word_vectors(word_vectors):
    """Build a Keras Embedding initialised from a (vocab_size, dim) word-vector matrix."""
    return Embedding(input_dim=word_vectors.shape[0],
                     output_dim=word_vectors.shape[1],
                     weights=[word_vectors])


# The index sequences fed to the models are built from `wv.vocab[word].index`, so
# the Embedding rows must be the per-word matrix `wv.vectors`. The previously loaded
# `*.vectors_ngrams.npy` files hold the character n-gram bucket vectors, whose rows do
# not correspond to vocabulary indices.
embedding_layer = _embedding_from_word_vectors(emb.wv.vectors)
embedding_stop_layer = _embedding_from_word_vectors(emb_stop.wv.vectors)

Set some basic config parameters:

Read in our data and replace missing values:

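Preprocessing: map each token to its index in the FastText vocabulary (tokens that are not in the vocabulary are skipped), then use Keras `pad_sequences` to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).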

### Normal

In [126]:
def convert_data_to_index(string_data, wv):
    """Translate tokens into their vocabulary indices, skipping unknown tokens.

    Args:
        string_data: iterable of tokens.
        wv: object exposing a ``vocab`` mapping from token to an entry with an
            ``index`` attribute (e.g. the ``wv`` of a trained embedding model).

    Returns:
        List of integer indices, in input order, for the tokens found in ``wv.vocab``.

    NOTE(review): indices are used as-is, so index 0 is a real word; pad_sequences
    presumably also pads with 0 -- confirm whether that overlap is intended.
    """
    vocabulary = wv.vocab
    return [vocabulary[token].index for token in string_data if token in vocabulary]

In [129]:
# Token -> vocabulary-index sequences for the full (stop words kept) corpora.
train_index_seqs = [convert_data_to_index(sentence, emb.wv) for sentence in clean_sents_train]
test_index_seqs = [convert_data_to_index(sentence, emb.wv) for sentence in clean_sents_test]

# Pad/truncate every sequence to exactly `maxlen` indices.
X_t = pad_sequences(train_index_seqs, maxlen=maxlen)
X_te = pad_sequences(test_index_seqs, maxlen=maxlen)

In [131]:
# Bidirectional-LSTM classifier: embedding -> BiLSTM -> max-pool over time ->
# dense(50, relu) -> dropout -> six sigmoid outputs (one per label).
inp = Input(shape=(maxlen,))
layer_stack = [
    embedding_layer,
    Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    GlobalMaxPool1D(),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
]

# Chain the layers in order, starting from the input tensor.
x = inp
for layer in layer_stack:
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam')

Now we're ready to fit our model! Use `validation_split` when not submitting.

In [132]:
# Sanity check of the padded training matrix: (number of comments, maxlen).
X_t.shape

(159571, 500)

In [133]:
# Training hyper-parameters.
batch_size = 1024
epochs = 50
val_prop = 0.05     # fraction of the training rows passed as validation_split
es_patience = 5     # EarlyStopping patience (epochs)
rlr_patience = 2    # ReduceLROnPlateau patience (epochs)
rlr_cooldown = 4    # ReduceLROnPlateau cooldown (epochs)

# Best weights (by 'val_roc_auc') are written here and reloaded before predicting.
file_path = os.path.join(ct.MODELS_FOLDER, "weights_base_best_fast_text_emb.hdf5")
# Project callback; presumably computes the validation ROC AUC and publishes it as
# 'val_roc_auc' for the callbacks below -- TODO confirm in kaggletoxicity.keras_utils.
extraval = KaggleToxicityValMetric()
early_stop = EarlyStopping(monitor='val_roc_auc', patience=es_patience, mode='max',  verbose=0)
checkpoint = ModelCheckpoint(file_path, monitor='val_roc_auc', verbose=0, mode='max',   save_best_only=True)
# mode='max' is required here as well: with the default mode='auto' Keras treats any
# monitor whose name does not contain 'acc' as a quantity to minimise, so the
# learning rate would be halved while the AUC was still improving.
reduce_lr = ReduceLROnPlateau(monitor='val_roc_auc',
                              mode='max',
                              factor=0.5,
                              patience=rlr_patience,
                              cooldown=rlr_cooldown,
                              min_lr=1e-4)

# extraval is listed first, ahead of the callbacks that monitor 'val_roc_auc'.
callbacks_list = [extraval, checkpoint, early_stop, reduce_lr]
model.fit(X_t, np.array(y), batch_size=batch_size, epochs=epochs, validation_split=val_prop, callbacks=callbacks_list)

Train on 151592 samples, validate on 7979 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50


<keras.callbacks.History at 0x7f52f240c6d8>

And finally, get predictions for the test set and prepare a submission CSV:

In [134]:
# Reload the weights saved by ModelCheckpoint at `file_path`, then predict the six
# label probabilities for every test comment.
model.load_weights(file_path)
y_test = model.predict([X_te], batch_size=1024, verbose=1)



In [135]:
# Load the submission template from the data folder.
sample_submission_path = os.path.join(ct.DATA_FOLDER, 'sample_submission.csv')
sample_submission = pd.read_csv(sample_submission_path)

In [136]:
# Overwrite the six label columns of the template with the model's predictions.
sample_submission[list_classes] = y_test

In [137]:
# Timestamp (local time, minute resolution) used to give the results file a unique name.
moment = time.strftime("%Y_%m_%d_%H_%M")
moment

'2018_02_18_06_46'

In [138]:
# Write the submission to the results folder under a timestamped name.
file_name = 'results_fasttext_%s.csv' % moment
submission_path = os.path.join(ct.RESULTS_FOLDER, file_name)
sample_submission.to_csv(submission_path, index=False)

### Without stopwords

In [139]:
# Index the stop-word-free sentences, i.e. the same corpora emb_stop was trained on.
# (Previously the unfiltered sentences were passed and stop words only disappeared
# implicitly because they are missing from emb_stop's vocabulary.)
# NOTE: this rebinds X_t / X_te from the "Normal" section above.
X_t = pad_sequences([convert_data_to_index(w, emb_stop.wv) for w in clean_sents_train_stop], maxlen=maxlen)
X_te = pad_sequences([convert_data_to_index(w, emb_stop.wv) for w in clean_sents_test_stop], maxlen=maxlen)

In [140]:
# Same architecture as the "Normal" model, on top of the stop-word-free embedding:
# embedding -> BiLSTM -> max-pool over time -> dense(50, relu) -> dropout -> six sigmoids.
inp = Input(shape=(maxlen,))
layer_stack = [
    embedding_stop_layer,
    Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    GlobalMaxPool1D(),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
]

# Chain the layers in order, starting from the input tensor.
x = inp
for layer in layer_stack:
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam')

In [141]:
# Training hyper-parameters.
batch_size = 1024
epochs = 50
val_prop = 0.05     # fraction of the training rows passed as validation_split
es_patience = 5     # EarlyStopping patience (epochs)
rlr_patience = 2    # ReduceLROnPlateau patience (epochs)
rlr_cooldown = 4    # ReduceLROnPlateau cooldown (epochs)

# Best weights (by 'val_roc_auc') are written here and reloaded before predicting.
file_path = os.path.join(ct.MODELS_FOLDER, "weights_base_best_fast_text_emb_stop.hdf5")
# Project callback; presumably computes the validation ROC AUC and publishes it as
# 'val_roc_auc' for the callbacks below -- TODO confirm in kaggletoxicity.keras_utils.
extraval = KaggleToxicityValMetric()
early_stop = EarlyStopping(monitor='val_roc_auc', patience=es_patience, mode='max',  verbose=0)
checkpoint = ModelCheckpoint(file_path, monitor='val_roc_auc', verbose=0, mode='max',   save_best_only=True)
# mode='max' is required here as well: with the default mode='auto' Keras treats any
# monitor whose name does not contain 'acc' as a quantity to minimise, so the
# learning rate would be halved while the AUC was still improving.
reduce_lr = ReduceLROnPlateau(monitor='val_roc_auc',
                              mode='max',
                              factor=0.5,
                              patience=rlr_patience,
                              cooldown=rlr_cooldown,
                              min_lr=1e-4)

# extraval is listed first, ahead of the callbacks that monitor 'val_roc_auc'.
callbacks_list = [extraval, checkpoint, early_stop, reduce_lr]
model.fit(X_t, np.array(y), batch_size=batch_size, epochs=epochs, validation_split=val_prop, callbacks=callbacks_list)

Train on 151592 samples, validate on 7979 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


<keras.callbacks.History at 0x7f52f5447e48>

In [142]:
# Reload the weights saved by ModelCheckpoint at `file_path`, then predict the six
# label probabilities for every test comment.
model.load_weights(file_path)
y_test = model.predict([X_te], batch_size=1024, verbose=1)



In [143]:
# Reload a fresh copy of the submission template from the data folder.
sample_submission_path = os.path.join(ct.DATA_FOLDER, 'sample_submission.csv')
sample_submission = pd.read_csv(sample_submission_path)

In [144]:
# Overwrite the six label columns of the template with the model's predictions.
sample_submission[list_classes] = y_test

In [145]:
# Timestamp (local time, minute resolution) used to give the results file a unique name.
moment = time.strftime("%Y_%m_%d_%H_%M")
moment

'2018_02_18_07_27'

In [146]:
# Write the stop-word-free submission to the results folder under a timestamped name.
file_name = 'results_fasttext_stop_%s.csv' % moment
submission_path = os.path.join(ct.RESULTS_FOLDER, file_name)
sample_submission.to_csv(submission_path, index=False)