Importing libraries

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SimpleRNN, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Loading training and test data

In [2]:
# Move the working directory up two levels; the relative data/output paths
# defined in the next cell are presumably resolved from there — TODO confirm.
# NOTE(review): a relative `%cd` is not idempotent — re-running this cell
# moves up another two levels. Run it exactly once per kernel session.
%cd ../../

Success


In [3]:
# Input locations (all paths are relative to the current working directory).
path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'

# Raw competition files.
TRAIN_DATA_FILE = f'{path}{comp}train.csv.zip'
TEST_DATA_FILE = f'{path}{comp}test.csv.zip'
SAMPLE_SUBMISSION = f'{path}{comp}sample_submission.csv.zip'

# Pre-cleaned comment text, one comment per line.
CLEAN_TRAIN_DATA_FILE = f'{clean_data_path}data_train_cleaned_vanilla2.txt'
CLEAN_TEST_DATA_FILE = f'{clean_data_path}data_test_cleaned_vanilla2.txt'

# Output locations.
checkpoint_path = 'model_checkpoint/'
submission_path = 'kaggle/working/rnn_submission/'

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folders exist before any training/prediction cell writes to them.
for _output_dir in (checkpoint_path, submission_path):
    os.makedirs(_output_dir, exist_ok=True)

In [4]:
# Raw competition CSVs; `train` also supplies the label columns used below.
train, test = (
    pd.read_csv(csv_file) for csv_file in (TRAIN_DATA_FILE, TEST_DATA_FILE)
)

def read_from_file(filename, encoding='utf-8'):
    """Read a text file and return its lines without line terminators.

    Args:
        filename: path of the text file to read.
        encoding: text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            locale default (e.g. cp1252 on Windows).

    Returns:
        list[str]: one entry per line, as produced by ``str.splitlines``.
        An empty file yields an empty list.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.read().splitlines()

# Label matrix: one binary column per toxicity class, taken from the raw CSV.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Pre-cleaned comment text, one comment per line.
list_sentences_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
list_sentences_test = read_from_file(CLEAN_TEST_DATA_FILE)

# The cleaned text is paired with labels/rows purely by position, so a
# line-count mismatch would silently misalign features and labels (or only
# fail much later, at fit/submission time). Fail early with a clear message.
assert len(list_sentences_train) == len(train), (
    f'{CLEAN_TRAIN_DATA_FILE} has {len(list_sentences_train)} lines '
    f'but {TRAIN_DATA_FILE} has {len(train)} rows'
)
assert len(list_sentences_test) == len(test), (
    f'{CLEAN_TEST_DATA_FILE} has {len(list_sentences_test)} lines '
    f'but {TEST_DATA_FILE} has {len(test)} rows'
)

Tokenization

In [5]:
# Text/embedding hyper-parameters.
maxlen = 100           # number of tokens kept per comment
max_features = 20_000  # vocabulary cap, i.e. number of rows in the embedding matrix
embed_size = 128       # dimensionality of each word vector

In [6]:
# Build the vocabulary from the training comments only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train)

# Training split: words -> integer ids -> fixed-length sequences.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test split, encoded with the same (train-fitted) tokenizer.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

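Create a validation set (the explicit `train_test_split` below is currently disabled; `validation_split=0.1` in `model.fit` holds out the validation data instead)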

In [7]:
# from sklearn.model_selection import train_test_split
# X_train, X_val, y_train, y_val = train_test_split(X_t, y, test_size=0.1)

Create model

In [8]:
from keras.metrics import AUC

def get_model(layertype='RNN', use_dropout=False, dropout_rate=0.1):
    """Build and compile a bidirectional recurrent multi-label classifier.

    Architecture: Embedding -> Bidirectional(recurrent layer, 50 units,
    ``return_sequences=True``) -> GlobalMaxPool1D -> Dropout -> Dense(50, relu)
    -> Dropout -> Dense(6, sigmoid).

    Reads the module-level ``maxlen``, ``max_features`` and ``embed_size``
    settings for the input length and the embedding shape.

    Args:
        layertype: recurrent cell to use: ``'RNN'`` (SimpleRNN), ``'LSTM'``
            or ``'GRU'``.
        use_dropout: when truthy, ``dropout_rate`` is also applied as the
            ``dropout`` and ``recurrent_dropout`` of the recurrent layer.
        dropout_rate: rate of the two ``Dropout`` layers in the dense head
            (always applied) and, with ``use_dropout``, of the recurrent layer.

    Returns:
        A compiled ``Model`` (binary cross-entropy loss, Adam optimizer,
        AUC metric reported as ``auc``).

    Raises:
        ValueError: if ``layertype`` is not one of the supported names.
            Previously an unknown name silently produced a model without any
            recurrent layer.
    """
    recurrent_layers = {'RNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}
    if layertype not in recurrent_layers:
        raise ValueError(
            f"Unknown layertype {layertype!r}; expected one of {sorted(recurrent_layers)}"
        )

    # Without `use_dropout` the recurrent layer keeps its own defaults.
    recurrent_kwargs = (
        {'dropout': dropout_rate, 'recurrent_dropout': dropout_rate}
        if use_dropout
        else {}
    )

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    recurrent = recurrent_layers[layertype](50, return_sequences=True, **recurrent_kwargs)
    x = Bidirectional(recurrent)(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    # The explicit metric name keeps the logged key stable ('auc' / 'val_auc')
    # so the checkpoint callbacks can monitor it for every model built here.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[AUC(name='auc')])
    return model

#### ModelCheckpoint

Monitor `val_auc` for checkpointing when not preparing a submission; it is only reported when `fit` is given validation data.

In [9]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

# class CustomCallback(Callback):
#     def on_train_begin(self, logs={}):
#         self.losses = []
#         self.aucs = []

#     def on_batch_end(self, batch, logs={}):
#         if batch % 500 == 0:
#             loss, auc = self.model.evaluate(X_val, y_val, verbose=0)
#             self.losses.append(loss)
#             self.aucs.append(auc)
#             print(f'\nEvaluation at batch {batch}: Loss = {loss}, AUC = {auc}\n')

# # Instantiate the custom callback
# custom_callback = CustomCallback()
# One checkpoint callback per architecture; each keeps only the epoch with the
# highest validation AUC.
_checkpoint_kwargs = dict(monitor='val_auc', mode='max', save_best_only=True, verbose=1)
rnn_checkpoint = ModelCheckpoint(f'{checkpoint_path}rnn.keras', **_checkpoint_kwargs)
lstm_checkpoint = ModelCheckpoint(f'{checkpoint_path}lstm.keras', **_checkpoint_kwargs)
gru_checkpoint = ModelCheckpoint(f'{checkpoint_path}gru.keras', **_checkpoint_kwargs)
# early_stopping = EarlyStopping(monitor='loss', min_delta=0.0005, restore_best_weights=True)   

In [10]:
# import matplotlib.pyplot as plt

# def plot_loss(callback):
#     plt.figure(figsize=(12,6))
#     plt.plot(callback.losses)
#     plt.title('model loss')
#     plt.ylabel('loss')
#     plt.xlabel('500 batch')
#     plt.legend(['train', 'test'], loc='upper left')
#     plt.show()
    
# def plot_auc(callback):
#     plt.figure(figsize=(12,6))
#     plt.plot(callback.aucs)
#     plt.title('model AUC')
#     plt.ylabel('AUC')
#     plt.xlabel('500 batch')
#     plt.legend(['train', 'test'], loc='upper left')
#     plt.show()

# def plot_loss_auc(history):
#     plot_loss(history)
#     plot_auc(history)

Build RNN model

In [21]:
# SimpleRNN variant with dropout enabled inside the recurrent layer.
model = get_model(layertype='RNN', use_dropout=True, dropout_rate=0.1)

In [22]:
# Print the summary of the RNN model built in the previous cell.
model.summary()

In [23]:
# Training settings; the LSTM and GRU training cells reuse these values.
epochs = 2
batch_size = 64

history = model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[rnn_checkpoint],
)

Epoch 1/2
[1m2494/2494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 24ms/step - auc: 0.8734 - loss: 0.1144
Epoch 2/2
[1m   7/2494[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m52s[0m 21ms/step - auc: 0.9657 - loss: 0.0578

  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m2494/2494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 26ms/step - auc: 0.9802 - loss: 0.0501


In [14]:
# Restore the checkpointed RNN weights and score the test set.
model.load_weights(f'{checkpoint_path}rnn.keras')
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Fill the submission template with the predicted class probabilities.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
sample_submission[list_classes] = y_test
sample_submission.to_csv(f'{submission_path}rnn_dropout_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 62ms/step


Build LSTM model

In [15]:
# Drop the previous model, then build the LSTM variant with recurrent dropout.
del model
model = get_model(layertype='LSTM', use_dropout=True, dropout_rate=0.1)
model.summary()

In [16]:
# Train the LSTM model with the shared batch size / epoch settings.
history = model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[lstm_checkpoint],
)

Epoch 1/2
[1m2243/2244[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 54ms/step - auc: 0.9084 - loss: 0.1006
Epoch 1: val_auc improved from -inf to 0.97499, saving model to model_checkpoint/lstm.keras
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 57ms/step - auc: 0.9084 - loss: 0.1006 - val_auc: 0.9750 - val_loss: 0.0507
Epoch 2/2
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - auc: 0.9834 - loss: 0.0454
Epoch 2: val_auc improved from 0.97499 to 0.97963, saving model to model_checkpoint/lstm.keras
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 75ms/step - auc: 0.9834 - loss: 0.0454 - val_auc: 0.9796 - val_loss: 0.0478


In [17]:
# Restore the checkpointed LSTM weights and score the test set.
model.load_weights(f'{checkpoint_path}lstm.keras')
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Fill the submission template with the predicted class probabilities.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
sample_submission[list_classes] = y_test
sample_submission.to_csv(f'{submission_path}lstm_dropout_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 172ms/step


Build GRU model

In [18]:
# Drop the previous model, then build the GRU variant with recurrent dropout.
del model
model = get_model(layertype='GRU', use_dropout=True, dropout_rate=0.1)
model.summary()

In [19]:
# Train the GRU model with the shared batch size / epoch settings.
history = model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[gru_checkpoint],
)

Epoch 1/2
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - auc: 0.9129 - loss: 0.0978
Epoch 1: val_auc improved from -inf to 0.97890, saving model to model_checkpoint/gru.keras
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 62ms/step - auc: 0.9129 - loss: 0.0978 - val_auc: 0.9789 - val_loss: 0.0497
Epoch 2/2
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - auc: 0.9840 - loss: 0.0457
Epoch 2: val_auc improved from 0.97890 to 0.98113, saving model to model_checkpoint/gru.keras
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 61ms/step - auc: 0.9840 - loss: 0.0457 - val_auc: 0.9811 - val_loss: 0.0475


In [20]:
# Restore the checkpointed GRU weights and score the test set.
model.load_weights(f'{checkpoint_path}gru.keras')
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Fill the submission template with the predicted class probabilities.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
sample_submission[list_classes] = y_test
sample_submission.to_csv(f'{submission_path}gru_dropout_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 170ms/step
