Importing libraries

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SimpleRNN, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Loading training and test data

In [2]:
# quick and dirty way to change the current working directory to root (/toxic-comment-classification)
# you should run this at least once just to be certain
from os import chdir, path, getcwd
if getcwd().endswith("src"):
    chdir(path.pardir)
if path.isfile("checkcwd"):
    print("Success")
else:
    raise Exception("Something went wrong. cwd=" + getcwd())
root_path = os.getcwd()

Success


In [3]:
path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'
TRAIN_DATA_FILE=f'{path}{comp}train.csv.zip'
TEST_DATA_FILE=f'{path}{comp}test.csv.zip'
CLEAN_TRAIN_DATA_FILE=f'{clean_data_path}data_train_cleaned_vanilla.txt'
CLEAN_TEST_DATA_FILE=f'{clean_data_path}data_test_cleaned_vanilla.txt'
SAMPLE_SUBMISSION=f'{path}{comp}sample_submission.csv.zip'

In [4]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

def read_from_file(filename):
    with open(filename, 'r') as f:
        return f.read().splitlines()

list_sentences_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = read_from_file(CLEAN_TEST_DATA_FILE)

Tokenization

In [5]:
embed_size = 128 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

In [6]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

Create a validation set

In [7]:
# from sklearn.model_selection import train_test_split
# X_train, X_val, y_train, y_val = train_test_split(X_t, y, test_size=0.1)

Training and evaluation

In [7]:
def get_model(layertype='RNN', use_dropout=False, dropout_rate=0.1):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    if layertype == 'RNN' and not use_dropout:
        x = Bidirectional(SimpleRNN(50, return_sequences=True))(x)
    elif layertype == 'LSTM' and not use_dropout:
        x = Bidirectional(LSTM(50, return_sequences=True))(x)
    elif layertype == 'GRU' and not use_dropout:
        x = Bidirectional(GRU(50, return_sequences=True))(x)
    elif layertype == 'RNN' and use_dropout:
        x = Bidirectional(SimpleRNN(50, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))(x)
    elif layertype == 'LSTM' and use_dropout:
        x = Bidirectional(LSTM(50, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))(x)
    elif layertype == 'GRU' and use_dropout:
        x = Bidirectional(GRU(50, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['AUC'])
    return model

In [9]:
from keras.callbacks import Callback, ModelCheckpoint

# class CustomCallback(Callback):
#     def on_train_begin(self, logs={}):
#         self.losses = []
#         self.aucs = []

#     def on_batch_end(self, batch, logs={}):
#         if batch % 500 == 0:
#             loss, auc = self.model.evaluate(X_val, y_val, verbose=0)
#             self.losses.append(loss)
#             self.aucs.append(auc)
#             print(f'\nEvaluation at batch {batch}: Loss = {loss}, AUC = {auc}\n')

# # Instantiate the custom callback
# custom_callback = CustomCallback()
file_path = "best_model.keras"
checkpoint = ModelCheckpoint(file_path, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [10]:
import matplotlib.pyplot as plt

def plot_loss(callback):
    plt.figure(figsize=(12,6))
    plt.plot(callback.losses)
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('500 batch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    
def plot_auc(callback):
    plt.figure(figsize=(12,6))
    plt.plot(callback.aucs)
    plt.title('model AUC')
    plt.ylabel('AUC')
    plt.xlabel('500 batch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

def plot_loss_auc(history):
    plot_loss(history)
    plot_auc(history)

Build RNN model

In [11]:
model = get_model('RNN', True, 0.1)

In [12]:
model.summary()

In [13]:
batch_size = 32
epochs = 2
history = model.fit(X_t, y, batch_size=batch_size, epochs=epochs, callbacks=[checkpoint], verbose=1)

Epoch 1/2
[1m4986/4987[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - AUC: 0.8815 - loss: 0.1062
Epoch 1: loss improved from inf to 0.07295, saving model to best_model.keras
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 26ms/step - AUC: 0.8815 - loss: 0.1062
Epoch 2/2
[1m4985/4987[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 24ms/step - AUC: 0.9819 - loss: 0.0483
Epoch 2: loss improved from 0.07295 to 0.04889, saving model to best_model.keras
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 24ms/step - AUC: 0.9819 - loss: 0.0483


In [18]:
model.load_weights(file_path)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test
sample_submission.to_csv(root_path + '/kaggle/working/' + 'rnn_dropout_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 52ms/step


Build Lstm model

In [19]:
del model
model = get_model('LSTM', True, 0.1)
model.summary()

In [20]:
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, callbacks=[checkpoint], verbose=1)

Epoch 1/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - AUC: 0.9258 - loss: 0.0869
Epoch 1: loss did not improve from 0.04889
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 30ms/step - AUC: 0.9258 - loss: 0.0869
Epoch 2/2
[1m4986/4987[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - AUC: 0.9842 - loss: 0.0458
Epoch 2: loss improved from 0.04889 to 0.04469, saving model to best_model.keras
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 32ms/step - AUC: 0.9842 - loss: 0.0458


<keras.src.callbacks.history.History at 0x26603290c90>

In [21]:
model.load_weights(file_path)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test
sample_submission.to_csv(root_path + '/kaggle/working/' + 'lstm_dropout_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 146ms/step


Build Gru model

In [22]:
del model
model = get_model('GRU', True, 0.1)
model.summary()

In [23]:
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, callbacks=[checkpoint], verbose=1)

Epoch 1/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - AUC: 0.9333 - loss: 0.0827
Epoch 1: loss did not improve from 0.04469
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 45ms/step - AUC: 0.9333 - loss: 0.0827
Epoch 2/2
[1m4986/4987[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 55ms/step - AUC: 0.9845 - loss: 0.0466
Epoch 2: loss did not improve from 0.04469
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 55ms/step - AUC: 0.9845 - loss: 0.0466


<keras.src.callbacks.history.History at 0x266058c9690>

In [24]:
model.load_weights(file_path)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test
sample_submission.to_csv(root_path + '/kaggle/working/' + 'gru_dropout_submission.csv', index=False)

ValueError: A total of 3 objects could not be loaded. Example error message for object <GRUCell name=gru_cell, built=True>:

The shape of the target variable and the shape of the target value in `variable.assign(value)` must match. variable.shape=(128, 150), Received: value.shape=(128, 200). Target variable: <KerasVariable shape=(128, 150), dtype=float32, path=bidirectional_2/backward_gru/gru_cell/kernel>

List of objects that could not be loaded:
[<GRUCell name=gru_cell, built=True>, <GRUCell name=gru_cell, built=True>, <keras.src.optimizers.adam.Adam object at 0x000002660354D250>]

#### Todo

RNN
LSTM
GRU