Importing libraries

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SimpleRNN, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Loading training and test data

In [2]:
%cd ../
root_path = os.getcwd()

d:\python\Toxic-comment-classification


In [3]:
# Directory fragments; every path below is relative to the working directory.
path = 'kaggle/input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'

# Zipped CSV files of the competition.
TRAIN_DATA_FILE = path + comp + 'train.csv.zip'
TEST_DATA_FILE = path + comp + 'test.csv.zip'
SAMPLE_SUBMISSION = path + comp + 'sample_submission.csv.zip'

# Cleaned comment text (read line by line further down).
CLEAN_TRAIN_DATA_FILE = clean_data_path + 'data_train_cleaned_vanilla.txt'
CLEAN_TEST_DATA_FILE = clean_data_path + 'data_test_cleaned_vanilla.txt'

In [4]:
# Raw competition tables. `train` supplies the label columns; pandas infers
# the zip compression from the file suffix.
train = pd.read_csv(filepath_or_buffer=TRAIN_DATA_FILE)
test = pd.read_csv(filepath_or_buffer=TEST_DATA_FILE)

def read_from_file(filename, encoding=None):
    """Return the lines of a text file as a list of strings.

    Parameters
    ----------
    filename : str or path-like
        File to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        previous behaviour (platform locale encoding, which differs between
        Windows and Linux); pass e.g. ``'utf-8'`` for a reproducible read.

    Returns
    -------
    list of str
        File content split with ``str.splitlines``, line terminators removed.
        Note that ``splitlines`` also breaks on separators other than
        ``\\n`` (form feed, ``\\x85``, ``\\u2028`` ...), so such characters
        inside a line yield extra entries.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.read().splitlines()
    
# Multi-label targets: one binary column per toxicity class.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Cleaned comment text, one list entry per line of the file.
list_sentences_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
list_sentences_test = read_from_file(CLEAN_TEST_DATA_FILE)

# The cleaned text is paired with the CSV rows purely by position. Check the
# row counts now so a mismatch fails here instead of after minutes of
# training (or when the predictions are written to the submission frame).
assert len(list_sentences_train) == len(train), (
    f"{CLEAN_TRAIN_DATA_FILE} has {len(list_sentences_train)} lines "
    f"but train has {len(train)} rows"
)
assert len(list_sentences_test) == len(test), (
    f"{CLEAN_TEST_DATA_FILE} has {len(list_sentences_test)} lines "
    f"but test has {len(test)} rows"
)

Tokenization

In [5]:
# Sizes shared by the tokenizer, the padding step and the embedding layer.
embed_size = 50        # length of each word vector
max_features = 50_000  # vocabulary cap (number of rows of the embedding matrix)
maxlen = 100           # number of words kept per comment

In [6]:
# The vocabulary is learned from the training comments only and then applied
# unchanged to the test comments.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train)

# Training set: words -> integer ids -> fixed-length rows of `maxlen` ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test set: same two steps with the already fitted tokenizer.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

Build RNN model

In [7]:
def get_model(layertype='RNN', use_dropout=False, dropout_rate=0.1):
    """Build and compile a bidirectional recurrent multi-label classifier.

    Architecture: Embedding -> Bidirectional(recurrent layer, 50 units,
    ``return_sequences=True``) -> GlobalMaxPool1D -> Dropout ->
    Dense(50, relu) -> Dropout -> Dense(6, sigmoid).

    The input length, vocabulary size and embedding width come from the
    notebook-level ``maxlen``, ``max_features`` and ``embed_size``.

    Parameters
    ----------
    layertype : str
        Recurrent cell to use: ``'RNN'`` (SimpleRNN), ``'LSTM'`` or ``'GRU'``.
    use_dropout : bool
        When true, ``dropout_rate`` is also passed to the recurrent layer as
        both ``dropout`` and ``recurrent_dropout``. The two ``Dropout`` layers
        of the dense head are applied regardless of this flag.
    dropout_rate : float
        Rate for the head ``Dropout`` layers and, if ``use_dropout``, for the
        recurrent layer.

    Returns
    -------
    Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        ``accuracy`` and ``AUC`` metrics.

    Raises
    ------
    ValueError
        If ``layertype`` is not one of the supported names. Previously an
        unknown name silently produced a model without any recurrent layer.
    """
    # Resolve the recurrent class first so a typo fails before any layer is
    # created.
    if layertype == 'RNN':
        recurrent_cls = SimpleRNN
    elif layertype == 'LSTM':
        recurrent_cls = LSTM
    elif layertype == 'GRU':
        recurrent_cls = GRU
    else:
        raise ValueError(
            f"Unsupported layertype {layertype!r}; expected 'RNN', 'LSTM' or 'GRU'."
        )

    # Dropout arguments are only passed when requested, so the no-dropout
    # variant keeps the layer's own defaults.
    recurrent_kwargs = {'return_sequences': True}
    if use_dropout:
        recurrent_kwargs.update(dropout=dropout_rate, recurrent_dropout=dropout_rate)

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(recurrent_cls(50, **recurrent_kwargs))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # One sigmoid unit per label: the six classes are predicted independently.
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'AUC'])
    return model

In [8]:
# SimpleRNN variant with dropout enabled on the recurrent layer.
model = get_model(layertype='RNN', use_dropout=True, dropout_rate=0.1)

In [9]:
# Print Keras' textual summary of the current model.
model.summary()

In [10]:
# Training schedule; the later fit calls in this notebook reuse both values.
batch_size, epochs = 32, 2
model.fit(x=X_t, y=y, batch_size=batch_size, epochs=epochs)

Epoch 1/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 29ms/step - AUC: 0.8594 - accuracy: 0.8889 - loss: 0.1185
Epoch 2/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 31ms/step - AUC: 0.9792 - accuracy: 0.9935 - loss: 0.0514


<keras.src.callbacks.history.History at 0x2ed068241d0>

In [11]:
# Start from the sample submission and overwrite the six class columns with
# the model's predicted probabilities.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test

# to_csv does not create missing directories; make sure the output folder
# exists so freshly computed predictions are not lost to an OSError.
output_dir = os.path.join(root_path, 'kaggle', 'working')
os.makedirs(output_dir, exist_ok=True)
sample_submission.to_csv(os.path.join(output_dir, 'rnn_dropout_submission.csv'), index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 83ms/step


Build LSTM model

In [12]:
# Release the previous network, then build the LSTM variant with dropout.
del model
model = get_model(layertype='LSTM', use_dropout=True, dropout_rate=0.1)
model.summary()

In [13]:
# Same data and schedule as the first training run.
model.fit(x=X_t, y=y, batch_size=batch_size, epochs=epochs)

Epoch 1/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 39ms/step - AUC: 0.9112 - accuracy: 0.8573 - loss: 0.0952
Epoch 2/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 33ms/step - AUC: 0.9840 - accuracy: 0.9930 - loss: 0.0459


<keras.src.callbacks.history.History at 0x2ed44f6fe50>

In [14]:
# Start from the sample submission and overwrite the six class columns with
# the model's predicted probabilities.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test

# to_csv does not create missing directories; make sure the output folder
# exists so freshly computed predictions are not lost to an OSError.
output_dir = os.path.join(root_path, 'kaggle', 'working')
os.makedirs(output_dir, exist_ok=True)
sample_submission.to_csv(os.path.join(output_dir, 'lstm_dropout_submission.csv'), index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 97ms/step


Build GRU model

In [15]:
# Release the previous network, then build the GRU variant with dropout.
del model
model = get_model(layertype='GRU', use_dropout=True, dropout_rate=0.1)
model.summary()

In [16]:
# Same data and schedule as the first training run.
model.fit(x=X_t, y=y, batch_size=batch_size, epochs=epochs)

Epoch 1/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 33ms/step - AUC: 0.9273 - accuracy: 0.9115 - loss: 0.0875
Epoch 2/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 31ms/step - AUC: 0.9855 - accuracy: 0.9830 - loss: 0.0451


<keras.src.callbacks.history.History at 0x2ed41de2bd0>

In [17]:
# Start from the sample submission and overwrite the six class columns with
# the model's predicted probabilities.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test

# to_csv does not create missing directories; make sure the output folder
# exists so freshly computed predictions are not lost to an OSError.
output_dir = os.path.join(root_path, 'kaggle', 'working')
os.makedirs(output_dir, exist_ok=True)
sample_submission.to_csv(os.path.join(output_dir, 'gru_dropout_submission.csv'), index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 120ms/step


#### Todo

RNN
LSTM
GRU