# Improved Glove Rnn

In [1]:
import tensorflow as tf; 
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, GlobalMaxPool1D, SimpleRNN, GRU
from keras.models import Model
# from keras import initializers, regularizers, constraints, optimizers, layers

In [3]:
%cd ../../

d:\python\Toxic-comment-classification


In [4]:
embed_size = 300 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use
embed_size_str = str(embed_size)

In [5]:
path = 'kaggle/input/'
submission_path = 'kaggle/working/rnn_submission/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
clean_data_path = 'clean_data/'
EMBEDDING_FILE=f'{path}glove_embeddings/glove.6B.' + embed_size_str + 'd.txt'
TRAIN_DATA_FILE=f'{path}{comp}train.csv.zip'
TEST_DATA_FILE=f'{path}{comp}test.csv.zip'
CLEAN_TRAIN_DATA_FILE=f'{clean_data_path}data_train_cleaned_vanilla2.txt'
CLEAN_TEST_DATA_FILE=f'{clean_data_path}data_test_cleaned_vanilla2.txt'
SAMPLE_SUBMISSION=f'{path}{comp}sample_submission.csv.zip'
checkpoint_path = 'model_checkpoint/rnn'

#### Read data

In [6]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

def read_from_file(filename):
    with open(filename, 'r') as f:
        return f.read().splitlines()    
    
list_sentences_train = read_from_file(CLEAN_TRAIN_DATA_FILE)
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = read_from_file(CLEAN_TEST_DATA_FILE)

Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

In [7]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
# import json, io

# tokenizer_json = tokenizer.to_json()
# with io.open(checkpoint_path + 'tokenizer.json', 'w', encoding='utf-8') as f:
#     f.write(json.dumps(tokenizer_json, ensure_ascii=False))

Read the glove word vectors (space delimited strings) into a dictionary from word->vector.

In [8]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe. We'll use the same mean and stdev of embeddings the GloVe has when generating the random init.

In [9]:
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(-0.0039050116, 0.38177028)

In [10]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

#### Create Model

In [11]:
from keras.metrics import AUC

def get_model(layertype='RNN', use_dropout=False, dropout_rate=0.1):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    if layertype == 'RNN' and not use_dropout:
        x = Bidirectional(SimpleRNN(60, return_sequences=True))(x)
    elif layertype == 'LSTM' and not use_dropout:
        x = Bidirectional(LSTM(60, return_sequences=True))(x)
    elif layertype == 'GRU' and not use_dropout:
        x = Bidirectional(GRU(60, return_sequences=True))(x)
    elif layertype == 'RNN' and use_dropout:
        x = Bidirectional(SimpleRNN(60, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))(x)
    elif layertype == 'LSTM' and use_dropout:
        x = Bidirectional(LSTM(60, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))(x)
    elif layertype == 'GRU' and use_dropout:
        x = Bidirectional(GRU(60, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(60, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[AUC(name='auc')])
    return model

#### Create Validation Set

In [13]:
# from sklearn.model_selection import train_test_split
# X_train, X_val, y_train, y_val = train_test_split(X_t, y, test_size=0.1, random_state=7)

#### Modelcheckpoint

Use val_auc to monitor when not submitting

In [12]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

# class CustomCallback(Callback):
#     def on_train_begin(self, logs={}):
#         self.losses = []
#         self.aucs = []

#     def on_batch_end(self, batch, logs={}):
#         if batch % 500 == 0:
#             loss, auc = self.model.evaluate(X_val, y_val, verbose=0)
#             self.losses.append(loss)
#             self.aucs.append(auc)
#             print(f'\nEvaluation at batch {batch}: Loss = {loss}, AUC = {auc}\n')

# # Instantiate the custom callback
# custom_callback = CustomCallback()
rnn_checkpoint = ModelCheckpoint(checkpoint_path + 'glove' + embed_size_str + '_rnn.keras', monitor='val_auc', mode='max', save_best_only=True, verbose=1)
lstm_checkpoint = ModelCheckpoint(checkpoint_path + 'glove' + embed_size_str + '_lstm.keras', monitor='val_auc', mode='max', save_best_only=True, verbose=1)
gru_checkpoint = ModelCheckpoint(checkpoint_path + 'glove' + embed_size_str + '_gru.keras', monitor='val_auc', mode='max', save_best_only=True, verbose=1)
# early_stopping = EarlyStopping(monitor='loss', min_delta=0.0005, restore_best_weights=True)

In [15]:
# import matplotlib.pyplot as plt

# def plot_loss(callback):
#     plt.figure(figsize=(12,6))
#     plt.plot(callback.losses)
#     plt.title('model loss')
#     plt.ylabel('loss')
#     plt.xlabel('500 batch')
#     plt.legend(['train', 'test'], loc='upper left')
#     plt.show()
    
# def plot_auc(callback):
#     plt.figure(figsize=(12,6))
#     plt.plot(callback.aucs)
#     plt.title('model AUC')
#     plt.ylabel('AUC')
#     plt.xlabel('500 batch')
#     plt.legend(['train', 'test'], loc='upper left')
#     plt.show()

# def plot_loss_auc(history):
#     plot_loss(history)
#     plot_auc(history)

In [24]:
import gc
gc.collect()

In [25]:
model = get_model('RNN', True, 0.15)
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 100, 300)          6000000   
                                                                 
 bidirectional_3 (Bidirectio  (None, 100, 120)         43320     
 nal)                                                            
                                                                 
 global_max_pooling1d_3 (Glo  (None, 120)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_6 (Dropout)         (None, 120)               0         
                                                                 
 dense_6 (Dense)             (None, 60)                7260

In [26]:
batch_size = 32
epochs = 2
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, callbacks=[rnn_checkpoint], validation_split=0.1)

Epoch 1/2
  19/4488 [..............................] - ETA: 24:06 - loss: 0.2724 - auc: 0.5678

KeyboardInterrupt: 

And finally, get predictions for the test set and prepare a submission CSV:

In [19]:
model.load_weights(checkpoint_path + 'glove' + embed_size_str + '_rnn.keras')
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test
sample_submission.to_csv(submission_path + 'rnn_glove' + embed_size_str + '_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 79ms/step


Build Lstm model

In [27]:
del model
model = get_model('LSTM', True, 0.15)
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_4 (Embedding)     (None, 100, 300)          6000000   
                                                                 
 bidirectional_4 (Bidirectio  (None, 100, 120)         173280    
 nal)                                                            
                                                                 
 global_max_pooling1d_4 (Glo  (None, 120)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_8 (Dropout)         (None, 120)               0         
                                                                 
 dense_8 (Dense)             (None, 60)                7260

In [21]:
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, callbacks=[lstm_checkpoint])

Epoch 1/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - auc: 0.9554 - loss: 0.0681
Epoch 1: auc improved from -inf to 0.97706, saving model to model_checkpoint/rnnglove300_lstm.keras
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 47ms/step - auc: 0.9554 - loss: 0.0681
Epoch 2/2
[1m4986/4987[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 60ms/step - auc: 0.9863 - loss: 0.0417
Epoch 2: auc improved from 0.97706 to 0.98661, saving model to model_checkpoint/rnnglove300_lstm.keras
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 60ms/step - auc: 0.9863 - loss: 0.0417


<keras.src.callbacks.history.History at 0x1b8b89118d0>

In [22]:
model.load_weights(checkpoint_path + 'glove' + embed_size_str + '_lstm.keras')
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test
sample_submission.to_csv(submission_path + 'lstm_glove' + embed_size_str + '_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 320ms/step


Build GRU model

In [20]:
del model
model = get_model('GRU', False, 0.1)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 100, 300)          6000000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 120)         130320    
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 120)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_2 (Dropout)         (None, 120)               0         
                                                                 
 dense_2 (Dense)             (None, 60)                7260

In [21]:
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, callbacks=[gru_checkpoint], validation_split=0.1)

Epoch 1/2
Epoch 1: val_auc improved from -inf to 0.97988, saving model to model_checkpoint\rnnglove300_gru.keras
Epoch 2/2
Epoch 2: val_auc improved from 0.97988 to 0.98020, saving model to model_checkpoint\rnnglove300_gru.keras


<keras.callbacks.History at 0x1d3a9b53fa0>

In [25]:
model.load_weights(checkpoint_path + 'glove' + embed_size_str + '_gru.keras')
sample_submission = pd.read_csv(SAMPLE_SUBMISSION)
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission[list_classes] = y_test
sample_submission.to_csv(submission_path + 'gru_glove' + embed_size_str + '_submission.csv', index=False)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 149ms/step
