In [None]:
# Mount Google Drive at /content/drive; the dataset and embedding paths used
# later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score 
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model

Initialize parameters.

Please download the GloVe (840B.300d) embedding file from this link: https://nlp.stanford.edu/projects/glove/ and the dataset from this link: https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data

Please set the paths of the embedding file and the input data to match your own environment.

In [None]:
# --- Hyperparameters ---------------------------------------------------------
max_features=150000  # vocabulary cap: passed to the tokenizer as num_words and used as the Embedding input size
maxlen=150  # sequence length passed to pad_sequences and used as the model input length
embed_size=300  # embedding dimension (width of the embedding matrix)
batch_size = 128  # batch size used by model.fit
epochs = 4  # number of training epochs used by model.fit
# --- Input files on the mounted Google Drive ---------------------------------
EMBEDDING_FILE = '/content/drive/MyDrive/glove.840B.300d.txt'
train = pd.read_csv('/content/drive/MyDrive/train.csv.zip')
test = pd.read_csv('/content/drive/MyDrive/test.csv.zip')
# Ground-truth labels for the test set (used only for the final ROC-AUC scoring).
test_y = pd.read_csv("/content/drive/MyDrive/Data/test_labels.csv")

Data preprocessing: tokenization, conversion to integer sequences, and padding

In [None]:
# The comment text is the only model input. Missing comments are replaced with
# the placeholder string "fillna". Series.fillna() returns a NEW Series, so the
# result has to be used; calling it as a bare statement leaves any NaN in place
# and a NaN would later break the tokenizer.
X_train = train["comment_text"].fillna("fillna").str.lower()
X_test = test["comment_text"].fillna("fillna").str.lower()

# One binary target column per toxicity label.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

# NOTE: test_y (the test labels) is already loaded from Google Drive in the
# parameters cell. It is deliberately not re-read here: the previous re-read
# pointed at a Kaggle-only relative path ("../input/...") that does not exist
# in this Colab/Drive setup.

# Build the vocabulary from both train and test text, keeping the
# `max_features` most frequent words when converting to sequences.
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train) + list(X_test))

# Text -> list of integer word ids.
X_train = tok.texts_to_sequences(X_train)
X_test = tok.texts_to_sequences(X_test)

# Bring every sequence to exactly `maxlen` ids so they can be batched.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

Read the GloVe embedding file and create the embedding matrix.

In [None]:
# Parse the embedding text file into {token: vector}.
# Each line is expected to be "<token> <v1> ... <v_embed_size>", space separated.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right at most `embed_size` times: the last `embed_size`
        # fields are the vector and everything before them is the token, so a
        # token that itself contains a space is kept in one piece instead of
        # leaking into the numeric fields.
        values = line.rstrip().rsplit(' ', embed_size)
        if len(values) != embed_size + 1:
            # Header line, blank line or a vector of the wrong dimension:
            # skip it rather than store a vector that cannot fill a matrix row.
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Row i of the matrix holds the vector of the word whose tokenizer id is i.
# Row 0 stays all-zero (tokenizer ids start at 1), as do rows of words that
# have no pre-trained vector.
word_index = tok.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    # Ids at or beyond the matrix height are outside the kept vocabulary.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Split training data into training and validation

In [None]:
# Keep 90% of the padded training sequences for fitting and hold out the rest
# for validation; the fixed random_state makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.9,
    random_state=233,
)

ROC AUC Evaluation for every epoch

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is fed to
            ``model.predict`` and ``y_val`` is compared with the predictions.
        interval: the score is computed when ``epoch % interval == 0``, where
            ``epoch`` is the 0-based index Keras passes to ``on_epoch_end``
            (so ``interval=1`` evaluates after every epoch).

    Raises:
        ValueError: if ``validation_data`` cannot be unpacked into two items.
    """

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__ itself. The previous
        # super(Callback, self) started the lookup *after* Callback in the MRO
        # and therefore skipped the base-class initialisation.
        super().__init__()

        self.interval = interval
        try:
            self.X_val, self.y_val = validation_data
        except ValueError as exc:
            # Same exception type as a bare unpack, but with a usable message.
            raise ValueError(
                "validation_data must be a (X_val, y_val) pair"
            ) from exc

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print ROC-AUC on the held-out data for selected epochs."""
        # `logs` is accepted for API compatibility and is not used here; None
        # replaces the shared mutable `{}` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Reported epoch number is 1-based to match the Keras progress log.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

Model 1: Custom embedding, without GloVe embedding

In [None]:
# Model 1: embedding learned from scratch -> BiLSTM -> Conv1D -> global max
# pooling -> dense head with one sigmoid output per label.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features, embed_size))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)))
model.add(
    tf.keras.layers.Conv1D(
        filters=256,
        kernel_size=2,
        padding='valid',
        kernel_initializer='glorot_uniform',
    )
)
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(6, activation='sigmoid'))

# Six independent binary targets, hence binary cross-entropy.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         30000000  
                                                                 
 spatial_dropout1d (SpatialD  (None, None, 300)        0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        439296    
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, None, 256)         131328    
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                        

Proposed Model: Using Glove Embedding

In [None]:
# Proposed model: frozen GloVe embedding -> BiGRU -> Conv1D -> concatenated
# global average and max pooling -> one sigmoid output per label.
model = None  # drop the reference to any previously built model
sequence_input = Input(shape=(maxlen, ))
# The embedding's vocabulary size is taken from the matrix itself so the layer
# and its initial weights always agree, even when the tokenizer found fewer
# than `max_features` words. The vectors are kept fixed (trainable=False).
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
# Summarise the convolved sequence two ways and use both summaries.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
preds = Dense(6, activation="sigmoid")(x)
model = Model(sequence_input, preds)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)



In [None]:
# Print the layer-by-layer summary of whichever model is currently bound to `model`.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 150)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 150, 300)     30000000    ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 150, 300)    0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 150, 256)     330240      ['spatial_dropout1d[0][0]']  

Training

In [None]:
# Path of the checkpoint file; the prediction cells reload weights from it.
filepath = "best.hdf5"
# Save the model whenever the validation loss improves on its best value so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
# Stop once val_loss has not improved for 5 epochs. With `epochs = 4` in the
# parameters cell this cannot trigger; it only matters if `epochs` is raised.
early = EarlyStopping(monitor="val_loss", mode="auto", patience=5)
# Print the validation ROC-AUC after every epoch.
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
# `checkpoint` has to be in this list: without it nothing is ever written to
# `filepath`, and the later model.load_weights(filepath) either fails or picks
# up a stale file left behind by an earlier run or a different model.
callbacks_list = [ra_val, checkpoint, early]

Please note that only one model is trained at a time: we train either Model 1 or Model 2 and then predict on the test data with that same model, which is why the same variable names appear for both models and their predictions. When re-running the notebook, compile one of the models, train it and run its predictions before compiling the other model.

In [None]:
# Train the currently compiled model; validation metrics are computed on the
# held-out split after each epoch and the callbacks run alongside.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
)

Epoch 1/4
 ROC-AUC - epoch: 1 - score: 0.975653
Epoch 2/4
 ROC-AUC - epoch: 2 - score: 0.981289
Epoch 3/4
 ROC-AUC - epoch: 3 - score: 0.982110
Epoch 4/4
 ROC-AUC - epoch: 4 - score: 0.980355


Model 1 Prediction

In [None]:
# Load the weights stored at `filepath` (the path given to the ModelCheckpoint
# callback) into the current model, then score the padded test sequences.
model.load_weights(filepath)
print('Predicting....')
# y_pred holds the model output for every row of x_test.
y_pred = model.predict(x_test,batch_size=1024,verbose=1)

Predicting....


Model 2 Prediction

In [None]:
# Load the weights stored at `filepath` (the path given to the ModelCheckpoint
# callback) into the current model, then score the padded test sequences.
model.load_weights(filepath)
# y_pred holds the model output for every row of x_test.
y_pred = model.predict(x_test,batch_size=1024,verbose=1)



According to the data source, some samples in the test data are not relevant for evaluation. Those samples have the value -1 for all labels, so we filter them out and test on the remaining data.

Model 1: ROC AUC SCORE

In [None]:
labels = ["toxic", "severe_toxic", "obscene",
               "threat", "insult", "identity_hate"]
# Rows whose label is -1 are excluded from scoring; compute that mask once.
scored_rows = test_y['toxic'] != -1
# Predictions laid out like test_y: an id column followed by one column per label.
predict_df = pd.DataFrame({'id': test_y['id']})
predict_df[labels] = y_pred
# Weighted-average ROC-AUC over the six labels (cell output).
roc_auc_score(
    test_y.loc[scored_rows, labels],
    predict_df.loc[scored_rows, labels],
    average='weighted',
)

0.9653701634577642

Model 2: ROC AUC SCORE

In [None]:
labels = ["toxic", "severe_toxic", "obscene",
               "threat", "insult", "identity_hate"]
# Only rows with real labels (anything other than -1) take part in the score.
is_scored = test_y['toxic'] != -1
# Same layout as test_y: ids first, then one prediction column per label.
predict_df = pd.DataFrame({'id': test_y['id']})
predict_df[labels] = y_pred
y_true_scored = test_y.loc[is_scored, labels]
y_pred_scored = predict_df.loc[is_scored, labels]
# Weighted-average ROC-AUC over the six labels (cell output).
roc_auc_score(y_true_scored, y_pred_scored, average='weighted')

0.9768066140489071


Please ignore the code cells below; they were only used to convert the notebook to a PDF file.




In [None]:
# Download the colab_pdf helper script into the working directory
# (-nc: do not download again if the file is already there).
# NOTE(review): this fetches code from the repository's master branch, so the
# script that gets executed is not pinned to a fixed revision.
!wget -nc https://raw.githubusercontent.com/brpy/colab-pdf/master/colab_pdf.py

In [None]:
# Export this notebook to PDF using the helper script downloaded in the previous cell.
from colab_pdf import colab_pdf
colab_pdf('MLProject_DeepLearningModels_BiLSTMCNN.ipynb')





Extracting templates from packages: 100%
[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/MLProject_DeepLearningModels_BiLSTMCNN.ipynb to pdf
[NbConvertApp] Writing 51740 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 47681 bytes to /content/drive/My Drive/MLProject_DeepLearningModels_BiLSTMCNN.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'File ready to be Downloaded and Saved to Drive'