In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

  shapely_geos_version, geos_capi_version_string


In [2]:
# Read the competition's training, validation and test CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv')
validation = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')

I am going to treat this problem as a binary classification task (toxic vs. non-toxic) rather than a multi-label classification task, so only the `toxic` label is kept.

In [3]:
# Keep only the binary `toxic` label by removing the other label columns.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the cell
# safe to re-run: a second execution no longer raises KeyError for already-removed columns.
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)

In [4]:
# Display the column labels of the training frame.
train.columns

Index(['id', 'comment_text', 'toxic'], dtype='object')

In [5]:
# Use a 5000-row random subsample of the training data.
# random_state pins the draw so that a fresh "Restart & Run All" selects the same
# rows and every downstream score is reproducible.
train = train.sample(5000, random_state=42)
train.shape

(5000, 3)

Checking the maximum length (in whitespace-separated words) of the comment texts

In [6]:
# Largest number of whitespace-separated tokens found in any single comment.
train['comment_text'].astype(str).str.split().str.len().max()

1403

Define a helper function that computes the AUC score used for validation

In [7]:
def roc_auc(predictions,target):
    """
    Compute the area under the ROC curve.

    predictions: scores produced by the model.
    target: ground-truth labels that the scores are compared against.

    Returns the AUC obtained from the ROC curve of `target` versus `predictions`.
    """
    # The thresholds returned by roc_curve are not needed for the area.
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_positive_rate, true_positive_rate)

### Data Preparation

In [8]:
# Hold out 25% of the sampled comments for validation.
# The split is shuffled, seeded, and stratified on the `toxic` label.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=train['toxic'].values,
)

In [9]:
# Tokenise the comments with the Keras tokenizer and pad them to a fixed length.
max_len = 1500  # fixed sequence length used when padding
token = text.Tokenizer(num_words=None)

# The vocabulary is fitted on the training and validation texts together.
token.fit_on_texts(list(xtrain) + list(xvalid))

# Convert each text into a sequence of integer word ids.
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(split_texts) for split_texts in (xtrain, xvalid)
)

# Zero-pad the sequences to max_len.
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(split_seq, maxlen=max_len)
    for split_seq in (xtrain_seq, xvalid_seq)
)

# Word -> integer id mapping learned by the tokenizer.
word_index = token.word_index

In [10]:

# Baseline: a SimpleRNN on top of embeddings learned from scratch (no pretrained
# vectors), followed by a single sigmoid output unit.
model = Sequential([
    Embedding(len(word_index) + 1, 300, input_length=max_len),
    SimpleRNN(50),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1500, 300)         8022900   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 50)                17550     
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 8,040,501
Trainable params: 8,040,501
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
model.fit(
    x=xtrain_pad,
    y=ytrain,
    batch_size=128,
    epochs=10,
    validation_data=(xvalid_pad, yvalid),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7dd5ec3dafd0>

In [13]:
# Score the validation comments with the trained model and report the ROC AUC.
# The printed label previously read "rouc_auc_score" (typo); it now matches the
# label used by the other evaluation cells.
scores = model.predict(xvalid_pad)
print(f'roc_auc_score: {roc_auc(scores,yvalid)}')

rouc_auc_score: ,0.777860539735289


In [14]:
# Start the list of per-model results with the SimpleRNN validation AUC.
scores_model = [
    dict(Model='SimpleRNN', AUC_Score=roc_auc(scores, yvalid)),
]

# Word Embeddings



In [15]:
# Load the GloVe vectors into a dictionary mapping each word to its numpy vector.

embeddings_index = {}
# The context manager guarantees the file is closed even if a malformed line
# raises while it is being parsed (the manual open()/close() pair leaked the handle).
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt','r',encoding='utf-8') as f:
    for line in tqdm(f):
        # NOTE(review): splitting on a single space (not bare split()) looks deliberate —
        # presumably some tokens contain other whitespace characters; confirm before changing.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196018it [05:08, 7115.55it/s]

Found 2196017 word vectors.





In [16]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

100%|██████████| 26742/26742 [00:00<00:00, 232462.96it/s]


In [17]:

    
# An LSTM on top of pretrained GloVe embeddings, followed by one dense output layer.
model = Sequential([
    # trainable=False keeps the pretrained GloVe weights frozen during training.
    Embedding(len(word_index) + 1, 300, weights=[embedding_matrix],
              input_length=max_len, trainable=False),
    LSTM(50, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1500, 300)         8022900   
_________________________________________________________________
lstm (LSTM)                  (None, 50)                70200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 8,093,151
Trainable params: 70,251
Non-trainable params: 8,022,900
_________________________________________________________________


In [18]:
# Train for 3 epochs, evaluating on the held-out split after each epoch.
model.fit(
    x=xtrain_pad,
    y=ytrain,
    batch_size=128,
    epochs=3,
    validation_data=(xvalid_pad, yvalid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7dd199eb4fd0>

In [20]:
# Score the validation comments with the trained model and report the ROC AUC.
# The label now uses ": " as separator (it was a bare comma) so that it is
# formatted like the other evaluation cells in this notebook.
scores = model.predict(xvalid_pad)
print(f'roc_auc_score: {roc_auc(scores,yvalid)}')

roc_auc_score,0.9460551194637025


In [21]:
# Record the LSTM validation AUC alongside the earlier results.
scores_model.append(dict(Model='LSTM', AUC_Score=roc_auc(scores, yvalid)))

Using GRU

In [22]:

# GRU with GloVe embeddings, spatial dropout and one dense layer
model = Sequential([
    # Frozen pretrained embedding weights (trainable=False).
    Embedding(len(word_index) + 1, 300, weights=[embedding_matrix],
              input_length=max_len, trainable=False),
    SpatialDropout1D(0.25),
    GRU(50),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1500, 300)         8022900   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 1500, 300)         0         
_________________________________________________________________
gru (GRU)                    (None, 50)                52650     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 8,075,601
Trainable params: 52,701
Non-trainable params: 8,022,900
_________________________________________________________________


In [23]:
# Train for 3 epochs, evaluating on the held-out split after each epoch.
model.fit(
    x=xtrain_pad,
    y=ytrain,
    batch_size=64,
    epochs=3,
    validation_data=(xvalid_pad, yvalid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7dd1bee16610>

In [24]:
# Score the validation comments with the trained model and report the ROC AUC.
scores = model.predict(x=xvalid_pad)
print(f'roc_auc_score: {roc_auc(predictions=scores, target=yvalid)}')

roc_auc_score: 0.9644044003896178


In [25]:
# Record the GRU validation AUC alongside the earlier results.
scores_model.append(dict(Model='GRU', AUC_Score=roc_auc(scores, yvalid)))

In [26]:
# Display the AUC scores collected so far.
scores_model

[{'Model': 'SimpleRNN', 'AUC_Score': 0.777860539735289},
 {'Model': 'LSTM', 'AUC_Score': 0.9460551194637025},
 {'Model': 'GRU', 'AUC_Score': 0.9644044003896178}]

# Using Bi-Directional RNN's



In [27]:

# A bidirectional LSTM on top of pretrained GloVe embeddings, followed by one dense output layer.
model = Sequential([
    # Frozen pretrained embedding weights (trainable=False).
    Embedding(len(word_index) + 1, 300, weights=[embedding_matrix],
              input_length=max_len, trainable=False),
    Bidirectional(LSTM(50, dropout=0.3, recurrent_dropout=0.3)),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1500, 300)         8022900   
_________________________________________________________________
bidirectional (Bidirectional (None, 100)               140400    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 8,163,401
Trainable params: 140,501
Non-trainable params: 8,022,900
_________________________________________________________________


In [28]:
# Train for 3 epochs, evaluating on the held-out split after each epoch.
model.fit(
    x=xtrain_pad,
    y=ytrain,
    batch_size=64,
    epochs=3,
    validation_data=(xvalid_pad, yvalid),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7dd5d4c0ba90>

In [30]:
# Score the validation comments with the trained model and report the ROC AUC.
scores = model.predict(x=xvalid_pad)
print(f'roc_auc_score: {roc_auc(predictions=scores, target=yvalid)}')

roc_auc_score: 0.9675557210794705


In [31]:
# Record the bidirectional LSTM validation AUC alongside the earlier results.
scores_model.append(dict(Model='Bi-directional LSTM', AUC_Score=roc_auc(scores, yvalid)))

In [32]:
# Display the AUC scores collected so far.
scores_model

[{'Model': 'SimpleRNN', 'AUC_Score': 0.777860539735289},
 {'Model': 'LSTM', 'AUC_Score': 0.9460551194637025},
 {'Model': 'GRU', 'AUC_Score': 0.9644044003896178},
 {'Model': 'Bi-directional LSTM', 'AUC_Score': 0.9675557210794705}]

In [52]:
# Tabulate the collected results: one row per model, with its validation AUC.
# (pandas builds the columns directly from the list of dicts, so no manual
# per-column lists are needed.)
pd.DataFrame(data=scores_model)

Unnamed: 0,Model,AUC_Score
0,SimpleRNN,0.777861
1,LSTM,0.946055
2,GRU,0.964404
3,Bi-directional LSTM,0.967556


Awesome! The Bi-directional LSTM was the best model with an AUC score of 0.967556. Next is the GRU (Gated Recurrent Unit), which scored 0.964404, very close to the bidirectional LSTM. The LSTM also performed well, with an AUC score of 0.946055. 
The SimpleRNN scored lowest with 0.777861; the model is too simple and is usually treated as a naive baseline for more sophisticated models like the LSTM. Note that it was also the only model trained without the pretrained GloVe embeddings, so the comparison is not strictly like-for-like.

The results could probably be improved by training for more epochs, but because of limited time and no GPU being available right now, I didn't train for long. If you are using this notebook, try training for more epochs (e.g. 100) and adding dropout layers to prevent overfitting. Check Kaggle for more interesting problems to solve.

In [51]:
# Rank the models from highest to lowest validation AUC and render the table
# with a blue background gradient.
results = (
    pd.DataFrame(scores_model)
    .sort_values(by='AUC_Score', ascending=False)
)
results.style.background_gradient(cmap='Blues')

Unnamed: 0,Model,AUC_Score
3,Bi-directional LSTM,0.967556
2,GRU,0.964404
1,LSTM,0.946055
0,SimpleRNN,0.777861


In [53]:
# Funnel-area chart of the validation AUC score of each model.
# The title previously read "Sentiment Distribution", which does not describe the
# plotted data (model names and their AUC scores); it now matches the chart content.
fig = go.Figure(go.Funnelarea(
    text=results.Model,
    values=results.AUC_Score,
    title={"position": "top center", "text": "Funnel-Chart of Model AUC Scores"}
    ))
fig.show()