<a href="https://colab.research.google.com/github/HussamSelim/NLP_JIGSAW-Using-RNN_LSTM_GRU/blob/main/notebook5f1d681345.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
# from tensorflow.keras import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
# Read the three CSV files of the Jigsaw multilingual toxic-comment dataset
# from the Kaggle input directory.
train = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'
)
validation = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv'
)
test = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv'
)

In [None]:
# Remove the fine-grained label columns in place; the split further down uses
# only `toxic` as the target.
train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    inplace=True,
)

In [None]:
# Restrict the training frame to a subset of rows.
# NOTE: `.loc` slicing is label-based and inclusive, so the row labelled 12000
# is kept as well.
train = train.loc[:12000]

In [None]:
# Length (in whitespace-separated tokens) of the longest comment in the subset.
train['comment_text'].map(lambda comment: len(str(comment).split())).max()

In [None]:
def roc_auc(predictions, target):
    """Return the area under the ROC curve.

    `predictions` holds the scores to evaluate and `target` the reference
    labels. The ROC curve is obtained from `metrics.roc_curve(target,
    predictions)` and integrated with `metrics.auc`.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [None]:
# Hold out 30% of the comments for validation, stratified on the `toxic`
# label, with a fixed seed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.3,
    stratify=train['toxic'].values,
    random_state=42,
)

In [None]:
# Tokenizer without a vocabulary-size cap (num_words=None).
token = text.Tokenizer(num_words=None)
# Fixed sequence length that every comment is padded/truncated to.
max_len = 1500

# The vocabulary is fitted on the training and validation texts together.
token.fit_on_texts([*X_train, *X_valid])

# Training split: text -> integer sequences -> fixed-length array.
x_train_seq = token.texts_to_sequences(X_train)
x_train_padded = sequence.pad_sequences(x_train_seq, maxlen=max_len)

# Validation split: same transformation.
x_valid_seq = token.texts_to_sequences(X_valid)
x_valid_padded = sequence.pad_sequences(x_valid_seq, maxlen=max_len)


In [None]:
# Word -> integer index mapping built by the fitted tokenizer.
word_index = token.word_index

In [None]:
# Baseline network: 300-dimensional embeddings learned from scratch, a
# SimpleRNN with 100 units, and a single sigmoid unit for the binary target.
model1 = Sequential([
    Embedding(len(word_index) + 1, 300, input_length=max_len),
    SimpleRNN(100),
    Dense(1, activation='sigmoid'),
])
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model1.summary()

In [None]:
# Checkpointing for model1: the callback writes weights-only checkpoints to
# `checkpoint_path` and logs each save (verbose=1).
# `os` is needed here for the directory name and must be imported at the top
# of the notebook.
checkpoint_path = "./training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)


In [None]:
# Train the SimpleRNN baseline for 5 epochs, evaluating on the hold-out split
# during training and checkpointing through `cp_callback`.
model1.fit(
    x_train_padded,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(x_valid_padded, y_valid),
    callbacks=[cp_callback],
)

In [None]:
# Score the hold-out split with the SimpleRNN and report its ROC AUC.
scores = model1.predict(x_valid_padded)
# roc_auc() returns a fraction in [0, 1]; scale it by 100 so that the "%"
# emitted by the format string is truthful (e.g. "96.00%" rather than "0.96%").
print("Auc: %.2f%%" % (100 * roc_auc(scores, y_valid)))

In [None]:
# Running list of per-model results; each record has a 'Model' name and its
# 'AUC_Score' on the hold-out split. Starts with the SimpleRNN baseline.
scores_model = [{'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, y_valid)}]

## Let's use pre-trained GloVe word embeddings instead of initializing them randomly

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
embeddings_index = {}
# The context manager guarantees the file handle is closed even when a line
# fails to parse part-way through this very large file.
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is the word followed by its coefficients, split on a
        # literal single space.
        values = line.split(" ")
        word = values[0]
        coeff = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coeff

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Build the embedding matrix for the words present in the dataset: row i holds
# the GloVe vector of the word whose tokenizer index is i. Words that have no
# GloVe vector keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 300))

for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector
        

In [None]:
# LSTM network on top of frozen GloVe embeddings: the embedding layer is
# initialised from `embedding_matrix` and excluded from training; the LSTM
# applies dropout on both its inputs and its recurrent connections.
model2 = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    LSTM(80, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid'),
])
model2.compile(
    optimizer='RMSprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model2.summary()

In [None]:
# Checkpointing for model2: weights-only checkpoints written to
# `checkpoint_path`, with each save logged (verbose=1).
# `os` must be imported at the top of the notebook for the directory name.
checkpoint_path = "./training_2/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback2 = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# Train the LSTM for 5 epochs. The hold-out split is passed as validation
# data, exactly as in the SimpleRNN run, so that validation loss/accuracy are
# reported per epoch and over-fitting can actually be observed.
model2.fit(
    x_train_padded,
    y_train,
    epochs=5,
    batch_size=128,
    validation_data=(x_valid_padded, y_valid),
    callbacks=[cp_callback2],
)


In [None]:
# Score the hold-out split with the LSTM and compute its ROC AUC once.
scores = model2.predict(x_valid_padded)
lstm_auc = roc_auc(scores, y_valid)
# roc_auc() returns a fraction in [0, 1]; scale it by 100 so that the "%"
# emitted by the format string is truthful.
print("Auc: %.2f%%" % (100 * lstm_auc))

# Record the unscaled AUC alongside the other models' results.
scores_model.append({'Model': 'LSTM', 'AUC_Score': lstm_auc})

We now see that the model is not overfitting and achieves an AUC score of 0.96, which is quite commendable; we have also closed the gap between accuracy and AUC. In this case we used dropout, which prevented the model from overfitting the data.

In [None]:
# GRU network on top of frozen GloVe embeddings: spatial dropout is applied to
# the embedded sequence before a GRU with 120 units and a sigmoid output.
model3 = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    GRU(120),
    Dense(1, activation='sigmoid'),
])
model3.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model3.summary()

In [None]:
# Checkpointing for model3: weights-only checkpoints written to
# `checkpoint_path`, with each save logged (verbose=1).
# `os` must be imported at the top of the notebook for the directory name.
checkpoint_path = "./training_3/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback3 = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# Train the GRU for 5 epochs. The hold-out split is passed as validation data,
# exactly as in the SimpleRNN run, so that validation loss/accuracy are
# reported per epoch.
model3.fit(
    x_train_padded,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(x_valid_padded, y_valid),
    callbacks=[cp_callback3],
)


In [None]:
# Score the hold-out split with the GRU and compute its ROC AUC once.
scores = model3.predict(x_valid_padded)
gru_auc = roc_auc(scores, y_valid)
# roc_auc() returns a fraction in [0, 1]; scale it by 100 so that the "%"
# emitted by the format string is truthful.
print("Auc: %.2f%%" % (100 * gru_auc))

# Record the unscaled AUC alongside the other models' results.
scores_model.append({'Model': 'GRU', 'AUC_Score': gru_auc})

In [None]:
# Tabulate the AUC of every model trained above, best first, and render the
# table with a blue colour gradient.
results = pd.DataFrame(scores_model)
results = results.sort_values(by='AUC_Score', ascending=False)
results.style.background_gradient(cmap='Blues')

In [None]:
# Funnel-area chart comparing the models by their AUC score. The title names
# what is actually plotted (AUC per model), not a sentiment distribution.
fig = go.Figure(go.Funnelarea(
    text=results.Model,
    values=results.AUC_Score,
    title={"position": "top center", "text": "Funnel-Chart of AUC Score by Model"},
))
fig.show()