Importing Dependencies

In [1]:
import os 
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
# Load the training split. Note the path ends in train.csv/train.csv: the CSV
# file sits inside a directory that is itself named "train.csv".
dataframe = pd.read_csv(
    os.path.join(
        'jigsaw-toxic-comment-classification-challenge',
        'train.csv',
        'train.csv',
    )
)

In [3]:
# Display the loaded frame (the notebook renders a truncated preview).
dataframe

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
# Columns from position 2 onward are the six toxicity labels.
dataframe.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

Preprocessing

In [5]:
from tensorflow.keras.layers import TextVectorization

In [6]:
# Model inputs: the raw comment strings.
x = dataframe['comment_text']
# Targets: every column from position 2 onward (the label columns), as a
# NumPy array with one row per comment.
y = dataframe.iloc[:, 2:].values

In [19]:
# Preview the raw comment text series.
x

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [10]:
# Preview the label matrix (one row per comment, one column per label).
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [11]:
# Upper bound on the vocabulary size used by the text vectorizer.
MAX_FEATURES = 200_000

In [12]:
# Text -> integer-token-id layer. Every comment is padded or truncated to
# 1800 tokens, and the vocabulary is capped at MAX_FEATURES entries.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [13]:
# Fit the vectorizer's vocabulary on the full set of training comments.
vectorizer.adapt(x.values)

In [14]:
# Vectorize the whole corpus in one call. The result is a dense
# (159571, 1800) int64 tensor, i.e. roughly 2.3 GB held in memory at once.
vectorized_text = vectorizer(x.values)

In [17]:
# Inspect the vectorized corpus: integer token ids with trailing zeros.
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [None]:
# tf.data pipeline mnemonic "MCSHBAP": map, cache, shuffle, batch, prefetch

In [None]:
# Build the input pipeline from the pre-vectorized comments and their labels.
#
# reshuffle_each_iteration=False pins the shuffled order. The train/val/test
# partitions are carved out of this dataset with take()/skip(); with the
# default (reshuffle on every pass) each epoch and each evaluation would draw
# a different ordering, so examples would drift between partitions and
# validation/test data would leak into training.
# Trade-off: the training batches are no longer reshuffled between epochs.
dataset = (
    tf.data.Dataset.from_tensor_slices((vectorized_text, y))
    .cache()
    .shuffle(160000, reshuffle_each_iteration=False)
    .batch(16)      # 16 comments per batch
    .prefetch(8)    # keep up to 8 batches ready ahead of the consumer
)

In [None]:
# Pull the first batch out of the pipeline as NumPy arrays (inputs, labels).
batch_x, batch_y = next(dataset.as_numpy_iterator())

In [None]:
# Split the batched dataset 70% / 20% / 10% into train / validation / test.
# Sizes are counted in batches, not individual comments.
n_batches = len(dataset)
n_train = int(n_batches * 0.7)
n_val = int(n_batches * 0.2)
n_test = int(n_batches * 0.1)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
# The test partition must SKIP everything already handed to train and val.
# The previous take(...).take(...) returned the first 10% of batches, i.e.
# batches that are also in the training partition.
test = dataset.skip(n_train + n_val).take(n_test)

Checking the size of partitions

In [None]:
# Report how many batches ended up in each partition.
print(
    f'Length of train is:{len(train)}, '
    f'Length of validation is:{len(val)}, '
    f'Length of test is:{len(test)}'
)

In [None]:
# Peek at one training batch as an (inputs, labels) pair of NumPy arrays.
next(train.as_numpy_iterator())

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
# Bidirectional-LSTM classifier over token ids with six sigmoid outputs.
model = Sequential([
    # One 32-dim vector per token id; MAX_FEATURES+1 rows in the lookup table.
    Embedding(MAX_FEATURES+1, 32),
    # Read each sequence in both directions and emit one summary vector.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Six independent sigmoid units, matching the six columns of y.
    Dense(6, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the sigmoid outputs; Adam with its defaults.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 10 epochs, evaluating on the validation partition after each one.
# The returned History object keeps the per-epoch loss curves.
history = model.fit(
    train,
    validation_data=val,
    epochs=10,
)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot training vs. validation loss per epoch.
for curve_name, curve_color in (('loss', 'blue'), ('val_loss', 'red')):
    plt.plot(history.history[curve_name], color=curve_color, label=curve_name)
plt.title('Loss')
plt.legend()
plt.show()

In [None]:
# Vectorize a single hand-written comment to sanity-check the trained model.
input_text = vectorizer("You don't suck at all")

In [None]:
# Add a leading batch axis before predicting.
# NOTE(review): presumably the vectorizer returned a single un-batched
# sequence for the bare string above - confirm.
res = model.predict(np.expand_dims(input_text, 0))

In [None]:
# `labels` holds every column name of the frame; only the slice from
# position 2 onward names the label columns the targets were built from.
labels = dataframe.columns
labels[2:]

In [None]:
# Grab one batch from the test partition as NumPy arrays.
batch = next(test.as_numpy_iterator())

In [None]:
# Unpack the sampled test batch into inputs and labels.
batch_x, batch_y = batch

In [None]:
# Ground-truth labels for the sampled batch.
batch_y

In [None]:
# Threshold the sigmoid outputs at 0.5 to get 0/1 predictions for the batch.
(model.predict(batch_x) > 0.5).astype(int)

In [None]:
# Thresholded 0/1 prediction for the hand-written comment scored earlier.
(res > 0.5).astype(int)

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics, updated batch by batch during evaluation.
precision = Precision()
recall = Recall()
# BinaryAccuracy instead of CategoricalAccuracy: the evaluation feeds
# flattened multi-label 0/1 targets and sigmoid probabilities. Categorical
# accuracy compares argmax positions (a one-hot, single-label notion) and
# gives a meaningless number here; binary accuracy thresholds each
# probability at 0.5 and compares element-wise.
accuracy = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate the metrics over every batch of the test partition.
for x_true, y_true in test.as_numpy_iterator():
    # Flatten predictions and targets so that each (comment, label) pair is
    # scored as one independent binary decision.
    yhat = model.predict(x_true).flatten()
    y_true = y_true.flatten()

    for metric in (precision, recall, accuracy):
        metric.update_state(y_true, yhat)

In [None]:
# Report the metrics accumulated over the test partition.
# (Label fixed: "Precision" was missing its colon and the line ended with a
# stray trailing comma.)
print(f'Precision:{precision.result().numpy()}, Recall:{recall.result().numpy()}, Accuracy:{accuracy.result().numpy()}')

In [None]:
import gradio as gr

In [None]:
# Persist the trained model to disk as toxicity.h5.
model.save('toxicity.h5')

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# Reload the saved model from disk under a separate name.
toxicity = load_model('toxicity.h5')

In [None]:
# Print the reloaded model's architecture.
toxicity.summary()

In [None]:
# Vectorize a second hand-written comment for a spot check.
input_str = vectorizer('You freaking suck at this game')

In [None]:
# Score the sample with the model that was just reloaded from disk, so this
# cell actually exercises the save/load round trip. Predicting with the
# in-memory `model` left the reloaded `toxicity` model unused.
res = toxicity.predict(np.expand_dims(input_str, 0))

In [None]:
# Threshold the latest prediction at 0.5 to get 0/1 labels.
(res > 0.5).astype(int)

In [None]:
def score_comment(comment):
    """Score one comment and return a per-label True/False report.

    The comment is vectorized as a one-element batch with the module-level
    `vectorizer` and scored with the module-level `model`.

    Args:
        comment: The raw comment text to classify.

    Returns:
        A string with one "<label>: <bool>" line per label column (the
        columns of `dataframe` from position 2 onward). Each line ends with
        a newline, and the flag is True when the predicted score is above 0.5.
    """
    scores = model.predict(vectorizer([comment]))[0]
    label_names = dataframe.columns[2:]
    return ''.join(
        '{}: {}\n'.format(name, scores[position] > 0.5)
        for position, name in enumerate(label_names)
    )

In [None]:
# Minimal Gradio UI: a two-line text box in, the score_comment report out.
# NOTE(review): the `gr.inputs` namespace is deprecated in newer Gradio
# releases in favour of top-level components such as `gr.Textbox`; confirm
# against the installed version before upgrading.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
    outputs='text',
)

In [None]:
# Start the app. share=True requests a shareable link from Gradio.
# NOTE(review): anyone holding that link can reach the model - confirm this
# exposure is intended.
interface.launch(share=True)

Saving the vectorizer model

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# Wrap the fitted vectorizer in a tiny model that accepts one raw string per
# example, so the layer can be saved and reloaded like any other model.
textvect = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorizer,
])

In [None]:
# Save the wrapper model (and with it the adapted vectorizer) to disk.
textvect.save('vectorizer.tf')

In [None]:
# Reload the saved wrapper model and pull out its first layer.
# NOTE(review): layers[0] is expected to be the TextVectorization layer -
# confirm that the Input entry is not counted in `.layers`.
loaded_vect_model = load_model('vectorizer.tf')
vect_model = loaded_vect_model.layers[0]

In [None]:
# Vectorize the corpus with the reloaded layer; compare by eye with the
# original vectorizer's output printed in the following cell.
print(vect_model(x.values))

In [None]:
# Reference output from the original in-memory vectorizer.
print(vectorizer(x.values))