# Importing our dependencies


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/comment-toxicity/data/train.csv/train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
df.iloc[0]

Unnamed: 0,0
id,0000997932d777bf
comment_text,Explanation\nWhy the edits made under my usern...
toxic,0
severe_toxic,0
obscene,0
threat,0
insult,0
identity_hate,0


# 1.Preprocess

In [5]:
from tensorflow.keras.layers import TextVectorization

In [6]:
x = df['comment_text']
y = df[df.columns[2:]].values

In [7]:
MAX_FEATURES = 200000 # number of words in the vocab

In [8]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [9]:
vectorizer.adapt(x.values)



In [10]:
vectorized_text = vectorizer(x.values)

In [11]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [13]:
batch_x , batch_y = dataset.as_numpy_iterator().next()


In [14]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [15]:
train_generator = train.as_numpy_iterator()

In [16]:
train_generator.next()

(array([[   312,      7,    708, ...,      0,      0,      0],
        [  1436,    956,    204, ...,      0,      0,      0],
        [  1934,    517,     70, ...,      0,      0,      0],
        ...,
        [   451,    126, 130375, ...,      0,      0,      0],
        [     8,     69,     10, ...,      0,      0,      0],
        [  4008, 163388,  17542, ...,      0,      0,      0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

# creating the mode

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense , Embedding

In [25]:
model = Sequential()
# Bidirectional LSTM Layer
model.add(Embedding(MAX_FEATURES+1 , 32))

model.add(Bidirectional(LSTM(32, activation='tanh', return_sequences=False)))

model.add(Dense(128 , activation='relu'))

model.add(Dense(256 , activation='relu'))


model.add(Dense(128 , activation='relu'))

model.add(Dense(6, activation='sigmoid'))

In [26]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [27]:
model.summary()

In [28]:
history = model.fit(train, epochs=1, validation_data=val)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 105ms/step - loss: 0.0827 - val_loss: 0.0481


# Make prediction

In [58]:
input_text = vectorizer('I hate you')

In [59]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [72]:
batch = test.as_numpy_iterator().next()

In [73]:
batch_x , batch_y = test.as_numpy_iterator().next()

In [74]:
(model.predict(batch_x) > 0.5).astype("int")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step


array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

# evaluate the model

In [75]:
from tensorflow.keras.metrics import Precision , Recall , CategoricalAccuracy

In [77]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():
  x_true , y_true = batch
  yhat = model.predict(x_true)

  y_true = y_true.flatten()
  yhat = yhat.flatten()

  pre.update_state(y_true , yhat)
  re.update_state(y_true , yhat)
  acc.update_state(y_true , yhat)

In [79]:
print(f'Precision :{pre.result().numpy():.2f}')
print(f'Recall :{re.result().numpy():.2f}')
print(f'Accuracy :{acc.result().numpy():.2f}')

Precision :0.78
Recall :0.72
Accuracy :0.49


In [None]:
!pip install gradio jinja2

In [82]:
model.save('toxicity.h5')



In [84]:
def score_comment(comment):
  vectorized_comment = vectorizer([comment])
  results = model.predict(vectorized_comment)

  text = ''
  for idx, col in enumerate(df.columns[2:]):
    text += '{}: {}\n'.format(col, results[0][idx]>0.5)

  return text

In [99]:
score_comment("I love you")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step


'toxic: False\nsevere_toxic: False\nobscene: False\nthreat: False\ninsult: False\nidentity_hate: False\n'

In [100]:
score_comment("I hate you")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step


'toxic: True\nsevere_toxic: False\nobscene: False\nthreat: False\ninsult: False\nidentity_hate: False\n'

In [104]:
score_comment("i will come and kill you")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step


'toxic: True\nsevere_toxic: False\nobscene: True\nthreat: False\ninsult: True\nidentity_hate: False\n'