In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
data=pd.read_csv('train.csv')
data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
data.toxic.value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [4]:
data.severe_toxic.value_counts()

0    157976
1      1595
Name: severe_toxic, dtype: int64

In [5]:
data.obscene.value_counts()

0    151122
1      8449
Name: obscene, dtype: int64

In [6]:
data.insult.value_counts()

0    151694
1      7877
Name: insult, dtype: int64

In [7]:
data.identity_hate.value_counts()

0    158166
1      1405
Name: identity_hate, dtype: int64

In [8]:
data['comment_text'][10]

'"\nFair use rationale for Image:Wonju.jpg\n\nThanks for uploading Image:Wonju.jpg. I notice the image page specifies that the image is being used under fair use but there is no explanation or rationale as to why its use in Wikipedia articles constitutes fair use. In addition to the boilerplate fair use template, you must also write out on the image description page a specific explanation or rationale for why using this image in each article is consistent with fair use.\n\nPlease go to the image description page and edit it to include a fair use rationale.\n\nIf you have uploaded other fair use media, consider checking that you have specified the fair use rationale on those pages too. You can find a list of \'image\' pages you have edited by clicking on the ""my contributions"" link (it is located at the very top of any Wikipedia page when you are logged in), and then selecting ""Image"" from the dropdown box. Note that any fair use images uploaded after 4 May, 2006, and lacking such a

In [9]:
 data.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [10]:
data.shape

(159571, 8)

In [11]:
data.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

## Preprocessing

In [12]:
from tensorflow.keras.layers import TextVectorization

In [13]:
x=data['comment_text']
y=data[data.columns[2:]].values

In [14]:
x

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [15]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [16]:
max_features=200000

In [17]:
vectorizer=TextVectorization(max_tokens=max_features,
                            output_sequence_length=1800,
                            output_mode='int')

In [18]:
vectorizer.get_vocabulary()

['', '[UNK]']

In [19]:
vectorizer.adapt(x.values)

In [20]:
vectorizer("have you watched breaking bad")[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([  19,    7, 3666, 2891,  338], dtype=int64)>

In [21]:
vectorized_text=vectorizer(x.values)

In [22]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [23]:
dataset=tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset=dataset.cache()
dataset=dataset.shuffle(160000)
dataset=dataset.batch(16)
dataset=dataset.prefetch(8)

In [24]:
159571/16

9973.1875

In [25]:
batch_x, batch_y = dataset.as_numpy_iterator().next()

In [26]:
batch_x.shape

(16, 1800)

In [27]:
batch_y.shape

(16, 6)

In [28]:
len(dataset)

9974

In [29]:
int(len(dataset)*.7)

6981

In [30]:
train=dataset.take(int(len(dataset)*.7))
val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [31]:
len(train),len(val),len(test)

(6981, 1994, 997)

In [32]:
train_generator=train.as_numpy_iterator()

In [33]:
train_generator.next()

(array([[    2,  1180,   121, ...,     0,     0,     0],
        [    1,     2,  6695, ...,     0,     0,     0],
        [ 9785,     2,   394, ...,     0,     0,     0],
        ...,
        [  607,   206,  1419, ...,     0,     0,     0],
        [  274, 14667,    31, ...,     0,     0,     0],
        [  451, 20849,  5299, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [35]:
model=Sequential()

In [36]:
model.add(Embedding(max_features+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [37]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
model.summary()

In [39]:
history=model.fit(train, epochs=3, validation_data=val)

Epoch 1/3
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5106s[0m 730ms/step - accuracy: 0.9561 - loss: 0.0839 - val_accuracy: 0.9945 - val_loss: 0.0492
Epoch 2/3
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5093s[0m 730ms/step - accuracy: 0.9931 - loss: 0.0466 - val_accuracy: 0.9943 - val_loss: 0.0405
Epoch 3/3
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5108s[0m 732ms/step - accuracy: 0.9928 - loss: 0.0409 - val_accuracy: 0.9947 - val_loss: 0.0350


In [40]:
model.evaluate(test)

[1m997/997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 125ms/step - accuracy: 0.9950 - loss: 0.0345


[0.03380872681736946, 0.9952983856201172]

In [41]:
x_batch, y_batch = test.as_numpy_iterator().next()

In [42]:
(model.predict(x_batch) > 0.5).astype(int)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [43]:
y_batch

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [44]:
input_text=vectorizer('I hate you so much, that i cant stand you')

In [45]:
input_text[:7]

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([  8, 363,   7,  37, 129,  10,   8], dtype=int64)>

In [46]:
batch=test.as_numpy_iterator().next()

In [47]:
res=model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


In [48]:
data.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [49]:
res

array([[0.58440024, 0.00178293, 0.04463793, 0.01276273, 0.11293302,
        0.03650785]], dtype=float32)

## Test and Gradio

In [50]:
import gradio as gr

In [51]:
input_str=vectorizer('I love you')

In [52]:
res=model.predict(np.expand_dims(input_str,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step


In [53]:
res

array([[6.7226656e-02, 6.3627856e-05, 7.0677325e-03, 2.2383926e-03,
        9.4481129e-03, 7.5157871e-03]], dtype=float32)

In [62]:
data.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [67]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results=model.predict(vectorized_comment)
    
    text=''
    for idx, col in enumerate(data.columns[2:]):
        text+= '{}: {}\n'.format(col, results[0][idx]>0.5)
        
    return text

In [68]:
interface = gr.Interface(fn=score_comment, 
                         inputs=[
                             gr.Textbox(label="Enter Text", placeholder="Type here...", lines=3)
                         ], 
                         outputs="text",
                         title="<div style='text-align: center; font-size: 62px;'><b>Toxic Comment Analyzer</b></div>", 
                         description="<div style='font-size: 22px;'><b>Enter the text to find whether the comment is toxic or not</b>",
                         css=".gradio-interface { background-color: #f0f0f0 !important; }"
                         )

In [69]:
interface.launch(share=True, auth=('kartik','12345678'),auth_message="Enter the Username and Password")

Running on local URL:  http://127.0.0.1:7863

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step


In [58]:
import pickle
import joblib

In [59]:
#This are some common model while saving the model
#AttributeError: Can't pickle local object 'Layer._initialize_tracker.<locals>.<lambda>'
#PicklingError: Can't pickle <function Layer._initialize_tracker.<locals>.<lambda> at 0x000002B1368B6480>: it's not found as keras.src.layers.layer.Layer._initialize_tracker.<locals>.<lambda>

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step


In [61]:
model.save('toxic.keras')