# Install Dependencies and Get Data

In [1]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

Collecting tensorflow
  Downloading tensorflow-2.8.0-cp39-cp39-win_amd64.whl (438.0 MB)
Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.8.0-cp39-cp39-win_amd64.whl (438.0 MB)
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting tensorboard<2.9,>=2.8
  Using cached tensorboard-2.8.0-py3-none-any.whl (5.8 MB)
Collecting astunparse>=1.6.0
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Using cached tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
Collecting opt-einsum>=2.3.2
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting flatbuffers>=1.12
  Using cached flatbuffers-2.0-py2.py3-none-any.whl (26 kB)
Collecting absl-py>=0.4.0
  Using cached absl_py-1.0.0-py3-none-any.whl (126 kB)
Collecting keras<2.9,>=2.8.0rc0
  Using cached keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
Collecting keras-preprocessing>=1.1.1
  Using cached Keras_Preprocessing-1.1.2-py2.py3-no

In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
os.path.join('jigsaw-toxic-comment-classification-challenge', 'train.csv', 'train.csv')

'jigsaw-toxic-comment-classification-challenge\\train.csv\\train.csv'

In [4]:
df = pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge', 'train.csv', 'train.csv'))

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
df.iloc[15]['comment_text']

'"\n\nJuelz Santanas Age\n\nIn 2002, Juelz Santana was 18 years old, then came February 18th, which makes Juelz turn 19 making songs with The Diplomats. The third neff to be signed to Cam\'s label under Roc A Fella. In 2003, he was 20 years old coming out with his own singles ""Santana\'s Town"" and ""Down"". So yes, he is born in 1983. He really is, how could he be older then Lloyd Banks? And how could he be 22 when his birthday passed? The homie neff is 23 years old. 1983 - 2006 (Juelz death, god forbid if your thinking about that) equals 23. Go to your caculator and stop changing his year of birth. My god."'

In [12]:
df[df['severe_toxic']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0
181,006e87872c8b370c,you are a stupid fuck \n\nand your mother's cu...,1,1,1,0,1,0
442,01208d2b76624130,Hi \n\nIm a fucking bitch.\n\n50.180.208.181,1,1,1,0,1,0
579,018663f910e0bfe6,What a motherfucking piece of crap those fuckh...,1,1,1,0,1,0


In [13]:
df[df.columns[2:]].iloc[6]

toxic            1
severe_toxic     1
obscene          1
threat           0
insult           1
identity_hate    0
Name: 6, dtype: int64

# Pre-Process

In [14]:
from tensorflow.keras.layers import TextVectorization

In [15]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [19]:
# Number of words in the vocabulary
MAX_FEATURES = 200000

In [20]:
vectorizer = TextVectorization(max_tokens = MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [21]:
vectorizer.adapt(X.values)

In [23]:
vectorizer('Hello world, I am learning')

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([286, 261,   8, ...,   0,   0,   0], dtype=int64)>

In [25]:
vectorizer_text = vectorizer(X.values)

In [28]:
vectorizer_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [29]:
# MCSHBAP - map, cache, shuffle, batch, prefetch from_tensor_slices, list_file
dataset = tf.data.Dataset.from_tensor_slices((vectorizer_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # Helps prevent bottlenecks

In [30]:
dataset.as_numpy_iterator().next()

(array([[    12,    533,     52, ...,      0,      0,      0],
        [   283,   1896,    572, ...,      0,      0,      0],
        [   124,      7,     13, ...,      0,      0,      0],
        ...,
        [    94,     13,     42, ...,      0,      0,      0],
        [     8,     19,      1, ...,      0,      0,      0],
        [    88, 146354,    347, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [31]:
train = dataset.take(int(len(dataset)*.7))
validation = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [34]:
train_generator = train.as_numpy_iterator()

In [35]:
train_generator.next()

(array([[  46,  456,   76, ...,    0,    0,    0],
        [1394, 1445, 1778, ...,    0,    0,    0],
        [1941,  114,  428, ...,    0,    0,    0],
        ...,
        [  61,  119,    3, ...,    0,    0,    0],
        [5265,   43, 1562, ...,    0,    0,    0],
        [   2, 4116,   13, ...,    0,    0,    0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

# Build a Sequential Model

In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [40]:
model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [41]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [42]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [45]:
history = model.fit(train, epochs=30, validation_data=validation)

Epoch 1/30
  51/6981 [..............................] - ETA: 2:29:07 - loss: 0.1391

KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot()
plt.show()

# Make Predictions

In [None]:
input_text = vectorizer('You are a shitty person!')

In [None]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [None]:
(model.predict(batch_X) > 0.5).astype(int)

In [None]:
result = model.predict(batch_X)

# Evaluate Model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
precision = Precision()
recall = Recall()
accuracy = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():
    #Unpack the batch
    X_true, y_true = batch
    #Make a prediction
    yhat = model.predict(X_true)
    
    #Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    precision.update_state(y_true, yhat)
    recall.update_state(y_true, yhat)
    accuracy.update_state(y_true, yhat)

In [None]:
print(f'Precision: {precision.result().numpy()}, Recall: {recall.result().numpy()}, Accuracy: {accuracy.result().numpy()}')

# Test and Gradio

In [None]:
!pip install gradio jinja2

In [None]:
import gradio as gr

In [None]:
model.save('toxicity.h5')

In [None]:
model = tf.keras.models.load_model('toxicity.h5')

In [None]:
input_str = vectorizer('I fucking hate you, I will kill you!')

In [None]:
result = model.predict(np.expand_dims(input_str,0)

In [None]:
df.columns[2:]

In [None]:
res

In [None]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text=''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
        
    return text 

In [None]:
interface = gr.Interface(fn = score_comment, inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score') , outputs='text')

In [None]:
interface.launch(share = True)