# Chat Toxicity

### Libraries

In [1]:
# basic libraries
import numpy as np
import pandas as pd
import os
# For plots
from matplotlib import pyplot as plt
from tensorflow.data import Dataset,AUTOTUNE
from tensorflow.keras.layers import TextVectorization

### Config

In [2]:
# Text vectorization limits: vocabulary size and padded sequence length
TokenLimit = 200_000
SentenceLimit = 2_000

# tf.data pipeline settings: shuffle buffer size and batch size
ShuffleParameter = 100_000
BatchSize = 16

# Layer widths for the Sequential model
LSTM_NEURONS = 32
Dense1Neurons = 128
Dense2Neurons = 256
Dense3Neurons = 128
OutputLayer = 6  # one sigmoid unit per label column

### Importing dataset

In [3]:
# Load the training data from train.csv in the working directory
df = pd.read_csv('train.csv')

In [4]:
# Preview the first rows to check the file loaded with the expected columns
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Looking into the data

In [7]:
# Show only the comments whose `toxic` flag is set
df.loc[df['toxic'] == 1]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
159494,fef4cf7ba0012866,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1
159514,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1,0,0,0,1,0
159541,ffa33d3122b599d6,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0
159546,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0


In [6]:
# (rows, columns) of the loaded frame
df.shape

(159571, 8)

In [10]:
# List the column names: id, comment text, then the label flags
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [11]:
# Inspect the last rows of the frame
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


### Preprocessing The Data

#### Splitting the data and renaming a column

In [12]:
# Rename the `obscene` label to `sexually_explicit`. The renamed frame is
# rebound to `df` instead of mutating in place, so the object shown by the
# earlier display cells is never silently altered.
df = df.rename(columns={'obscene': 'sexually_explicit'})

In [13]:
# Model input: the raw comment text
xData = df['comment_text']
# Targets: every column after `id` and `comment_text` (the label flags)
yData = df[list(df.columns[2:])]

In [14]:
# Confirm that only the label columns were selected
yData.head()

Unnamed: 0,toxic,severe_toxic,sexually_explicit,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


#### Text Tokenization

In [15]:
# Text-to-integer vectorizer: the vocabulary is capped at TokenLimit tokens
# and every comment is emitted as SentenceLimit integer token ids.
vectorize = TextVectorization(
    output_sequence_length=SentenceLimit,
    output_mode='int',
    max_tokens=TokenLimit,
)

In [16]:
# Fit the vectorizer's vocabulary on the comment text
vectorize.adapt(xData)

In [17]:
# Convert every comment to its integer token sequence in one call.
# NOTE(review): this materializes all rows at SentenceLimit width in memory — confirm it fits.
vectorizedData=vectorize(xData)

#### Creating Dataset

In [18]:
# No map() step is needed: the comments are already vectorized.
dataset = Dataset.from_tensor_slices((vectorizedData, yData))

# Cache the elements so later epochs reuse them instead of rebuilding them.
dataset = dataset.cache()

# Shuffle after caching so the cache holds the unshuffled elements.
# The order is frozen (reshuffle_each_iteration=False) because the
# train/test/validation sets are carved out of this single dataset with
# take()/skip(). With the default per-iteration reshuffle, every pass would
# draw a different permutation and examples would leak between the splits.
dataset = dataset.shuffle(ShuffleParameter, reshuffle_each_iteration=False)

# Group examples into batches of BatchSize for training.
dataset = dataset.batch(BatchSize)

# Prepare upcoming batches while the current one is being consumed.
dataset = dataset.prefetch(AUTOTUNE)

#### Creating the training, test, and validation sets

In [29]:
# Split sizes are counted in batches (len() of the batched dataset):
# 70% train, 20% test, 10% validation, each rounded down.
trainSize = len(dataset) * 70 // 100
testSize = len(dataset) * 20 // 100
valSize = len(dataset) * 10 // 100

In [30]:
# Consecutive windows of batches, by position: train first, then test, then validation.
# NOTE(review): the windows only stay disjoint if `dataset` yields the same
# order on every iteration — confirm the upstream shuffle does not reshuffle per epoch.
trainData = dataset.take(trainSize)
testData = dataset.skip(trainSize).take(testSize)
valData = dataset.skip(trainSize).skip(testSize).take(valSize)

### Creating Model

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [32]:
# Embedding -> bidirectional LSTM -> three ReLU dense layers -> sigmoid output,
# assembled layer by layer.
model = Sequential()
model.add(Embedding(TokenLimit+1, 32))
model.add(Bidirectional(LSTM(LSTM_NEURONS, activation='tanh')))
for denseUnits in (Dense1Neurons, Dense2Neurons, Dense3Neurons):
    model.add(Dense(denseUnits, activation='relu'))
# One sigmoid unit per label, so each label is predicted independently.
model.add(Dense(OutputLayer, activation='sigmoid'))

#### Training the model

In [33]:
# Binary cross-entropy over the sigmoid outputs, optimized with Adam
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Train for 6 epochs, evaluating on the validation split after each one
history = model.fit(
    trainData,
    validation_data=valData,
    epochs=6,
)

Epoch 1/6
 201/6981 [..............................] - ETA: 3:59:24 - loss: 0.1774

In [None]:
# Plot the per-epoch values recorded by model.fit().
# The axes are created explicitly and handed to pandas: DataFrame.plot()
# without `ax` opens its own figure, which would leave the 8x5 figure empty
# and draw the curves at the default size instead.
fig, ax = plt.subplots(figsize=(8, 5))
pd.DataFrame(history.history).plot(ax=ax)
ax.set(title='Training history', xlabel='Epoch', ylabel='Loss')
plt.show()