
# Data preparation for Deep learning Model:


In [1]:
import os
import numpy as np

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import sys

import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



Mounting the cleaned data from the google drive:

In [2]:
path = '/content/drive/MyDrive/Colab Notebooks/clean_df.csv'
df = pd.read_csv(path)

In [3]:
df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,fuck wolfkeep talk,1,1,1,0,1,0
1,user talk png crusad bot hate mindless bot min...,0,0,0,0,0,0
2,probabl anoth station name ridgewood go check one,0,0,0,0,0,0
3,next time happen report eat children rarghhhh ...,1,1,1,1,1,0
4,new accord japanes ladi name kama chinen recen...,0,0,0,0,0,0
...,...,...,...,...,...,...,...
31220,correctionperson correct vandal blood libel ac...,1,0,0,0,0,0
31221,septemb hello welcom wikipedia hope seem unfri...,0,0,0,0,0,0
31222,welcom hello welcom wikipedia thank contribut ...,0,0,0,0,0,0
31223,fuck touch edit,1,0,1,0,0,0


In [4]:
df.shape

(31225, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31225 entries, 0 to 31224
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   comment_text   31220 non-null  object
 1   toxic          31225 non-null  int64 
 2   severe_toxic   31225 non-null  int64 
 3   obscene        31225 non-null  int64 
 4   threat         31225 non-null  int64 
 5   insult         31225 non-null  int64 
 6   identity_hate  31225 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 1.7+ MB


Double checking for Empty/Nan comments after the Pre-cleaning and Pre-processing

In [6]:
# The results show there is Nan comments
df.isnull().values.any()

True

In [7]:
index=df[df['comment_text'].isnull()].index

In [8]:
len(index)

5

In [9]:
sample=np.random.choice(index,5)

for i in sample:
    print(i,"- index data after pre-processing-->",df['comment_text'].values[i])
    print("="*100)

15152 - index data after pre-processing--> nan
15152 - index data after pre-processing--> nan
4356 - index data after pre-processing--> nan
15152 - index data after pre-processing--> nan
14338 - index data after pre-processing--> nan


Noticed that during pre-processing after all the unwanted data has been cleaned ,there was no data remains in certain rows of the  'comments_ text' columns. And it resulting in Nan value.

Therefore need to clean the Data again to remove the empty values.


In [10]:
# Elimination of Nan/empty data
df.dropna(inplace=True)

In [11]:
df.isnull().values.any()

False

In [12]:
df.shape

(31220, 7)

In [13]:
df.shape

(31220, 7)

In [14]:
X=df['comment_text']
y=df.drop(columns=['comment_text'])

# Vectorization of the text data

In [15]:
from tensorflow.keras.layers import TextVectorization

In [16]:
max_word = 200000

In [17]:
# Taking words and converting to intiger
vectorizer = TextVectorization(max_tokens= max_word,output_sequence_length=1800,output_mode='int')

In [18]:
#For the machine too learn all the words in our data set
vectorizer.adapt(X.values)

In [19]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'fuck',
 'articl',
 'wikipedia',
 'page',
 'like',
 'edit',
 'talk',
 'go',
 'one',
 'suck',
 'use',
 'know',
 'get',
 'would',
 'pleas',
 'delet',
 'peopl',
 'shit',
 'think',
 'make',
 'user',
 'block',
 'nigger',
 'see',
 'time',
 'u',
 'thank',
 'ass',
 'sourc',
 'say',
 'want',
 'also',
 'hate',
 'faggot',
 'person',
 'hi',
 'need',
 'even',
 'vandal',
 'remov',
 'gay',
 'die',
 'bitch',
 'look',
 'name',
 'wiki',
 'stop',
 'good',
 'work',
 'right',
 'link',
 'take',
 'thing',
 'well',
 'tri',
 'comment',
 'way',
 'moron',
 'inform',
 'fat',
 'chang',
 'may',
 'discuss',
 'cunt',
 'read',
 'call',
 'realli',
 'help',
 'imag',
 'jew',
 'editor',
 'admin',
 'fact',
 'revert',
 'refer',
 'come',
 'could',
 'back',
 'stupid',
 'new',
 'ad',
 'point',
 'reason',
 'post',
 'dick',
 'first',
 'mean',
 'much',
 'seem',
 'give',
 'mani',
 'made',
 'life',
 'section',
 'state',
 'question',
 'list',
 'find',
 'still',
 'pig',
 'said',
 'love',
 'littl',
 'wp',
 'place',
 'y

In [20]:
# To show example of the vectorization
# Numeric reprentation of the word
vectorizer ('Final project , lets make it fun')

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([508, 401,   1, ...,   0,   0,   0])>

In [21]:
#Tokenizing every single word
vectorized_text = vectorizer(X.values)

In [22]:
#Total of 31220 sentences vectorize,capped at 1800 words
vectorized_text

<tf.Tensor: shape=(31220, 1800), dtype=int64, numpy=
array([[    2, 10801,     8, ...,     0,     0,     0],
       [   22,     8,  2072, ...,     0,     0,     0],
       [  253,   156,  1540, ...,     0,     0,     0],
       ...,
       [  244,   294,   244, ...,     0,     0,     0],
       [    2,  1192,     7, ...,     0,     0,     0],
       [ 1514, 29004,    15, ...,     0,     0,     0]])>

In [23]:
# Data pipeline steps
#MCSHBAP - map, chache, shuffle, batch, prefetch to build data pipeline from_tensor_slices
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)# helps prevent bottlenecks

##Splitting Data Set for training, validation and testing

In [24]:
# 70% training data, 20% validation and 10 % test
train = dataset.take(int(len(dataset)*.7)) # specify the num of batches
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)) # will skip the 1st 70%
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1)) # will skip the 1st 9


In [25]:
#Total data divide by 16 batch will get 9974 batches in total
len(dataset)


1952

In [26]:
# total train batches
len(train)

1366

In [27]:
# total validation batches
len (val)

390

In [28]:
# total of test batches
len(test)

195

For Deep Learning Machine Model : It will pass thru a batch into into forward pass ,it will them
go and do a backward pass, and it will update the gradients and will follow with the next batch


In [29]:
# create a generator and .next get the next batch
# everytime we run this code ,it will fetch the next batch
train_generator = train.as_numpy_iterator()
train_generator.next()

(array([[   86,   219,   219, ...,     0,     0,     0],
        [ 1246,    20,    58, ...,     0,     0,     0],
        [    2,   779,    14, ...,     0,     0,     0],
        ...,
        [21343,    44,     9, ...,     0,     0,     0],
        [  222,   236,    13, ...,     0,     0,     0],
        [   62,  3188,  3967, ...,     0,     0,     0]]),
 array([[1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [1, 0, 1, 0, 1, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

# Creating Sequential Deep Learning Model

In [30]:
# Download Sequential model
from tensorflow.keras.models import Sequential
# Layers to build deep neural networks
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [31]:
model = Sequential()
# Create the embedding layer
model.add(Embedding(max_word+1, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer
model.add(Dense(6, activation='sigmoid'))

In [32]:
optimizer=keras.optimizers.Adam(learning_rate=0.003)

In [33]:
model.compile(loss='BinaryCrossentropy', optimizer= optimizer , metrics = ['accuracy'])

In [34]:
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [38]:
# Had to limit the epochs as GPU is not available for better training of model.
# We can achive a better accuracy level by increasing the epochs number.
history = model.fit(train, epochs=2, validation_data=val)

Epoch 1/2
Epoch 2/2


# Testing out the model with new inputs:

In [41]:
# Testing out the model. Test 1:
input_text = vectorizer('You freaking suck! I am going to hit you')

In [42]:
df.columns[1:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [43]:
result = model.predict(np.expand_dims(input_text , 0 ))




In [44]:
(result > 0.5).astype(int)


array([[1, 0, 1, 0, 1, 0]])

In [45]:
# Test 2:
input_text = vectorizer ('Congratulations! That was a great performance')

result = model.predict(np.expand_dims(input_text , 0 ))
(result > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0]])

In [69]:
# Test 3:
# Its purely for demonstration purposes,to test if  the model able to recognize 'identity_hate' (sorry)
input_text = vectorizer (' back off you nigger')

result = model.predict(np.expand_dims(input_text , 0 ))
(result > 0.5).astype(int)





array([[1, 0, 1, 0, 1, 1]])

In [50]:
#Download Library to save the model
from tensorflow.keras.models import load_model

In [53]:
# Download the model file so it can be just tested in the future without the need of running it again
model.save('/content/drive/MyDrive/Trained_Model/toxic_trained.h5')