Importing Dependencies

In [1]:
import os 
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
# Read the training CSV into a pandas DataFrame.
dataframe = pd.read_csv(
    os.path.join(
        'jigsaw-toxic-comment-classification-challenge',
        'train.csv',
        'train.csv',
    )
)

In [3]:
# displaying the loaded dataframe (the notebook shows a truncated preview of the first and last rows)
dataframe

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
# listing every column from position 2 onwards; these columns are used as the labels (y) further down
dataframe.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

Preprocessing and Setting up Text Vectorisation Model

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
# separate the raw comment text (x) from the label matrix (y)
x = dataframe['comment_text']
# every column from position 2 onwards is a label column
y = dataframe.iloc[:, 2:].values

In [8]:
# displaying the comment text Series to check its contents
x

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [9]:
# displaying the label array (one row per comment, one column per label column)
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [10]:
# vocabulary cap: the maximum number of words the text vectorizer stores in its 'dictionary'
MAX_FEATURES = 200_000

In [11]:
# create the text vectorizer: integer token ids, output sequence length fixed at 1800
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [12]:
# adapting the text vectorizer to the comment data
# x.values hands the comments over as a numpy array instead of a pandas Series
vectorizer.adapt(x.values)

In [13]:
# vectorizing every comment and keeping the resulting tensor of token ids
vectorized_text = vectorizer(x.values)

In [14]:
# displaying the vectorized text to check that the vectorizer works
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

Loading the data into tensorflow

In [15]:
# The general rule for loading data into a tf.data pipeline is MCSHBAP: map, cache, shuffle, batch, prefetch
# (the map step is left out here because the text was already vectorized in the previous section)

In [16]:
# pairing each vectorized comment with its label row in a single tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
# caching the dataset elements
dataset = dataset.cache()
# shuffling with a fixed order: by default shuffle() reshuffles on every iteration,
# which would make the take()/skip() partitions created later contain different
# samples on each pass and leak validation/test samples into training
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
# batching (16 samples per batch) and prefetching (up to 8 batches) the data
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [17]:
# pull one batch out of the tensorflow dataset and unpack it into data (batch_x) and labels (batch_y)
# as_numpy_iterator() yields numpy arrays; next() fetches the first element of that iterator
batch_x, batch_y = next(dataset.as_numpy_iterator())

In [18]:
# splitting the batched data into training, validation and testing partitions (ratio 7:2:1)
# train: the first 70% of the batches
train = dataset.take(int(len(dataset)*0.7))
# val: the 20% of the batches that follow the training range
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
# test: skip the first 90% (train + val) and keep the final 10%;
# take() instead of skip() here would draw the test batches from the training range
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

Checking the size of partitions

In [19]:
# report how many batches ended up in each partition
print(
    f'Length of train is:{len(train)}, '
    f'Length of validation is:{len(val)}, '
    f'Length of test is:{len(test)}'
)

Length of train is:6981, Length of validation is:1994, Length of test is:997


In [20]:
# peek at the first (data, labels) batch of the training partition
next(train.as_numpy_iterator())

(array([[  1204,      5,     14, ...,      0,      0,      0],
        [    46,     33,     15, ...,      0,      0,      0],
        [     2,    114,    842, ...,      0,      0,      0],
        ...,
        [    45,      3,  11990, ...,      0,      0,      0],
        [     9, 143340,   1749, ...,      0,      0,      0],
        [ 95576,   2797,  25699, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0]], dtype=int64))

Training the Model

In [23]:
# importing the neccesary models from tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [24]:
# the network is a plain top-to-bottom stack of layers, so a Sequential model is used
model = Sequential([
    # embedding layer: turns each token id of the vectorized text into a 32-dimensional vector
    Embedding(MAX_FEATURES+1, 32),
    # bidirectional LSTM gives the network context from both directions
    # ('tanh' activation is kept purely to maintain GPU computation support)
    Bidirectional(LSTM(32, activation='tanh')),
    # dense (fully connected) layers that classify and label the data
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # final dense layer with 6 outputs (one per label); sigmoid maps each output to between 0 and 1
    Dense(6, activation='sigmoid'),
])

In [25]:
# compile the model with its optimizer and loss
# every one of the outputs is an independent binary decision, so 'BinaryCrossentropy' is used
# rather than 'CategoricalCrossentropy' or 'SparseCategoricalCrossentropy'
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

In [26]:
# printing a summary of the final model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

: 

In [None]:
# training the model with model.fit on the training partition for 10 epochs, validating on the validation partition
# the returned object is kept in `history` so the loss curves can be plotted afterwards
history = model.fit(train, epochs=10, validation_data=val)

Assessing the Model Performance

In [1]:
# importing matplotlib to visualise training performance
import matplotlib.pyplot as plt

In [2]:
# draw the training loss and the validation loss on the same axes
for curve_name, curve_colour in (('loss', 'blue'), ('val_loss', 'red')):
    plt.plot(history.history[curve_name], color=curve_colour, label=curve_name)
plt.title('Loss')
plt.legend()
plt.show()

NameError: name 'history' is not defined

In [None]:
# vectorizing a single sample comment to try the model on
input_text = vectorizer("you freaking suck")

In [None]:
# the model expects a leading batch dimension (shown as None in the model summary)
# hence np.expand_dims is used to add that dimension at position 0 before predicting
res = model.predict(np.expand_dims(input_text, 0))

In [None]:
# checking the labels: every dataframe column from position 2 onwards
labels = dataframe.columns
labels[2:]

In [None]:
# fetch the first batch of the test partition as numpy arrays
batch = next(test.as_numpy_iterator())

In [None]:
# unpacking the batch into its data (batch_x) and its labels (batch_y)
batch_x, batch_y = batch

In [None]:
# displaying the true labels of the batch
batch_y

In [None]:
# thresholding the model outputs for the batch at 0.5 to view the predictions as 1 or 0 

(model.predict(batch_x) > 0.5).astype(int)

In [None]:
# thresholding `res` (the earlier prediction for the single sample comment) at 0.5 to view it as 1 or 0
# NOTE(review): this displays a prediction, not true labels; the true labels of the batch are in batch_y
(res > 0.5).astype(int)

In [None]:
# importing extra metrics for assessing performance
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# instantiating the metric objects; each one accumulates its state across update_state() calls
precision = Precision()
recall = Recall()
# the targets are six independent 0/1 labels rather than one-hot classes, so accuracy
# is measured per label with BinaryAccuracy; CategoricalAccuracy compares argmax
# positions, which is meaningless for multi-label outputs
accuracy = tf.keras.metrics.BinaryAccuracy()

In [None]:
# iterate over every batch of the test partition, predict on it and feed the metrics
for x_true, y_true in test.as_numpy_iterator():
    # flatten predictions and labels into 1-D vectors so they line up element by element
    yhat = model.predict(x_true).flatten()
    y_true = y_true.flatten()

    # accumulate this batch into each performance metric
    for metric in (precision, recall, accuracy):
        metric.update_state(y_true, yhat)

In [None]:
# printing the accumulated performance metrics of the model
# (every label now uses the same 'Name:value' format and the stray trailing comma is gone)
print(f'Precision:{precision.result().numpy()}, Recall:{recall.result().numpy()}, Accuracy:{accuracy.result().numpy()}')

Saving the Model and Testing Again

In [None]:
# saving the trained model to the file 'toxicity.h5'
model.save('toxicity.h5')

In [None]:
# importing dependency to load tensorflow models
from tensorflow.keras.models import load_model

In [None]:
# loading the saved model back from 'toxicity.h5' under a new name
toxicity = load_model('toxicity.h5')

In [None]:
# printing the summary of the loaded model to check that it was restored correctly
toxicity.summary()

In [None]:
# performing text vectorization on a sample string
input_str = vectorizer('You freaking suck at this game')

In [None]:
# predicting with the reloaded model (`toxicity`) so that this step actually tests the saved file;
# predicting with `model` would only re-test the in-memory model
# np.expand_dims adds the batch dimension at position 0
res = toxicity.predict(np.expand_dims(input_str, 0))

In [None]:
# thresholding the prediction at 0.5 and showing it as 1 or 0 (for true or false)
(res > 0.5).astype(int)

Making an Interface for the Model using Gradio

In [None]:
# importing gradio
import gradio as gr

In [None]:
# scoring function that gradio calls for every submitted comment
def score_comment(comment):
    """Score one comment and return a text report with one line per label.

    The comment is wrapped in a list, vectorized with the notebook-level
    `vectorizer` and passed to the notebook-level `model`. Each line of the
    returned string has the form '<label>: <bool>', where the bool tells
    whether the predicted value for that label is greater than 0.5.
    """
    # a batch holding just this one comment, converted to token ids
    token_ids = vectorizer([comment])
    # first (and only) row of predictions, thresholded at 0.5
    flags = model.predict(token_ids)[0] > 0.5

    # one '<label>: <bool>' line per label column, each ending with a newline
    return ''.join(
        '{}: {}\n'.format(label, flag)
        for label, flag in zip(dataframe.columns[2:], flags)
    )

In [None]:
# initialising the gradio interface and defining the function, inputs and outputs
# gr.Textbox is used directly: the older gr.inputs.* namespace is deprecated and
# no longer exists in current gradio releases
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                         outputs='text')

In [None]:
# launching the interface (share=False, so no public share link is requested)
interface.launch(share=False)

Saving vectorizer model

In [None]:
# the text vectorizer is specific to the dataset it was adapted on, so any other
# program that wants to use it would otherwise have to adapt it again every time,
# which is slow and resource intensive.
# it is therefore wrapped in a small tensorflow model (a string input followed by
# the vectorizer layer) that can be saved and loaded like any other model.
textvect = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorizer,
])

In [None]:
# saving the vectorizer model to 'vectorizer.tf'
textvect.save('vectorizer.tf')

In [None]:
# loading the saved vectorizer model back from 'vectorizer.tf'
loaded_vect_model = load_model('vectorizer.tf')
# taking the first layer of the loaded model to use as the vectorizer
# NOTE(review): presumably layers[0] is the restored TextVectorization layer — confirm
vect_model = loaded_vect_model.layers[0]

In [None]:
# testing the loaded vectorizer by running it on all the comments and printing the result
print(vect_model(x.values))

In [None]:
# vectorizing the same comments with the original vectorizer so the two printed results can be compared
print(vectorizer(x.values))