# Toxic comment analysis using GRU

---


## Load libraries


In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

## Load Dataset


In [3]:
data = pd.read_csv("datasets/train.csv")
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Clean the data


In [4]:
print("Number of missing values in each column :")
print(data.isnull().sum())

Number of missing values in each column :
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


We will remove the rows with missing values and remove the id column as it is not relevant.


In [5]:
data = data.dropna()
del data["id"]

Our data is now clean and ready for analysis !


In [6]:
data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
from helpers.data.dataset import print_data_composition

print_data_composition(data)

Number of neutral comments : 143346

Number of toxic comments : 16225
| number of severe_toxic : 1595
| number of obscene : 8449
| number of threat : 478
| number of insult : 7877
| number of identity_hate : 1405


There is clearly imbalanced. The number of neutral comments is way greater than the number of toxic comments


## Rebalance the dataframe using downsampling

We remove neutral comments to balance the dataset


In [8]:
print_data_composition(data)

Number of neutral comments : 143346

Number of toxic comments : 16225
| number of severe_toxic : 1595
| number of obscene : 8449
| number of threat : 478
| number of insult : 7877
| number of identity_hate : 1405


In [9]:
# count the number of comments that have at least one label set to 1
toxic_comments = data[(data["toxic"] == 1) |
                      (data["severe_toxic"] == 1) |
                      (data["obscene"] == 1) |
                      (data["threat"] == 1) |
                      (data["insult"] == 1) |
                      (data["identity_hate"] == 1)]

toxic_comments


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
...,...,...,...,...,...,...,...
159494,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1
159514,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1,0,0,0,1,0
159541,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0
159546,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0


### Downsample the data


In [10]:
num_toxic_comments = len(toxic_comments)
num_neutral_comments = len(data[(data["toxic"] == 0) &
                                (data["severe_toxic"] == 0) &
                                (data["obscene"] == 0) &
                                (data["threat"] == 0) &
                                (data["insult"] == 0) &
                                (data["identity_hate"] == 0)])

num_samples = min(num_toxic_comments, num_neutral_comments)

neutral_comments = data[(data["toxic"] == 0) &
                        (data["severe_toxic"] == 0) &
                        (data["obscene"] == 0) &
                        (data["threat"] == 0) &
                        (data["insult"] == 0) &
                        (data["identity_hate"] == 0)].sample(n=num_samples * 2)

data = pd.concat([toxic_comments, neutral_comments])

print_data_composition(data)

Number of neutral comments : 32450

Number of toxic comments : 16225
| number of severe_toxic : 1595
| number of obscene : 8449
| number of threat : 478
| number of insult : 7877
| number of identity_hate : 1405


### Clean the comments


In [11]:
data["comment_text"].head()

6          COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
12    Hey... what is it..\n@ | talk .\nWhat is it......
16    Bye! \n\nDon't look, come or think of comming ...
42    You are gay or antisemmitian? \n\nArchangel WH...
43             FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!
Name: comment_text, dtype: object

In [12]:
from helpers.data.process_comment import process_comment
from helpers.data.text_manipulation import TextManipulation

text_manipulator = TextManipulation()

comments = data["comment_text"].to_numpy()

labels_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
labels = data[labels_columns].to_numpy()

for i, comment in enumerate(comments):
    comments[i] = process_comment(comment)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturi

### Tokenize the comments


In [13]:
MAX_TOKENS = 1500

In [14]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=MAX_TOKENS, oov_token="<OOV>")
tokenizer.fit_on_texts(comments)

vocab_size = len(tokenizer.word_index) + 1

In [15]:
sequences = tokenizer.texts_to_sequences(comments)
comments = pad_sequences(sequences, padding="post", maxlen=MAX_TOKENS)

In [16]:
comments[:5, :10]

array([[ 593,  148,    3,    1,  282,   17,   24,  154,    0,    0],
       [ 321,   37,    8,   12,   53,   37,    8,   12,   31,    1],
       [1044,   21,   11,  162,  221,   28,   72,    9,    1,  157],
       [   3,   14,  149,   28,    1,    1,  627,    1,    1,    1],
       [  47,   20,    1,  541,   13,    2,  121,    1,    0,    0]],
      dtype=int32)

## Analyse the text


### Split the training and test data


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(comments, labels, test_size=0.2, random_state=42)

In [18]:
print(f"Train data samples : {len(X_train)}")
print(f"Test data samples : {len(X_test)}")

Train data samples : 38940
Test data samples : 9735


In [19]:
MODEL_PATH = "models/GRU"

In [20]:
import os

os.makedirs(MODEL_PATH, exist_ok=True)

In [21]:
import joblib

with open(f"{MODEL_PATH}/vectorizer.pkl", "wb") as f:
    joblib.dump(tokenizer, f)

## Callbacks


## Build the model

### Load GloVe embeddings


In [22]:
EMBEDDING_DIM = 100
GLOVE_PATH = 'GloVe/glove.6B.100d.txt'

In [23]:
embeddings_index = {}
with open(GLOVE_PATH) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i < MAX_TOKENS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

### CSV Logger


In [24]:
from tensorflow.keras.callbacks import CSVLogger

csv_callback = CSVLogger(
    f"{MODEL_PATH}/metrics.csv"
)

### Model checkpoint


In [25]:
from tensorflow.keras.callbacks import ModelCheckpoint

model_checkpoint_callback = ModelCheckpoint(
    filepath=f"{MODEL_PATH}/model.keras",
    monitor='val_loss',
    mode='min',
    save_best_only=True)

## Model Architecture


In [26]:
DROPOUT = 0.2

In [27]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Bidirectional, GlobalMaxPool1D, GRU, Embedding, BatchNormalization, \
    Dropout
from tensorflow.keras.initializers import Constant

number_of_classes = len(labels_columns)

model = Sequential()

model.add(Input(shape=(MAX_TOKENS,)))
model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM,
                    embeddings_initializer=Constant(embedding_matrix), trainable=False))
#
model.add(Bidirectional(GRU(EMBEDDING_DIM, return_sequences=True)))
model.add(Dropout(DROPOUT))
#
model.add(Bidirectional(GRU(EMBEDDING_DIM, return_sequences=False)))
model.add(Dropout(DROPOUT))
#
model.add(Dense(EMBEDDING_DIM, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(DROPOUT))
#
model.add(Dense(number_of_classes, activation="sigmoid"))

model.compile(optimizer="adam", metrics=["accuracy"], loss="binary_crossentropy")
model.summary()

## Model training


In [28]:
NUM_EPOCHS = 6
BATCH_SIZE = 32

In [None]:
history = model.fit(
    X_train,
    y_train,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks=[model_checkpoint_callback, csv_callback],
)

Epoch 1/6
[1m 29/974[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m34:01[0m 2s/step - accuracy: 0.1858 - loss: 0.7771

## Evaluate the model


### Show the training history


In [None]:
from helpers.data.show_training_metrincs import show_training_metrics

show_training_metrics(f"{MODEL_PATH}/metrics.csv")

### Evaluate the model on the test data


In [None]:
evaluation = model.evaluate(X_test, y_test)
print(evaluation)

In [None]:
sentence = ["I love you so much, you are the best person in the world"]

sentence = process_comment(sentence[0])
print(sentence)

sentence = tokenizer.texts_to_sequences([sentence])
print(sentence)
sentence = pad_sequences(sentence, padding="post", maxlen=MAX_TOKENS)

print(sentence)

prediction = model.predict(sentence)
print(labels_columns)
print(prediction)
