# CSC413 Project: Toxicity Detector

In [None]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models, torchvision.datasets
from torch.utils.data import DataLoader, TensorDataset

## Data

We will begin by downloading the data onto Google Colab from kaggle. The data is retrieved from https://www.kaggle.com/datasets/fizzbuzz/cleaned-toxic-comments

In [None]:
!pip install kaggle



In [None]:
from google.colab import files
files.upload() # Upload kaggle.json file

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"christiantabbah","key":"67b4d52227582cb20b4b772c28090d0c"}'}

In [None]:
! mkdir ~/.kaggle

In [None]:
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets download -d fizzbuzz/cleaned-toxic-comments
! unzip cleaned-toxic-comments.zip

Downloading cleaned-toxic-comments.zip to /content
 82% 36.0M/43.7M [00:00<00:00, 127MB/s]
100% 43.7M/43.7M [00:00<00:00, 97.1MB/s]
Archive:  cleaned-toxic-comments.zip
  inflating: test_preprocessed.csv   
  inflating: train_preprocessed.csv  


## Data Analysis and Processing
We will analyze the type and amount of data that we will work with and introduce a validation set.

In [None]:
import pandas as pd
df_train = pd.read_csv('/content/train_preprocessed.csv')
df_test = pd.read_csv('/content/test_preprocessed.csv')

# Randomly split the train set into a 15% validation and 85% train set
df_val = df_train.sample(n=int(len(df_train) * 0.15))
df_train = df_train.drop(df_val.index)

# First few samples in the data set of each toxicity level
print(df_train.head())
print(df_train[df_train['toxic']==1.0].head())
print(df_train[df_train['threat']==1.0].head())
print(df_train[df_train['severe_toxic']==1.0].head())

                                        comment_text                id  \
0  explanation why the edits made under my userna...  0000997932d777bf   
1  d aww  he matches this background colour i m s...  000103f0d9cfb60f   
2  hey man  i m really not trying to edit war  it...  000113f07ec002fd   
3    more i can t make any real suggestions on im...  0001b41b1c6bb37e   
4  you  sir  are my hero  any chance you remember...  0001d958c54c6e35   

   identity_hate  insult  obscene    set  severe_toxic  threat  toxic  \
0            0.0     0.0      0.0  train           0.0     0.0    0.0   
1            0.0     0.0      0.0  train           0.0     0.0    0.0   
2            0.0     0.0      0.0  train           0.0     0.0    0.0   
3            0.0     0.0      0.0  train           0.0     0.0    0.0   
4            0.0     0.0      0.0  train           0.0     0.0    0.0   

   toxicity  
0       0.0  
1       0.0  
2       0.0  
3       0.0  
4       0.0  
                                

In [None]:
training_samples = len(df_train)
val_samples = len(df_val)
testing_samples = len(df_test)
print(f'Train samples: {training_samples}')
print(f'Validation samples: {val_samples}')
print(f'Test samples: {testing_samples}')

Train samples: 135636
Validation samples: 23935
Test samples: 153164


Now we will tokenize and prepare the word embedding:

In [None]:
def tokenize_text(text):
    # Check if text is not NaN
    if isinstance(text, str):
        return text.split()
    else:
        return []

def convert_indices(data, vocab):
    result = []
    for row in data:
        text = row.comment_text
        label = row.toxicity
        words = tokenize_text(text)
        indices = [vocab[word] for word in words if word in vocab]
        result.append((indices, label))
    return result

Create vocab:

In [None]:
vocab = {}
for text in df_train['comment_text']:
    words = tokenize_text(text)
    for word in words:
        if word not in vocab:
            vocab[word] = len(vocab)
print(vocab)

# Convert data to word indices
train_data_indices = convert_indices(df_train.itertuples(index=False), vocab)
val_data_indices = convert_indices(df_val.itertuples(index=False), vocab)
test_data_indices = convert_indices(df_test.itertuples(index=False), vocab)

max_seq_length = 200
pad_sequence = nn.utils.rnn.pad_sequence



In [40]:
# The following code proves that the longest sequences are super long.
# this longest sequence will define the rest of the shapes, as
# pad_sequence pads with 0s, to the length of the longest sequence.

lengths = [len(item[0]) for item in val_data_indices]

# Sort indices based on lengths
sorted_indices = sorted(enumerate(lengths), key=lambda x: x[1], reverse=True)

# Extract the indices of the top 3 longest sequences
top3_indices = [val_data_indices[i[0]] for i in sorted_indices[:3]]

# Print the lengths of the top 3 longest sequences
print("Top 3 longest sequences:")
for indices, label in top3_indices:
    print(f"Length: {len(indices)}, Label: {label}")


# Pad sequences to a common length
train_indices_padded = pad_sequence([torch.tensor(item[0]) for item in train_data_indices], batch_first=True)
val_indices_padded = pad_sequence([torch.tensor(item[0]) for item in val_data_indices], batch_first=True)
test_indices_padded = pad_sequence([torch.tensor(item[0]) for item in test_data_indices], batch_first=True)
print(val_indices_padded.shape)
def print_indices_in_sequence(sequence):
    for index in sequence[0]:
        print(index.item())

print_indices_in_sequence(val_indices_padded)

# Convert labels to tensors
train_labels = torch.tensor([item[1] for item in train_data_indices])
val_labels = torch.tensor([item[1] for item in val_data_indices])
test_labels = torch.tensor([item[1] for item in test_data_indices])

def print_label_counts(labels):
    unique_labels, label_counts = torch.unique(labels, return_counts=True)

    print("Label Counts:")
    for label, count in zip(unique_labels, label_counts):
        print(f"Label {int(label)}: {count} samples")

print_label_counts(val_labels)

Top 3 longest sequences:
Length: 1247, Label: 4.0
Length: 1112, Label: 4.0
Length: 1104, Label: 4.0
torch.Size([23935, 1247])
92
2
12116
1006
15
1411
140
463
464
99
56
286
4516
111
5782
125
3508
63
3530
2
1183
224
63
111
99
1080
13456
34646
4516
19
105
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

In [41]:

# Create TensorDataset
train_dataset = TensorDataset(train_indices_padded, train_labels)
val_dataset = TensorDataset(val_indices_padded, val_labels)
test_dataset = TensorDataset(test_indices_padded, test_labels)
print("Validation Dataset:")
print("Number of Samples:", len(val_dataset))
print("Shape Example:", val_dataset[0][0])
print("Label Example:", val_dataset[0][1])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

class LSTMToxicityModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(LSTMToxicityModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, X):
        embedded = self.embedding(X)
        lstm_out, _ = self.lstm(embedded)
        last_hidden_state = lstm_out[:, -1, :]
        output = self.fc(last_hidden_state)
        return output

Validation Dataset:
Number of Samples: 23935
Shape Example: tensor([   92,     2, 12116,  ...,     0,     0,     0])
Label Example: tensor(0.)
