In [2]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [4]:
# Reading datasets using pandas.
# The input directory is named once so the notebook can be pointed at another
# copy of the data by editing a single line instead of three paths.
DATA_DIR = '/kaggle/input/toxic-dataset'
train_data = pd.read_csv(os.path.join(DATA_DIR, 'train_data.csv'))
train_data_external = pd.read_csv(os.path.join(DATA_DIR, 'train_data_external.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test_data.csv'))

# Train dataset preparation
* Adding new external toxic and non-toxic comments
* Balanced dataset

In [5]:
# Harmonise column names across the train and test frames
train_data = train_data.rename(
    columns={'Label': 'label', 'Text': 'comment_text'},
)
test_data = test_data.rename(
    columns={'ID': 'id', 'Text': 'comment_text'},
)

In [6]:
# Class balance of the original training set
distribution = train_data.loc[:, 'label'].value_counts()
print(distribution)

# Dataset has only 837 toxic comments while having 6654 non-toxic comments
# Which is a huge data imbalance

label
1    6654
0     837
Name: count, dtype: int64


In [7]:
# Pull every comment flagged as toxic from the external dataset and relabel
# it with this project's convention (toxic -> label 0).
toxic_comments = train_data_external.loc[train_data_external['toxic'] == 1]
copied_toxic_comments = (
    toxic_comments[['toxic', 'comment_text']]
    .rename(columns={'toxic': 'label'})
    .reset_index(drop=True)
)
copied_toxic_comments['label'] = copied_toxic_comments['label'].replace({1: 0})
print(copied_toxic_comments)

       label                                       comment_text
0          0       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
1          0  Hey... what is it..\n@ | talk .\nWhat is it......
2          0  Bye! \n\nDon't look, come or think of comming ...
3          0  You are gay or antisemmitian? \n\nArchangel WH...
4          0           FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!
...      ...                                                ...
15289      0  "\n\n our previous conversation \n\nyou fuckin...
15290      0                  YOU ARE A MISCHIEVIOUS PUBIC HAIR
15291      0  Your absurd edits \n\nYour absurd edits on gre...
15292      0  "\n\nHey listen don't you ever!!!! Delete my e...
15293      0  and i'm going to keep posting the stuff u dele...

[15294 rows x 2 columns]


In [8]:
# Append the external toxic comments to the original training data
merged_train_data = pd.concat(
    objs=[train_data, copied_toxic_comments],
    axis=0,
    ignore_index=True,
)
print(merged_train_data)

       label                                       comment_text
0          1                                              why .
1          1              I still love you so much just priva .
2          1                         I wish you every success .
3          0  She may or may not be a Jew but she 's certain...
4          1  I 'm just pointing out our version of mercy an...
...      ...                                                ...
22780      0  "\n\n our previous conversation \n\nyou fuckin...
22781      0                  YOU ARE A MISCHIEVIOUS PUBIC HAIR
22782      0  Your absurd edits \n\nYour absurd edits on gre...
22783      0  "\n\nHey listen don't you ever!!!! Delete my e...
22784      0  and i'm going to keep posting the stuff u dele...

[22785 rows x 2 columns]


In [9]:
# Class balance after adding the external toxic comments
distribution_merged = merged_train_data.loc[:, 'label'].value_counts()
print(distribution_merged)

# Dataset has more than 16 000 toxic comments while having ~6500 non-toxic comments
# We can add another 10 000 non-toxic comments to make it balanced

label
0    16131
1     6654
Name: count, dtype: int64


In [10]:
# Creating new column "non_toxic": 1 when none of the six flag columns at
# positions 2-7 is set for a comment, else 0.
# Computed column-wise (vectorised) instead of with a Python lambda per row,
# which is far faster on a frame of this size. A missing value in any flag
# column still yields 0, as with the row-wise sum.
# Assumes the flag columns hold 0/1 values - TODO confirm.
train_data_external['non_toxic'] = (
    train_data_external.iloc[:, 2:8].eq(0).all(axis=1).astype('int64')
)
print(train_data_external)

                      id                                       comment_text  \
0       0000997932d777bf  Explanation\nWhy the edits made under my usern...   
1       000103f0d9cfb60f  D'aww! He matches this background colour I'm s...   
2       000113f07ec002fd  Hey man, I'm really not trying to edit war. It...   
3       0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...   
4       0001d958c54c6e35  You, sir, are my hero. Any chance you remember...   
...                  ...                                                ...   
159566  ffe987279560d7ff  ":::::And for the second time of asking, when ...   
159567  ffea4adeee384e90  You should be ashamed of yourself \n\nThat is ...   
159568  ffee36eab5c267c9  Spitzer \n\nUmm, theres no actual article for ...   
159569  fff125370e4aaaf3  And it looks like it was actually you who put ...   
159570  fff46fc426af1f9a  "\nAnd ... I really don't think you understand...   

        toxic  severe_toxic  obscene  threat  insul

In [11]:
# Draw 10 000 non-toxic comments (fixed seed) from the external dataset;
# the non_toxic flag (always 1 here) becomes the label column.
non_toxic_comments = train_data_external.loc[train_data_external['non_toxic'] == 1]
copied_non_toxic_comments = (
    non_toxic_comments[['non_toxic', 'comment_text']]
    .rename(columns={'non_toxic': 'label'})
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)
print(copied_non_toxic_comments)

      label                                       comment_text
0         1  "\n\nOh, don't worry about me, Sandstein. I'm ...
1         1               Are you trying to dispute that fact?
2         1  SWOT analysis \n\nThis source – Align Technolo...
3         1  cover \n\nso, do we want a current or older co...
4         1  P.S. It's probably worth setting up a template...
...     ...                                                ...
9995      1  Because you read it in the Splinter Cell wiki?...
9996      1  Do you have a source other than your opinion f...
9997      1            REDIRECT Talk:River Rescue (video game)
9998      1  I do not blame you. I was basically gang raped...
9999      1  "\nNancy Pelosi is a high ranking official of ...

[10000 rows x 2 columns]


In [13]:
# Append the sampled non-toxic comments to obtain the final training frame
final_train_data = pd.concat(
    objs=[merged_train_data, copied_non_toxic_comments],
    axis=0,
    ignore_index=True,
)
print(final_train_data)

       label                                       comment_text
0          1                                              why .
1          1              I still love you so much just priva .
2          1                         I wish you every success .
3          0  She may or may not be a Jew but she 's certain...
4          1  I 'm just pointing out our version of mercy an...
...      ...                                                ...
32780      1  Because you read it in the Splinter Cell wiki?...
32781      1  Do you have a source other than your opinion f...
32782      1            REDIRECT Talk:River Rescue (video game)
32783      1  I do not blame you. I was basically gang raped...
32784      1  "\nNancy Pelosi is a high ranking official of ...

[32785 rows x 2 columns]


In [14]:
# Class balance of the final training frame
distribution_final = final_train_data.loc[:, 'label'].value_counts()
print(distribution_final)

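# Dataset now has roughly balanced toxic (0) and non-toxic (1) comments.
# Note: the rows are only concatenated here, not shuffled; train_test_split shuffles them later.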

label
1    16654
0    16131
Name: count, dtype: int64


# Text Cleaning
* Lowercase
* Expanding contractions
* Removing URLs
* Removing non-ASCII characters
* Removing special characters (symbols & emojis)
* Removing HTML
* Removing escape characters
* Collapsing runs of repeated punctuation and whitespace
* Removing stop words

In [15]:
# Work on deep copies so the merged/raw frames stay untouched by text cleaning
cleaned_train_data = final_train_data.copy(deep=True)
cleaned_test_data = test_data.copy(deep=True)

In [16]:
# Write the merged training frame to disk (without the index column).
# NOTE(review): despite the file name, clean_text has not been applied yet at
# this point; the same path is written again after the cleaning step.
cleaned_train_data.to_csv('cleaned_train_data.csv', index=False)

In [27]:
# Intalling the contractions package
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_6

In [28]:
import contractions
from nltk.corpus import stopwords

# Build the English stop-word set once. On an environment where the NLTK
# stop-word corpus is not installed, the lookup raises LookupError; download
# the corpus and retry so the cell also runs on a fresh kernel.
try:
    stop = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop = set(stopwords.words('english'))

# Function which performs text cleaning
def clean_text(text):
    """Normalise one raw comment into lower-case, ASCII-only, stop-word-free text.

    Steps, in order: lower-casing, contraction expansion, URL removal,
    non-ASCII removal, HTML tag/entity removal, escape-character and quote
    clean-up, collapsing of punctuation runs, removal of remaining special
    characters, whitespace collapsing and stop-word filtering.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example the
            ``NaN`` pandas produces for an empty CSV cell) is treated as an
            empty comment instead of raising ``AttributeError``.

    Returns:
        The cleaned comment as a single space-separated string; ``''`` when
        the input is not a string.
    """
    # Empty CSV cells arrive as float NaN; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    # Lowercase
    text = text.lower()
    # Expand contractions
    text = contractions.fix(text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove non-ASCII characters. This also strips every emoji / pictograph,
    # so no separate emoji pattern is needed afterwards (all emoji code points
    # lie above 0x7F and are already gone at that point).
    text = re.sub(r'[^\x00-\x7f]', '', text)
    # Remove HTML tags and character entities
    text = re.sub(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", "", text)
    # Replace escape characters with a space
    text = re.sub(r'[\n\t\r\a]', ' ', text)
    # Replacing "" with "
    text = re.sub(r"\"\"", "\"", text)
    # Removing quotation from start and the end of the string
    text = re.sub(r"^\"", "", text)
    text = re.sub(r"\"$", "", text)
    # Replace any run of two or more consecutive punctuation / special
    # characters (;:'".?@!%&*+ ...) with a single space
    text = re.sub(r"[^a-zA-Z0-9\s][^a-zA-Z0-9\s]+", " ", text)
    # Removing special characters other than basic punctuation
    text = re.sub(r"[^a-zA-Z0-9\s\"\',:;?!.()]", " ", text)
    # Removing extra spaces in text
    text = re.sub(r"\s\s+", " ", text)
    # Remove stop words (split/join also trims leading and trailing whitespace)
    text = ' '.join(word for word in text.split() if word not in stop)
    return text

In [29]:
# Run clean_text over the 'comment_text' column of both the train and test frames
for _frame in (cleaned_train_data, cleaned_test_data):
    _frame['comment_text'] = _frame['comment_text'].apply(clean_text)

In [30]:
# Persist both cleaned frames as CSV files (index column omitted)
cleaned_train_data.to_csv(path_or_buf='cleaned_train_data.csv', index=False)
cleaned_test_data.to_csv(path_or_buf='cleaned_test_data.csv', index=False)

# Model Training

In [41]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [42]:
# Training hyperparameters
MAX_LEN, TRAIN_BATCH_SIZE = 128, 16  # tokens per comment (padded/truncated), samples per batch
EPOCHS = 3                           # full passes over the training split
LEARNING_RATE = 2e-5                 # AdamW step size

In [43]:
# Hold out 15% of the cleaned training data for validation (fixed seed)
train_df, val_df = train_test_split(
    cleaned_train_data,
    test_size=0.15,
    random_state=42,
)

print('Training data shape:', train_df.shape)
print('Validation data shape:', val_df.shape)

Training data shape: (27867, 2)
Validation data shape: (4918, 2)


In [44]:
class DistilBERT_Model(nn.Module):
    """Pretrained DistilBERT encoder followed by dropout and a linear classification head."""

    def __init__(self, num_labels):
        """Load the 'distilbert-base-uncased' encoder and build the head.

        Args:
            num_labels: Size of the logits vector produced per input.
        """
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.distilbert.config.hidden_size
        self.linear = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch.

        The hidden state at the first sequence position is used as the
        representation of the whole input.
        """
        encoder_out = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoder_out.last_hidden_state[:, 0]
        return self.linear(self.dropout(first_token_state))

In [45]:
# Creating Custom Dataset class for Toxic comments and Labels
class ToxicDataset(Dataset):
    """Dataset over a DataFrame of comments that tokenises each row on access.

    Each item is a dict with 'input_ids', 'attention_mask' and 'label'.
    With ``eval_mode=True`` the 'label' column is not read and a placeholder
    label of 0 is returned instead.
    """

    def __init__(self, data, tokenizer, max_length, eval_mode: bool = False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.eval_mode = eval_mode

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = row['comment_text']
        # Unlabelled (evaluation) rows get a dummy label so batches keep one shape
        label = 0 if self.eval_mode else row['label']

        # Tokenise, then pad/truncate to exactly max_length tokens
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The encoded tensors are flattened to 1-D before being returned
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

In [46]:
# Initializing model (two output classes) and the matching DistilBERT tokenizer
model = DistilBERT_Model(num_labels=2)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [47]:
# Wrap each frame in a ToxicDataset; the test frame is built in eval mode
# because no labels are read for it.
train_dataset = ToxicDataset(data=train_df, tokenizer=tokenizer, max_length=MAX_LEN)
val_dataset = ToxicDataset(data=val_df, tokenizer=tokenizer, max_length=MAX_LEN)
test_output_set = ToxicDataset(
    data=cleaned_test_data,
    tokenizer=tokenizer,
    max_length=MAX_LEN,
    eval_mode=True,
)

# Only the training loader shuffles; validation and test keep row order
train_loader = DataLoader(dataset=train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False)
test_output_loader = DataLoader(dataset=test_output_set, batch_size=TRAIN_BATCH_SIZE, shuffle=False)

In [48]:
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Validation accuracy is not guaranteed to improve with every epoch, so a CPU
# copy of the best-scoring epoch's weights is kept and restored after training.
# Without this, the weights of the last epoch would be saved and used for
# prediction even when an earlier epoch validated better.
best_val_accuracy = float('-inf')
best_state_dict = None

# Training loop
for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

    # Validation evaluation after each epoch
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    # Guard against an empty validation loader instead of dividing by zero
    val_accuracy = correct_val / total_val if total_val else 0.0
    print(f'Epoch {epoch+1}/{EPOCHS}, Validation Accuracy: {val_accuracy:.4f}')

    # Snapshot the weights whenever this epoch is the best one so far
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Continue with the best-validating weights rather than the last epoch's
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    print(f'Restored weights of best epoch (Validation Accuracy: {best_val_accuracy:.4f})')

Epoch 1/3, Validation Accuracy: 0.9298
Epoch 2/3, Validation Accuracy: 0.9215
Epoch 3/3, Validation Accuracy: 0.9205


In [49]:
# Saving model
# The whole module object is passed to torch.save (not just its state_dict).
# NOTE(review): a file saved this way presumably needs the DistilBERT_Model
# class to be available when it is loaded; saving model.state_dict() is the
# more portable option - confirm what downstream loaders expect before changing.
torch.save(model,"dsbert_toxic_balanced.pt")

In [50]:
# Inference on the test data (no labels are used here)
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_output_loader:
        batch_ids = batch['input_ids'].to(device)
        batch_mask = batch['attention_mask'].to(device)

        logits = model(batch_ids, attention_mask=batch_mask)
        # Predicted class = index of the largest logit in each row
        batch_pred = torch.argmax(logits, dim=1)
        test_predictions.extend(batch_pred.cpu().numpy())

# Pair each prediction with its test id under the 'ID' / 'Label' headers
test_ids = cleaned_test_data['id']
predictions_df = pd.DataFrame({'ID': test_ids, 'Label': test_predictions})

# Save predictions to CSV
predictions_df.to_csv(path_or_buf='distilbert_nn2.csv', index=False)