In [None]:
import numpy as np
import pandas as pd

#data visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

#to avoid warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
# Read the raw train and test CSV files into DataFrames (paths are relative to the working directory)
train_df, test_df = map(pd.read_csv, ("data/raw/train.csv", "data/raw/test.csv"))


In [None]:
# Every column from position 2 onward is treated as a label column
column_labels = list(train_df.columns[2:])

# Per-label column totals, smallest first
label_totals = train_df.loc[:, column_labels].sum()
label_totals.sort_values()

In [None]:
# Class distribution of the `toxic` column before any rebalancing
toxic_counts = train_df['toxic'].value_counts()
print(toxic_counts)

In [None]:
# Partition the training rows by the `toxic` flag
toxic_flag = train_df['toxic']
train_toxic = train_df[toxic_flag == 1]
train_non_toxic = train_df[toxic_flag == 0]

# Undersample the non-toxic rows so both groups have the same number of rows
train_non_toxic_sampled = train_non_toxic.sample(
    n=train_toxic.shape[0],
    random_state=42,
)

# Stack both groups, shuffle the rows with a fixed seed, and rebuild a clean 0..n-1 index
train_balanced = (
    pd.concat([train_toxic, train_non_toxic_sampled], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Confirm balance
print(train_balanced['toxic'].value_counts())


In [None]:
# Show the column index of the balanced frame
balanced_columns = train_balanced.columns
print(balanced_columns)

In [None]:
# Inputs are the raw comment strings; targets are every column from position 2 onward
comment_series = train_balanced['comment_text']
label_frame = train_balanced.iloc[:, 2:]

# Keep 75% for training and hold out 25% (named `test_*` here) with a fixed seed
train_texts, test_texts, train_labels, test_labels = train_test_split(
    comment_series, label_frame, test_size=0.25, random_state=42)



In [None]:
# Halve the hold-out produced above: one half for validation, the other as the final test set
holdout_halves = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)
val_texts, final_test_texts, val_labels, final_test_labels = holdout_halves


Tokenization and Encoding

In [None]:
def tokenize_and_encode(tokenizer, comments, labels, max_length=128):
    """Batch-encode comments with `tokenizer` and turn `labels` into a float tensor.

    Args:
        tokenizer: Callable invoked with the list of comments plus the encoding
            options below; its result is indexed by 'input_ids' and 'attention_mask'.
        comments: Object exposing `.tolist()` (e.g. a pandas Series of strings).
        labels: Object exposing `.values` (e.g. a pandas DataFrame of label columns).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A tuple `(input_ids, attention_mask, labels)` where the first two come
        straight from the tokenizer output and `labels` is a `torch.float32` tensor.
    """
    # Fixed-length encoding: pad up to `max_length`, truncate beyond it, return PyTorch tensors
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # One call for the whole batch of comments
    encoded = tokenizer(comments.tolist(), **encode_options)

    # Labels are stored as float32
    label_tensor = torch.tensor(labels.values, dtype=torch.float32)

    return encoded['input_ids'], encoded['attention_mask'], label_tensor


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Device Setup (GPU if available); chosen first so the model can be moved as soon as it is loaded
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize Tokenizer from the pretrained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Initialize Model with 6 output labels, configured for multi-label classification,
# and place it on the selected device
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    problem_type="multi_label_classification",
).to(device)

# Tokenize and encode each split into (input_ids, attention_mask, labels) tensors
train_input_ids, train_attention_masks, train_labels_tensor = tokenize_and_encode(
    tokenizer, train_texts, train_labels)

val_input_ids, val_attention_masks, val_labels_tensor = tokenize_and_encode(
    tokenizer, val_texts, val_labels)

# The test tensors must come from the *final* test split. `test_texts`/`test_labels`
# are the full 25% hold-out that was subsequently divided into validation and final
# test, so encoding them here would put every validation row into the test set too.
test_input_ids, test_attention_masks, test_labels_tensor = tokenize_and_encode(
    tokenizer, final_test_texts, final_test_labels)

# Check Shapes: the raw training comments and the three encoded training tensors
shape_report = (
    ('Training Comments :', train_texts),
    ('Input Ids         :', train_input_ids),
    ('Attention Mask    :', train_attention_masks),
    ('Labels            :', train_labels_tensor),
)
for caption, item in shape_report:
    print(caption, item.shape)


In [None]:
# Persist the balanced training frame to the working directory, without the index column
train_balanced.to_csv(path_or_buf="train_balanced.csv", index=False)


Creating PyTorch Data Loaders

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Number of samples per batch, shared by all three loaders
batch_size = 32

# Wrap each split's (input_ids, attention_mask, labels) tensors in a dataset
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels_tensor)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels_tensor)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels_tensor)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


In [None]:
# Pull a single training batch and print its first example as a sanity check
print('Batch Size :', train_loader.batch_size)

batch = next(iter(train_loader))

# Each batch carries the three tensors in dataset order: ids, attention mask, labels
batch_input_ids, batch_attention_masks, batch_labels = batch
first_input_ids = batch_input_ids[0]

print('Each Input ids shape      :', batch_input_ids.shape)
print('Input ids [0]             :\n', first_input_ids)
print('Decoded text [0]          :\n', tokenizer.decode(first_input_ids, skip_special_tokens=True))
print('Corresponding Attention Mask:\n', batch_attention_masks[0])
print('Corresponding Label [0]   :', batch_labels[0])


In [None]:
# AdamW over all model parameters with learning rate 2e-5 and epsilon 1e-8
optimizer = AdamW(
    params=model.parameters(),
    lr=2e-5,
    eps=1e-8,
)


Model Training

In [None]:
# NOTE(review): every line of this training cell is commented out, so running it does nothing.
# # Function to Train the Model
# def train_model(model, train_loader, optimizer, device, num_epochs):
#     # Loop through the specified number of epochs
#     for epoch in range(num_epochs):
#         # Set the model to training mode
#         model.train()
#         # Initialize total loss for the current epoch
#         total_loss = 0

#         # Loop through the batches in the training data
#         for batch in train_loader:
#             input_ids, attention_mask, labels = [t.to(device) for t in batch]

#             optimizer.zero_grad()

#             outputs = model(
#                 input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             total_loss += loss.item()

#             loss.backward()
#             optimizer.step()

#         model.eval()  # Set the model to evaluation mode
#         val_loss = 0

#         # Disable gradient computation during validation
#         with torch.no_grad():
#             for batch in val_loader:
#                 input_ids, attention_mask, labels = [
#                     t.to(device) for t in batch]

#                 outputs = model(
#                     input_ids, attention_mask=attention_mask, labels=labels)
#                 loss = outputs.loss
#                 val_loss += loss.item()
#         # Print the average loss for the current epoch
#         print(
#             f'Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader)},Validation loss:{val_loss/len(val_loader)}')


# # Call the function to train the model
# train_model(model, train_loader, optimizer, device, num_epochs=3)