In [None]:
import pandas as pd
import torch
import ast
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from pathlib import Path


In [59]:
input_file = "../data/toxic_comments_cleaned.csv"
output_file = "../data/processed.csv"
vocab_file = '../data/vocab.csv'

In [60]:
df = pd.read_csv(output_file)
vocab = pd.read_csv(vocab_file)
vocab = dict(zip(vocab['word'], vocab['id']))

In [61]:
df.head(4)

Unnamed: 0,id,comment_text,target,cleaned_comment,tokenized_comment,sequence
0,0000997932d777bf,explanation why the edits made under my userna...,0.0,explan edit made usernam hardcor metallica fan...,"['explan', 'edit', 'made', 'usernam', 'hardcor...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,000103f0d9cfb60f,daww he matches this background colour im seem...,0.0,daww match background colour im seemingli stuc...,"['daww', 'match', 'background', 'colour', 'im'...","[27, 28, 29, 30, 25, 31, 32, 33, 22, 34, 35, 3..."
2,000113f07ec002fd,hey man im really not trying to edit war its j...,0.0,hey man im realli tri edit war guy constantli ...,"['hey', 'man', 'im', 'realli', 'tri', 'edit', ...","[38, 39, 25, 40, 41, 2, 42, 43, 44, 20, 45, 46..."
3,0001b41b1c6bb37e,more i cant make any real suggestions on impr...,0.0,cant make real suggest improv wonder section s...,"['cant', 'make', 'real', 'suggest', 'improv', ...","[53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6..."


In [62]:
vocab["[PAD]"]

0

In [63]:

# Get the current working directory
BASE_DIR = Path().resolve().parent

# Construct relative path to the data file
data_path = BASE_DIR / "data" / "processed.csv"

# Load the DataFrame
df = pd.read_csv(data_path)


# Load dataset
df['sequence'] = df['sequence'].apply(ast.literal_eval)
df['target'] = df['target'].astype(int)

# Split

def train_val_test_split(df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    assert abs(train_size + val_size + test_size - 1.0) < 1e-5, "Splits must add to 1"
    
    train_val, test = train_test_split(df, test_size=test_size, random_state=random_state)
    train, val = train_test_split(train_val, test_size=val_size / (train_size + val_size), random_state=random_state)
    return train, val, test

train_data, val_data, test_data = train_val_test_split(df, train_size=0.8, val_size=0.1, test_size=0.1)



In [64]:
class TextDataset(Dataset):
    def __init__(self, dataframe, pad_len=100, pad_value=0):
        self.sequences = dataframe['sequence'].apply(
            lambda x: x[:pad_len] + [pad_value] * (pad_len - len(x)) if len(x) < pad_len else x[:pad_len] # padding
        ).tolist()
        self.labels = dataframe['target'].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx): # conversion to tensor
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float32)


In [65]:
PAD_IDX = vocab["[PAD]"]
MAXLEN = 100

train_dataset = TextDataset(train_data, pad_len=MAXLEN, pad_value=PAD_IDX)
val_dataset   = TextDataset(val_data, pad_len=MAXLEN, pad_value=PAD_IDX)
test_dataset  = TextDataset(test_data, pad_len=MAXLEN, pad_value=PAD_IDX)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=32)
test_loader  = DataLoader(test_dataset, batch_size=32)


In [66]:
### To test how padding is applied


# Select a few examples to inspect
sample_seqs = train_data['sequence'].iloc[:3].tolist()

# Function to pad
def pad_sequence(seq, pad_len, pad_value=PAD_IDX):
    if len(seq) < pad_len:
        return seq + [pad_value] * (pad_len - len(seq))
    else:
        return seq[:pad_len]

# Show comparison
for i, seq in enumerate(sample_seqs):
    padded = pad_sequence(seq, MAXLEN, pad_value=PAD_IDX)
    print(f"🔹 Original (len={len(seq)}): {seq}")
    print(f"🔸 Padded   (len={len(padded)}): {padded}")
    print("-" * 80)


🔹 Original (len=5): [1694, 1764, 10598, 590, 6806]
🔸 Padded   (len=100): [1694, 1764, 10598, 590, 6806, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------------------------------------
🔹 Original (len=23): [1035, 2010, 323, 65, 95, 265, 896, 903, 79, 611, 977, 325, 2010, 1035, 3378, 260, 877, 457, 755, 23, 68, 113, 1094]
🔸 Padded   (len=100): [1035, 2010, 323, 65, 95, 265, 896, 903, 79, 611, 977, 325, 2010, 1035, 3378, 260, 877, 457, 755, 23, 68, 113, 1094, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
----------------------

In [67]:
import torch.nn as nn
import torch.nn.functional as F

class LightweightDNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(LightweightDNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_IDX)
        self.fc1 = nn.Linear(embedding_dim, 16)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, x):
        embedded = self.embedding(x)              # [batch_size, seq_len, embed_dim]
        pooled = embedded.mean(dim=1)             # Global average pooling
        x = F.relu(self.fc1(pooled))
        return torch.sigmoid(self.fc2(x)).squeeze(1)

In [68]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LightweightDNN(vocab_size=len(vocab), embedding_dim=64).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        out = model(batch_x)
        loss = criterion(out, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 49.7449
Epoch 2, Loss: 33.8140
Epoch 3, Loss: 24.9441
Epoch 4, Loss: 22.7092
Epoch 5, Loss: 21.2391
