In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Load the dataset
file_path = "cleaned_train.csv"
df = pd.read_csv(file_path)

# Display the first few rows and information about the dataset
print("Dataset Preview:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

Dataset Preview:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  \
0             0        0       0       0              0   
1             0        0       0       0              0   
2             0        0       0       0              0   
3             0        0       0       0              0   
4             0        0       0       0              0   

                                cleaned_comment_text  \
0  explanation why the edits made under my userna...   
1  daww he matches this backg

In [3]:
# Check if required columns exist
label_check_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
required_columns = ['processed_comment_text'] + label_check_columns
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in dataset: {missing_columns}")

# Check if 'processed_comment_text' contains valid tokenized data
print("\nPreview of processed_comment_text:")
print(df['processed_comment_text'].head())

# Check if labels are binary.
# A column sum alone cannot reveal values such as 2, -1 or NaN, so the check is
# done explicitly; the sums are still printed as a quick view of class balance.
non_binary_columns = [col for col in label_check_columns if not df[col].isin([0, 1]).all()]
if non_binary_columns:
    raise ValueError(f"Label columns with non-binary values: {non_binary_columns}")

print("\nSum of label columns (to confirm binary values):")
print(df[label_check_columns].sum())


Preview of processed_comment_text:
0    explanation edits made username hardcore metal...
1    daww match background colour im seemingly stuc...
2    hey man im really trying edit war guy constant...
3    cant make real suggestion improvement wondered...
4                  sir hero chance remember page thats
Name: processed_comment_text, dtype: object

Sum of label columns (to confirm binary values):
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64


In [4]:
from collections import Counter

# Tokenize and build vocabulary
def tokenize(texts):
    """Split texts on whitespace and map every word to an integer id.

    Ids are assigned in order of first appearance, starting at 1; id 0 is
    never assigned so that callers can use it for padding.

    Args:
        texts: Any iterable of strings (list, pandas Series, generator, ...).
            It is consumed exactly once.

    Returns:
        A tuple ``(sequences, word_to_idx)`` where ``sequences`` is a list with
        one list of word ids per input text and ``word_to_idx`` is the
        word -> id dictionary that was built.
    """
    # Split once and keep the result: the input may be a one-shot iterable,
    # and re-splitting every text for the second pass is wasted work.
    split_texts = [text.split() for text in texts]

    word_to_idx = {}
    for words in split_texts:
        for word in words:
            if word not in word_to_idx:
                # Next free id; the +1 keeps 0 reserved for padding.
                word_to_idx[word] = len(word_to_idx) + 1

    return [[word_to_idx[word] for word in words] for words in split_texts], word_to_idx

# Tokenize the processed_comment_text
print("Tokenizing text...")
# Missing comments are replaced by the literal word "unknown" so that every
# row is a string before it is split into words.
for text_column in ('processed_comment_text', 'cleaned_comment_text'):
    df[text_column] = df[text_column].fillna("unknown")
tokenized_texts, word_to_idx = tokenize(df['processed_comment_text'])

# Pad sequences to ensure equal length
max_seq_length = 100  # Adjust this based on your dataset or task
# Appending a full row of zeros and then cutting at max_seq_length both
# truncates long sequences and right-pads short ones with id 0.
zero_padding = [0] * max_seq_length
padded_texts = [(seq + zero_padding)[:max_seq_length] for seq in tokenized_texts]

# Add tokenized and padded texts back to the dataframe
df['tokenized_comment_text'] = padded_texts

print("Tokenization and padding complete!")

Tokenizing text...
Tokenization and padding complete!


In [5]:
# Padded token-id sequences (model inputs) and the matching per-row label vectors.
texts = df['tokenized_comment_text'].tolist()
# Deliberately not called `labels`: elsewhere in this notebook `labels` holds
# the list of label column names and `len(labels)` sizes the model outputs, so
# binding the per-row vectors to that name would corrupt those sizes whenever
# this cell is re-run out of order.
label_matrix = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()

# 80/20 split with a fixed seed so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, label_matrix, test_size=0.2, random_state=42
)

In [6]:
# Hold out 20% of the rows for scoring the baseline.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# TF-IDF Vectorization: the vocabulary and IDF weights are fitted on the
# training rows only and then applied unchanged to the held-out rows.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_data['processed_comment_text'])
X_test_tfidf = vectorizer.transform(test_data['processed_comment_text'])

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
metrics_lr = {}

# Metric name -> scoring function, in the order the columns are reported.
scorers = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'precision': precision_score,
    'recall': recall_score,
}

# Train a Logistic Regression model for each label (one binary problem per column)
for label in labels:
    y_train, y_test = train_data[label], test_data[label]

    model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    metrics_lr[label] = {metric: score(y_test, y_pred) for metric, score in scorers.items()}

# Display metrics for the baseline model, one row per label
metrics_df_lr = pd.DataFrame([dict(Class=name, **metrics_lr[name]) for name in labels])
print("\nBaseline Logistic Regression Metrics:")
print(metrics_df_lr)


Baseline Logistic Regression Metrics:
           Class  accuracy        f1  precision    recall
0          toxic  0.957888  0.740240   0.904155  0.626636
1   severe_toxic  0.990694  0.366738   0.581081  0.267913
2        obscene  0.977283  0.751286   0.912500  0.638484
3         threat  0.997807  0.186047   0.666667  0.108108
4         insult  0.970359  0.644628   0.818702  0.531599
5  identity_hate  0.991759  0.283379   0.712329  0.176871


In [7]:
from torch.utils.data import Dataset, DataLoader
import torch

# Define PyTorch Dataset class
class ToxicCommentDataset(Dataset):
    """In-memory dataset pairing token-id sequences with label vectors.

    Both inputs are converted to tensors once, at construction time, so
    indexing afterwards is a plain tensor lookup.

    Args:
        texts: Nested sequence of integer token ids; stored as a ``torch.long`` tensor.
        labels: Nested sequence of label values; stored as a ``torch.float32`` tensor.
    """

    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (length of the token tensor)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label_vector)`` pair stored at ``idx``."""
        sample_tokens = self.texts[idx]
        sample_labels = self.labels[idx]
        return sample_tokens, sample_labels

# Create DataLoaders
loader_batch_size = 64
train_dataset = ToxicCommentDataset(train_texts, train_labels)
val_dataset = ToxicCommentDataset(val_texts, val_labels)

# Only the training data is shuffled; validation keeps its original order.
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size, shuffle=False)

print("DataLoaders created successfully!")

DataLoaders created successfully!


In [None]:
import torch.nn as nn

class IndependentProbabilitiesModel(nn.Module):
    """LSTM classifier that emits one independent sigmoid probability per label.

    The input is a batch of token-id sequences of shape ``(batch, seq_len)``.
    Each sequence is embedded, run through a batch-first LSTM, and the LSTM
    output at the last *real* token is passed through dropout and a linear
    layer followed by a sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        hidden_dim: LSTM hidden size.
        output_dim: Number of output probabilities.
        num_layers: Number of stacked LSTM layers.
        padding_idx: Token id used to right-pad sequences (default 0). Its
            embedding is kept at zero and is not trained, and padded positions
            are skipped when picking the sequence summary. Pass ``None`` to
            read the final timestep regardless of padding.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def _last_token_state(self, x, output):
        """Pick, per row, the LSTM output at the last non-padding position."""
        if self.padding_idx is None:
            return output[:, -1, :]
        # Assumes padding only appears at the end of a sequence, so the count
        # of non-padding tokens is also the sequence length.
        lengths = (x != self.padding_idx).sum(dim=1)
        # Rows made only of padding fall back to position 0 instead of -1.
        last_idx = (lengths - 1).clamp(min=0)
        rows = torch.arange(x.size(0), device=x.device)
        return output[rows, last_idx]

    def forward(self, x):
        """Return a ``(batch, output_dim)`` tensor of probabilities in (0, 1)."""
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        summary = self.dropout(self._last_token_state(x, output))
        return torch.sigmoid(self.fc(summary))  # Independent probabilities

class JointProbabilitiesModel(nn.Module):
    """LSTM classifier that emits a softmax distribution over ``output_dim`` classes.

    The input is a batch of token-id sequences of shape ``(batch, seq_len)``.
    Each sequence is embedded, run through a batch-first LSTM, and the LSTM
    output at the last *real* token is passed through dropout and a linear
    layer followed by a softmax.

    Note that ``forward`` returns probabilities, not logits: a loss that
    applies its own (log-)softmax, such as ``nn.CrossEntropyLoss``, should not
    be fed this output directly.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        hidden_dim: LSTM hidden size.
        output_dim: Number of classes in the output distribution.
        num_layers: Number of stacked LSTM layers.
        padding_idx: Token id used to right-pad sequences (default 0). Its
            embedding is kept at zero and is not trained, and padded positions
            are skipped when picking the sequence summary. Pass ``None`` to
            read the final timestep regardless of padding.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def _last_token_state(self, x, output):
        """Pick, per row, the LSTM output at the last non-padding position."""
        if self.padding_idx is None:
            return output[:, -1, :]
        # Assumes padding only appears at the end of a sequence, so the count
        # of non-padding tokens is also the sequence length.
        lengths = (x != self.padding_idx).sum(dim=1)
        # Rows made only of padding fall back to position 0 instead of -1.
        last_idx = (lengths - 1).clamp(min=0)
        rows = torch.arange(x.size(0), device=x.device)
        return output[rows, last_idx]

    def forward(self, x):
        """Return a ``(batch, output_dim)`` tensor whose rows sum to 1."""
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        summary = self.dropout(self._last_token_state(x, output))
        return torch.softmax(self.fc(summary), dim=-1)  # Joint probabilities

In [15]:
# Parameters
embed_dim, hidden_dim, num_layers = 128, 256, 2
vocab_size = 1 + len(word_to_idx)  # Vocabulary size + 1 for padding
num_label_columns = len(labels)
output_dim_independent = num_label_columns  # Number of labels (6)
output_dim_joint = 2 ** num_label_columns  # Number of joint label configurations (64)

# Initialize models; both share the embedding / LSTM sizes and differ only in
# the width of the output layer.
shared_dims = (vocab_size, embed_dim, hidden_dim)
independent_model = IndependentProbabilitiesModel(*shared_dims, output_dim_independent, num_layers)
joint_model = JointProbabilitiesModel(*shared_dims, output_dim_joint, num_layers)

# Move models to device (GPU when one is available, CPU otherwise)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
for network in (independent_model, joint_model):
    network.to(device)

print("Models initialized and moved to device:", device)

Models initialized and moved to device: cuda


In [16]:
# Define training function for the independent model
def train_independent_model(model, data_loader, optimizer, criterion, device):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Note: a later cell in this notebook redefines this function with an extra
    ``epoch`` argument and progress printing.

    Args:
        model: Module called as ``model(inputs)``.
        data_loader: Iterable of ``(inputs, targets)`` batches that also supports ``len()``.
        optimizer: Optimizer holding ``model``'s parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for inputs, targets in data_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()

        # Cap the global gradient norm at 5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(data_loader)

# Define training function for the joint model
def train_joint_model(model, data_loader, optimizer, criterion, device):
    """Run one training pass for the joint model and return the mean batch loss.

    Each batch's label matrix is reduced to one class index per row with
    ``argmax`` over the label axis before the loss is computed.

    Note: a later cell in this notebook redefines this function with an extra
    ``epoch`` argument and progress printing.

    NOTE(review): ``argmax`` keeps a single index per row, so a row with
    several labels set is reduced to one of them and an all-zero row
    presumably maps to index 0 as well. Confirm this is the intended target
    encoding for a model with ``2 ** num_labels`` outputs.

    Args:
        model: Module called as ``model(inputs)``.
        data_loader: Iterable of ``(inputs, label_matrix)`` batches that also supports ``len()``.
        optimizer: Optimizer holding ``model``'s parameters.
        criterion: Callable ``criterion(outputs, class_indices)`` returning a scalar loss tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for inputs, label_matrix in data_loader:
        inputs = inputs.to(device)
        label_matrix = label_matrix.to(device)
        # Convert multi-label to single index
        class_targets = label_matrix.argmax(dim=1)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), class_targets)
        batch_loss.backward()

        # Cap the global gradient norm at 5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(data_loader)

In [17]:
# Define training function for the independent model
def train_independent_model(model, data_loader, optimizer, criterion, device, epoch):
    """Run one training pass over ``data_loader``, logging progress, and return the mean batch loss.

    Args:
        model: Module called as ``model(inputs)``.
        data_loader: Iterable of ``(inputs, targets)`` batches that also supports ``len()``.
        optimizer: Optimizer holding ``model``'s parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss tensor.
        device: Device the batches are moved to before the forward pass.
        epoch: Zero-based epoch number; it is shown one-based in the header line.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    print(f"\nTraining Independent Probabilities Model - Epoch {epoch+1}")
    for step, (inputs, targets) in enumerate(data_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()

        # Cap the global gradient norm at 5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        running_loss += batch_loss.item()

        # Print progress for every 10 batches (batches are counted from 1)
        completed = step + 1
        if not completed % 10:
            print(f"Batch {completed}/{len(data_loader)}, Loss: {batch_loss.item():.4f}")
    return running_loss / len(data_loader)


# Define training function for the joint model
def train_joint_model(model, data_loader, optimizer, criterion, device, epoch):
    """Run one training pass for the joint model, logging progress, and return the mean batch loss.

    Each batch's label matrix is reduced to one class index per row with
    ``argmax`` over the label axis before the loss is computed.

    NOTE(review): ``argmax`` keeps a single index per row, so a row with
    several labels set is reduced to one of them and an all-zero row
    presumably maps to index 0 as well. Confirm this is the intended target
    encoding for a model with ``2 ** num_labels`` outputs.

    Args:
        model: Module called as ``model(inputs)``.
        data_loader: Iterable of ``(inputs, label_matrix)`` batches that also supports ``len()``.
        optimizer: Optimizer holding ``model``'s parameters.
        criterion: Callable ``criterion(outputs, class_indices)`` returning a scalar loss tensor.
        device: Device the batches are moved to before the forward pass.
        epoch: Zero-based epoch number; it is shown one-based in the header line.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    print(f"\nTraining Joint Probability Model - Epoch {epoch+1}")
    for step, (inputs, label_matrix) in enumerate(data_loader):
        inputs = inputs.to(device)
        label_matrix = label_matrix.to(device)
        # Convert multi-label to single index
        class_targets = label_matrix.argmax(dim=1)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), class_targets)
        batch_loss.backward()

        # Cap the global gradient norm at 5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        running_loss += batch_loss.item()

        # Print progress for every 10 batches (batches are counted from 1)
        completed = step + 1
        if not completed % 10:
            print(f"Batch {completed}/{len(data_loader)}, Loss: {batch_loss.item():.4f}")
    return running_loss / len(data_loader)

In [18]:
import torch.optim.lr_scheduler as lr_scheduler

# Optimizers and Loss Functions
independent_optimizer = optim.Adam(independent_model.parameters(), lr=0.001)
joint_optimizer = optim.Adam(joint_model.parameters(), lr=0.001)

epochs = 10
# LinearLR positional arguments are (start_factor, end_factor, total_iters):
# the learning-rate multiplier moves linearly from 1.0 to 0.5 / 0.8 over `epochs` steps.
independent_lr = lr_scheduler.LinearLR(independent_optimizer, 1.0, 0.5, epochs)
joint_lr = lr_scheduler.LinearLR(joint_optimizer, 1.0, 0.8, epochs)

bce_loss = nn.BCELoss()  # Binary Cross-Entropy Loss for independent model

# Cross-Entropy Loss for joint model.
# The joint model's forward() already applies softmax, while nn.CrossEntropyLoss
# expects raw logits and applies log-softmax itself. Feeding it probabilities
# would normalise twice, which flattens the loss and its gradients. The
# cross-entropy is therefore computed as NLL on the log of the probabilities.
_joint_nll_loss = nn.NLLLoss()


def cross_entropy_loss(probabilities, targets):
    """Cross-entropy between class probabilities and integer class targets.

    Args:
        probabilities: ``(batch, num_classes)`` tensor whose rows are probabilities.
        targets: ``(batch,)`` tensor of class indices.

    Returns:
        Scalar loss tensor (mean over the batch).
    """
    # Clamp before the log so a probability that underflowed to 0 cannot
    # produce -inf / NaN gradients.
    return _joint_nll_loss(torch.log(probabilities.clamp_min(1e-12)), targets)


# Training loop with clear separation
for epoch in range(epochs):
    print("=" * 50)
    print(f"Starting Epoch {epoch+1} of Training\n")

    # Train Independent Model
    ind_loss = train_independent_model(independent_model, train_loader, independent_optimizer, bce_loss, device, epoch)
    independent_lr.step()  # schedulers are stepped once per epoch, after the optimizer updates
    print(f"\nEpoch {epoch+1} - Independent Model Training Complete. Loss: {ind_loss:.4f}")

    # Train Joint Model
    joint_loss = train_joint_model(joint_model, train_loader, joint_optimizer, cross_entropy_loss, device, epoch)
    joint_lr.step()
    print(f"\nEpoch {epoch+1} - Joint Model Training Complete. Loss: {joint_loss:.4f}")

    print("=" * 50)

Starting Epoch 1 of Training


Training Independent Probabilities Model - Epoch 1
Batch 10/1995, Loss: 0.1033
Batch 20/1995, Loss: 0.0842
Batch 30/1995, Loss: 0.1213
Batch 40/1995, Loss: 0.1194
Batch 50/1995, Loss: 0.1518
Batch 60/1995, Loss: 0.0767
Batch 70/1995, Loss: 0.1651
Batch 80/1995, Loss: 0.1530
Batch 90/1995, Loss: 0.0475
Batch 100/1995, Loss: 0.1063
Batch 110/1995, Loss: 0.1393
Batch 120/1995, Loss: 0.1938
Batch 130/1995, Loss: 0.2092
Batch 140/1995, Loss: 0.1707
Batch 150/1995, Loss: 0.1319
Batch 160/1995, Loss: 0.1122
Batch 170/1995, Loss: 0.1732
Batch 180/1995, Loss: 0.1831
Batch 190/1995, Loss: 0.0949
Batch 200/1995, Loss: 0.1206
Batch 210/1995, Loss: 0.1711
Batch 220/1995, Loss: 0.0763
Batch 230/1995, Loss: 0.1392
Batch 240/1995, Loss: 0.0974
Batch 250/1995, Loss: 0.1206
Batch 260/1995, Loss: 0.1571
Batch 270/1995, Loss: 0.1612
Batch 280/1995, Loss: 0.1075
Batch 290/1995, Loss: 0.1589
Batch 300/1995, Loss: 0.0973
Batch 310/1995, Loss: 0.0705
Batch 320/1995, Loss: 0.1504

In [19]:
# Function to compute evaluation metrics
def compute_metrics(model, data_loader, device, independent=True):
    """Evaluate ``model`` on ``data_loader`` and return exact-match accuracy plus micro-averaged F1, precision and recall.

    Predictions are turned into a 0/1 matrix with one column per label:

    * ``independent=True``: each output is thresholded at 0.5.
    * ``independent=False``: the arg-max class of each row is one-hot encoded
      over the label columns. A class index that is not smaller than the
      number of label columns has no matching column and is reported as
      "no label predicted"; previously such an index raised an out-of-range
      error.

    NOTE(review): the one-hot decoding treats the predicted class as the
    index of a single label column. For a model with ``2 ** num_labels``
    outputs this presumably is not the intended meaning of a class index.
    Confirm against the target encoding used during training.

    Args:
        model: Module called as ``model(inputs)``.
        data_loader: Iterable of ``(inputs, label_matrix)`` batches that also supports ``len()``.
        device: Device the batches are moved to before the forward pass.
        independent: Selects the decoding described above.

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    all_preds, all_labels = [], []
    print("\nEvaluating Model...")
    with torch.no_grad():
        for batch_idx, (texts, labels) in enumerate(data_loader, start=1):
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            if independent:
                preds = (outputs > 0.5).float()  # Binary predictions for independent model
            else:
                class_idx = torch.argmax(outputs, dim=1)
                num_labels = labels.size(1)
                # Convert to multi-label format. Rows start as all zeros and
                # only classes that map onto a label column set a 1, so an
                # out-of-range class index cannot trigger an indexing error.
                preds = torch.zeros(class_idx.size(0), num_labels, device=class_idx.device)
                valid_rows = (class_idx < num_labels).nonzero(as_tuple=True)[0]
                preds[valid_rows, class_idx[valid_rows]] = 1.0
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

            # Progress log
            if batch_idx % 10 == 0:
                print(f"Processed {batch_idx}/{len(data_loader)} batches")

    if not all_preds:
        raise ValueError("data_loader yielded no batches; nothing to evaluate")

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    return {
        'accuracy': accuracy_score(all_labels, all_preds),
        'f1': f1_score(all_labels, all_preds, average='micro'),
        'precision': precision_score(all_labels, all_preds, average='micro'),
        'recall': recall_score(all_labels, all_preds, average='micro')
    }

# Compute and print metrics: independent model first (thresholded outputs),
# then the joint model (arg-max class decoding).
ind_metrics = compute_metrics(independent_model, val_loader, device, independent=True)
print("\nIndependent Model Metrics:", ind_metrics, sep="\n")

joint_metrics = compute_metrics(joint_model, val_loader, device, independent=False)
print("\nJoint Model Metrics:", joint_metrics, sep="\n")


Evaluating Model...
Processed 10/499 batches
Processed 20/499 batches
Processed 30/499 batches
Processed 40/499 batches
Processed 50/499 batches
Processed 60/499 batches
Processed 70/499 batches
Processed 80/499 batches
Processed 90/499 batches
Processed 100/499 batches
Processed 110/499 batches
Processed 120/499 batches
Processed 130/499 batches
Processed 140/499 batches
Processed 150/499 batches
Processed 160/499 batches
Processed 170/499 batches
Processed 180/499 batches
Processed 190/499 batches
Processed 200/499 batches
Processed 210/499 batches
Processed 220/499 batches
Processed 230/499 batches
Processed 240/499 batches
Processed 250/499 batches
Processed 260/499 batches
Processed 270/499 batches
Processed 280/499 batches
Processed 290/499 batches
Processed 300/499 batches
Processed 310/499 batches
Processed 320/499 batches
Processed 330/499 batches
Processed 340/499 batches
Processed 350/499 batches
Processed 360/499 batches
Processed 370/499 batches
Processed 380/499 batches
