Step 1: Preprocess Jigsaw Dataset (Clean, Balance, Split, Tokenize)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, RobertaTokenizer, DistilBertTokenizer
from datasets import Dataset

# Load the Jigsaw training data from the Kaggle input directory
data_path = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
df = pd.read_csv(data_path)

# Clean dataset: keep one row per distinct comment text, keep only the text
# and the binary "toxic" target, and drop rows where either value is missing
df = df.drop_duplicates(subset=["comment_text"])
df = df[["comment_text", "toxic"]].dropna()

# Balance classes by sampling as many non-toxic rows as there are toxic rows
# (sampling is without replacement, so this assumes non-toxic rows are the majority)
toxic_df = df[df["toxic"] == 1]
non_toxic_df = df[df["toxic"] == 0].sample(n=len(toxic_df), random_state=42)
balanced_df = pd.concat([toxic_df, non_toxic_df]).sample(frac=1, random_state=42)

# Split: 70% train, 15% val, 15% test.
# Stratify on the target so every split keeps the 50/50 class ratio created above
# instead of drifting with the random draw.
train_df, temp_df = train_test_split(
    balanced_df, test_size=0.3, random_state=42, stratify=balanced_df["toxic"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df["toxic"]
)

# Rename 'toxic' to 'labels' for Trainer compatibility
train_df = train_df.rename(columns={"toxic": "labels"})
val_df = val_df.rename(columns={"toxic": "labels"})
test_df = test_df.rename(columns={"toxic": "labels"})

# Convert to Hugging Face Dataset.
# The frames carry a shuffled, non-default index; preserve_index=False keeps it
# from being stored as an extra column in the saved datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenization function
def tokenize_dataset(dataset, tokenizer, max_length=64, text_column="comment_text"):
    """Tokenize one text column of a dataset in batches.

    Args:
        dataset: Object exposing ``map(function, batched=True)``; here a
            Hugging Face ``Dataset``.
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_length: Every example is padded or truncated to this many tokens.
        text_column: Name of the column holding the raw text. Defaults to
            ``"comment_text"`` so existing callers are unaffected.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    def tokenize_function(examples):
        # Fixed-length padding gives every example the same sequence length.
        return tokenizer(
            examples[text_column],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

# Pretrained tokenizers, one per model family
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the train/val/test splits once per tokenizer
_splits = (train_dataset, val_dataset, test_dataset)
train_bert, val_bert, test_bert = [
    tokenize_dataset(_split, bert_tokenizer) for _split in _splits
]
train_roberta, val_roberta, test_roberta = [
    tokenize_dataset(_split, roberta_tokenizer) for _split in _splits
]
train_distilbert, val_distilbert, test_distilbert = [
    tokenize_dataset(_split, distilbert_tokenizer) for _split in _splits
]

# Write every tokenized split to the Kaggle working directory
for _tokenized, _target in (
    (train_bert, "/kaggle/working/preprocessed/train_bert"),
    (val_bert, "/kaggle/working/preprocessed/val_bert"),
    (test_bert, "/kaggle/working/preprocessed/test_bert"),
    (train_roberta, "/kaggle/working/preprocessed/train_roberta"),
    (val_roberta, "/kaggle/working/preprocessed/val_roberta"),
    (test_roberta, "/kaggle/working/preprocessed/test_roberta"),
    (train_distilbert, "/kaggle/working/preprocessed/train_distilbert"),
    (val_distilbert, "/kaggle/working/preprocessed/val_distilbert"),
    (test_distilbert, "/kaggle/working/preprocessed/test_distilbert"),
):
    _tokenized.save_to_disk(_target)

Step 2: Fine-Tune BERT, DistilBERT, RoBERTa with Validation Set

In [None]:
from transformers import (BertForSequenceClassification, RobertaForSequenceClassification, 
                          DistilBertForSequenceClassification, Trainer, TrainingArguments)
import torch

# Reload the tokenized train/validation splits from the working directory
train_bert, val_bert = (
    Dataset.load_from_disk(_path)
    for _path in (
        "/kaggle/working/preprocessed/train_bert",
        "/kaggle/working/preprocessed/val_bert",
    )
)
train_roberta, val_roberta = (
    Dataset.load_from_disk(_path)
    for _path in (
        "/kaggle/working/preprocessed/train_roberta",
        "/kaggle/working/preprocessed/val_roberta",
    )
)
train_distilbert, val_distilbert = (
    Dataset.load_from_disk(_path)
    for _path in (
        "/kaggle/working/preprocessed/train_distilbert",
        "/kaggle/working/preprocessed/val_distilbert",
    )
)

# Pretrained checkpoints with a two-class classification head
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)
roberta_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2
)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

# Training arguments shared by all three fine-tuning runs:
# evaluate and checkpoint once per epoch, then restore the checkpoint
# selected by eval_loss
_training_config = {
    "output_dir": "/kaggle/working/models",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "report_to": "none",
}
training_args = TrainingArguments(**_training_config)

# Fine-tune function
def fine_tune_model(model, train_dataset, val_dataset, model_name):
    """Train ``model`` with the module-level ``training_args`` and save it.

    Args:
        model: Model instance handed to ``Trainer``.
        train_dataset: Dataset used for training.
        val_dataset: Dataset passed to ``Trainer`` as ``eval_dataset``.
        model_name: Sub-directory of ``/kaggle/working/models`` to save into.

    Returns:
        The ``Trainer`` that ran the training.
    """
    save_dir = f"/kaggle/working/models/{model_name}"
    runner = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    runner.train()
    runner.save_model(save_dir)
    return runner

# Fine-tune the three models one after another
bert_trainer = fine_tune_model(
    model=bert_model,
    train_dataset=train_bert,
    val_dataset=val_bert,
    model_name="bert_finetuned",
)
roberta_trainer = fine_tune_model(
    model=roberta_model,
    train_dataset=train_roberta,
    val_dataset=val_roberta,
    model_name="roberta_finetuned",
)
distilbert_trainer = fine_tune_model(
    model=distilbert_model,
    train_dataset=train_distilbert,
    val_dataset=val_distilbert,
    model_name="distilbert_finetuned",
)

# Store each tokenizer next to its fine-tuned model so both load from one path
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
for _tokenizer, _model_dir in (
    (bert_tokenizer, "/kaggle/working/models/bert_finetuned"),
    (roberta_tokenizer, "/kaggle/working/models/roberta_finetuned"),
    (distilbert_tokenizer, "/kaggle/working/models/distilbert_finetuned"),
):
    _tokenizer.save_pretrained(_model_dir)

Step 3: Evaluate on Test Set and Document Results

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from transformers import BertForSequenceClassification, RobertaForSequenceClassification, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Reload the tokenized test splits
test_bert, test_roberta, test_distilbert = (
    Dataset.load_from_disk(_path)
    for _path in (
        "/kaggle/working/preprocessed/test_bert",
        "/kaggle/working/preprocessed/test_roberta",
        "/kaggle/working/preprocessed/test_distilbert",
    )
)

# Reload the fine-tuned models saved by the training step
bert_model = BertForSequenceClassification.from_pretrained(
    "/kaggle/working/models/bert_finetuned"
)
roberta_model = RobertaForSequenceClassification.from_pretrained(
    "/kaggle/working/models/roberta_finetuned"
)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(
    "/kaggle/working/models/distilbert_finetuned"
)

# Arguments for the evaluation-only Trainers below (no training is started here)
_eval_config = {
    "output_dir": "/kaggle/working/models",
    "eval_strategy": "epoch",
    "per_device_eval_batch_size": 8,
    "report_to": "none",
}
training_args = TrainingArguments(**_eval_config)

# Compute metrics
def compute_metrics(pred):
    """Score predictions against their labels.

    Args:
        pred: Object with ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the argmax over the last axis is the
            predicted class).

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scorers = (
        ("accuracy", accuracy_score),
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers}

# One evaluation-only Trainer per model; the test split is passed as eval_dataset
bert_trainer = Trainer(
    model=bert_model, args=training_args,
    eval_dataset=test_bert, compute_metrics=compute_metrics,
)
roberta_trainer = Trainer(
    model=roberta_model, args=training_args,
    eval_dataset=test_roberta, compute_metrics=compute_metrics,
)
distilbert_trainer = Trainer(
    model=distilbert_model, args=training_args,
    eval_dataset=test_distilbert, compute_metrics=compute_metrics,
)

# Run evaluation (BERT, then RoBERTa, then DistilBERT)
bert_eval, roberta_eval, distilbert_eval = [
    _trainer.evaluate()
    for _trainer in (bert_trainer, roberta_trainer, distilbert_trainer)
]

# Collect one row per model, one column per metric, and write the table to CSV
_evals = (bert_eval, roberta_eval, distilbert_eval)
results = {"Model": ["BERT", "RoBERTa", "DistilBERT"]}
for _column, _metric_key in (
    ("Accuracy", "eval_accuracy"),
    ("F1", "eval_f1"),
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
):
    results[_column] = [_eval[_metric_key] for _eval in _evals]
results_df = pd.DataFrame(results)
results_df.to_csv("/kaggle/working/model_performance.csv", index=False)
print(results_df)

Step 4: Test on Curated Toxic/Non-Toxic Examples

In [None]:
import torch
from transformers import BertTokenizer, RobertaTokenizer, DistilBertTokenizer

# Tokenizers saved alongside the fine-tuned models
bert_tokenizer = BertTokenizer.from_pretrained(
    "/kaggle/working/models/bert_finetuned"
)
roberta_tokenizer = RobertaTokenizer.from_pretrained(
    "/kaggle/working/models/roberta_finetuned"
)
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(
    "/kaggle/working/models/distilbert_finetuned"
)

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
for _model in (bert_model, roberta_model, distilbert_model):
    _model.to(device)

# Hand-written examples with their expected label (1 = toxic, 0 = non-toxic)
examples = [
    dict(text="You’re a complete idiot!", label=1),
    dict(text="Nice work, well done!", label=0),
    dict(text="Die, you worthless scum.", label=1),
    dict(text="This is a great discussion.", label=0),
]

# Prediction function (ensure inputs are on the same device as model)
def predict_toxicity(model, tokenizer, text, device):
    """Classify a single text and return its predicted class and toxic probability.

    Args:
        model: Sequence-classification model whose output exposes ``logits``;
            it is expected to already live on ``device``.
        tokenizer: Tokenizer matching ``model``.
        text: The text to classify.
        device: Device the tokenized inputs are moved to before the forward pass.

    Returns:
        ``(pred, toxic_prob)``: the argmax class index and the softmax
        probability of class 1.
    """
    # Inference must not depend on the mode the model was left in:
    # switch to eval mode so dropout is disabled.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # same device as the model
    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    pred = probs.argmax(-1).item()
    return pred, probs[0][1].item()

# Run every curated example through all three models and print the outcome
_candidates = [
    ("BERT", bert_model, bert_tokenizer),
    ("RoBERTa", roberta_model, roberta_tokenizer),
    ("DistilBERT", distilbert_model, distilbert_tokenizer),
]
for example in examples:
    text = example["text"]
    true_label = example["label"]
    print(f"\nText: {text}")
    for model_name, model, tokenizer in _candidates:
        pred, prob = predict_toxicity(model, tokenizer, text, device)
        print(f"{model_name}: Predicted={pred}, Toxic Prob={prob:.4f}, True={true_label}")

Step 5: Prepare Models for Web App (Test Inference, Optimize)

In [None]:
from transformers import pipeline
import time

# Build a text-classification pipeline around the fine-tuned DistilBERT model
classifier = pipeline(
    "text-classification",
    model="/kaggle/working/models/distilbert_finetuned",
    tokenizer=distilbert_tokenizer,
)

# Time one prediction for each sample text
for _sample in ("You’re a disgusting human being!", "Great job, keep it up!"):
    start_time = time.time()
    result = classifier(_sample)
    end_time = time.time()
    print(f"Prediction: {result}, Latency: {end_time - start_time:.4f}s")

Testing with YouTube Comments

In [None]:
# Testing on the YouTube comments dataset

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import Dataset

# Step 1: Load CSV file from Kaggle environment
csv_file_path = "/kaggle/input/youtube-toxicity-data/youtoxic_english_1000.csv"  # Replace with your actual CSV file path
comments_df = pd.read_csv(csv_file_path)

# Everything below reads the "Text" column. Fail fast with a message that
# names the column actually checked, instead of continuing into a less
# helpful KeyError at the first column access.
if "Text" not in comments_df.columns:
    raise KeyError(
        f"Column 'Text' not found. Available columns: {list(comments_df.columns)}"
    )
print(f"Loaded CSV from {csv_file_path} with {len(comments_df)} comments.")

# Step 2: Convert to Hugging Face Dataset
comments_dataset = Dataset.from_pandas(comments_df[["Text"]])

# Step 3: Tokenizers for the three fine-tuned models
bert_tokenizer = BertTokenizer.from_pretrained("/kaggle/working/models/bert_finetuned")
roberta_tokenizer = RobertaTokenizer.from_pretrained("/kaggle/working/models/roberta_finetuned")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("/kaggle/working/models/distilbert_finetuned")

# Step 4: Tokenization function
def tokenize_dataset(dataset, tokenizer, max_length=64):
    """Tokenize the ``"Text"`` column of ``dataset`` in batches.

    Args:
        dataset: Object exposing ``map(function, batched=True)``.
        tokenizer: Callable tokenizer applied to each batch of texts.
        max_length: Every example is padded or truncated to this many tokens.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }

    def encode_batch(batch):
        return tokenizer(batch["Text"], **tokenizer_options)

    return dataset.map(encode_batch, batched=True)

# Tokenize the comments once per tokenizer
comments_bert, comments_roberta, comments_distilbert = [
    tokenize_dataset(comments_dataset, _tokenizer)
    for _tokenizer in (bert_tokenizer, roberta_tokenizer, distilbert_tokenizer)
]

# Step 5: Load the fine-tuned models
bert_model = BertForSequenceClassification.from_pretrained(
    "/kaggle/working/models/bert_finetuned"
)
roberta_model = RobertaForSequenceClassification.from_pretrained(
    "/kaggle/working/models/roberta_finetuned"
)
distilbert_model = DistilBertForSequenceClassification.from_pretrained(
    "/kaggle/working/models/distilbert_finetuned"
)

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move every model to the selected device
for _model in (bert_model, roberta_model, distilbert_model):
    _model.to(device)

# Step 6: Predict toxicity function
def predict_toxicity(model, tokenizer, dataset, device, batch_size=32):
    """Predict toxicity for every comment in ``dataset["Text"]``.

    The raw text column is tokenized here, batch by batch; any
    pre-tokenized columns in ``dataset`` are not read.

    Args:
        model: Sequence-classification model whose output exposes ``logits``;
            it is expected to already live on ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Mapping-like object whose ``"Text"`` entry holds the comments.
        device: Device the tokenized inputs are moved to before the forward pass.
        batch_size: Number of comments per forward pass.

    Returns:
        ``(predictions, probabilities)``: two lists aligned with the input
        order, holding the argmax class index and the softmax probability
        of class 1 for each comment.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()  # disable dropout for inference
    predictions = []
    probabilities = []
    comments = list(dataset["Text"])
    for start in range(0, len(comments), batch_size):
        batch = comments[start:start + batch_size]
        # padding=True pads to the longest comment in this batch; the attention
        # mask returned by the tokenizer marks the padded positions.
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=64)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        predictions.extend(probs.argmax(-1).tolist())
        probabilities.extend(probs[:, 1].tolist())
    return predictions, probabilities

# Step 7: Run each model over the comments
print("Predicting with BERT...")
bert_preds, bert_probs = predict_toxicity(
    bert_model, bert_tokenizer, comments_bert, device
)

print("Predicting with RoBERTa...")
roberta_preds, roberta_probs = predict_toxicity(
    roberta_model, roberta_tokenizer, comments_roberta, device
)

print("Predicting with DistilBERT...")
distilbert_preds, distilbert_probs = predict_toxicity(
    distilbert_model, distilbert_tokenizer, comments_distilbert, device
)

# Step 8: Put the comment text and every model's outputs side by side, then save
results_df = pd.DataFrame(
    dict(
        Comment=comments_df["Text"],
        BERT_Pred=bert_preds,
        BERT_Toxic_Prob=bert_probs,
        RoBERTa_Pred=roberta_preds,
        RoBERTa_Toxic_Prob=roberta_probs,
        DistilBERT_Pred=distilbert_preds,
        DistilBERT_Toxic_Prob=distilbert_probs,
    )
)

results_csv_path = "/kaggle/working/combined_model_predictions_YTD.csv"
results_df.to_csv(results_csv_path, index=False)
print(f"Predictions saved to {results_csv_path}")
_preview = results_df.head(10)  # first 10 rows for inspection
print(_preview)
