In [1]:
!pip install --upgrade sympy
!pip install --upgrade transformers




In [6]:
import re
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EvalPrediction
)

# ---------------------------
# 1. Define text cleaning function
# ---------------------------
def clean_text(text):
    """Preprocess text: Lowercase, remove punctuation, special characters, extra spaces."""
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# ---------------------------
# 2. Load and preprocess the dataset
# ---------------------------
df = pd.read_csv("youtoxic_english_1000.csv")
df['clean_text'] = df['Text'].apply(clean_text)

# Define the toxicity label columns
label_cols = [
    "IsToxic", "IsAbusive", "IsThreat", "IsProvocative", "IsObscene",
    "IsHatespeech", "IsRacist", "IsNationalist", "IsSexist",
    "IsHomophobic", "IsReligiousHate", "IsRadicalism"
]

# Ensure labels are integers.
for col in label_cols:
    df[col] = df[col].astype(int)

# Multi-label targets: each row has 12 binary labels.
labels = df[label_cols].values.tolist()

# ---------------------------
# 3. Split into train and validation sets
# ---------------------------
X_train, X_val, y_train, y_val = train_test_split(
    df['clean_text'].tolist(), labels, test_size=0.2, random_state=42
)

# ---------------------------
# 4. Prepare the Hugging Face tokenizer and model
# ---------------------------
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Set the number of labels and initialize the model.
num_labels = len(label_cols)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# IMPORTANT: Set the model to use multi-label classification loss.
model.config.problem_type = "multi_label_classification"

# ---------------------------
# 5. Create a custom PyTorch Dataset
# ---------------------------
class ToxicDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Remove extra batch dimension and convert labels to tensor of floats.
        item = {key: encoding[key].squeeze() for key in encoding}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

max_length = 128
train_dataset = ToxicDataset(X_train, y_train, tokenizer, max_length)
val_dataset = ToxicDataset(X_val, y_val, tokenizer, max_length)

# Data collator to handle dynamic padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ---------------------------
# 6. Define a compute_metrics function
# ---------------------------
def compute_metrics(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    # For multi-label classification, apply sigmoid then threshold at 0.5.
    probs = torch.sigmoid(torch.tensor(logits))
    predictions = (probs >= 0.5).int().numpy()
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

# ---------------------------
# 7. Set up training arguments and Trainer
# ---------------------------
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=15,               # Adjust as needed.
    per_device_train_batch_size=8,    # Adjust based on your available memory.
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# ---------------------------
# 8. Fine-tune the model
# ---------------------------
print("Starting training...")
trainer.train()
print("Training complete.")

# ---------------------------
# 9. Save the fine-tuned model and tokenizer
# ---------------------------
model_save_path = "toxic_bert_classifier"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to '{model_save_path}'.")

# ---------------------------
# 10. Load the saved model and tokenizer (using Option 2)
# ---------------------------
loaded_model = BertForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = BertTokenizer.from_pretrained(model_save_path)
loaded_model.config.problem_type = "multi_label_classification"
loaded_model.eval()  # Set model to evaluation mode

# ---------------------------
# 11. Define a helper function to classify new comments using the loaded model/tokenizer
# ---------------------------
def classify_comment(text, threshold=0.5):
    """
    Cleans the text, tokenizes, and uses the loaded model to predict toxicity scores.
    Returns "Positive" if no toxicity label is triggered; otherwise, returns the list of triggered labels.
    """
    cleaned = clean_text(text)
    inputs = loaded_tokenizer(cleaned, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    logits = outputs.logits[0]
    probs = torch.sigmoid(logits).detach().numpy()
    triggered = [label_cols[i] for i, p in enumerate(probs) if p >= threshold]
    if not triggered:
        return "Positive"
    else:
        return "Negative: " + ", ".join(triggered)

# ---------------------------
# 12. Test the classifier with some sample comments
# ---------------------------
sample_comments = [
    "I love this video, it's amazing!",
    "This is the worst, you are idiots and trash!",
    "I think this content is sexist and racist, very offensive.",
    "Great work, keep it up!",
    "I hate this content.",
    "This content is garbage",
    "Fuck you"
]

print("\n--- Testing on sample comments ---")
for comment in sample_comments:
    result = classify_comment(comment)
    print(f"Comment: \"{comment}\"\nClassification: {result}\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.232,0.28541,0.872083
2,0.2115,0.242326,0.90375
3,0.1742,0.22483,0.907083
4,0.107,0.216807,0.917083
5,0.0922,0.247216,0.911667
6,0.0589,0.270436,0.910833
7,0.0353,0.238481,0.919583
8,0.0411,0.264097,0.915
9,0.0326,0.27743,0.914583
10,0.0295,0.277211,0.913333


Training complete.
Model and tokenizer saved to 'toxic_bert_classifier'.

--- Testing on sample comments ---
Comment: "I love this video, it's amazing!"
Classification: Positive

Comment: "This is the worst, you are idiots and trash!"
Classification: Negative: IsToxic, IsAbusive

Comment: "I think this content is sexist and racist, very offensive."
Classification: Positive

Comment: "Great work, keep it up!"
Classification: Positive

Comment: "I hate this content."
Classification: Positive

Comment: "This content is garbage"
Classification: Negative: IsToxic, IsAbusive

Comment: "Fuck you"
Classification: Negative: IsToxic, IsAbusive

