In [None]:
import os

import joblib
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler

In [3]:
# =========================================================
# Step 1: Setup device and load data
# =========================================================
# Choose the accelerator that is actually present. Each backend is probed with
# its own availability check so the selected device name always matches a
# backend that exists on this machine: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"âœ… Using device: {device}")

# Training split; the path is relative, so it resolves against the kernel's
# current working directory.
train_path = "../Data/Processed/train.csv"
train_df = pd.read_csv(filepath_or_buffer=train_path)

âœ… Using device: mps


In [4]:

# =========================================================
# Step 2: Tokenization
# =========================================================
# Single checkpoint identifier: the tokenizer below, the classifier in the
# model-setup step, and the saved metadata all read this constant.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
)

class HateSpeechDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        texts: Sequence of raw texts (list, tuple, pandas Series, ...).
        labels: Sequence of integer-convertible labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors="pt")`` and returning a mapping of
            tensors that carry a leading batch dimension of size 1.
        max_len: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    @staticmethod
    def _value_at(values, position):
        """Return the element at integer ``position`` of ``values``.

        A pandas Series resolves ``series[i]`` by index *label*, which breaks
        (KeyError or wrong row) whenever the index is not 0..n-1, e.g. after
        filtering or a train/test split. Samplers hand out positions, so use
        ``.iloc`` when the container offers it and plain indexing otherwise.
        """
        positional = getattr(values, "iloc", None)
        if positional is not None:
            return positional[position]
        return values[position]

    def __getitem__(self, idx):
        """Tokenize the example at position ``idx``.

        Returns:
            dict: The tokenizer's outputs with the batch dimension squeezed
            away, plus ``"labels"`` holding the label as an int64 scalar
            tensor.
        """
        text = str(self._value_at(self.texts, idx))
        label = int(self._value_at(self.labels, idx))
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        # return_tensors="pt" yields shape (1, max_len); drop the batch axis so
        # the DataLoader's default collation can stack examples itself.
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

# Wrap the text/label columns in the dataset, then batch them in shuffled
# groups of eight.
dataset = HateSpeechDataset(
    texts=train_df["text"],
    labels=train_df["label"],
    tokenizer=tokenizer,
)
train_loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=8,
)

In [None]:

# =========================================================
# Step 3: Model setup
# =========================================================
# Two-label sequence classifier loaded from the same checkpoint as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# The scheduler is stepped once per batch in the training loop, so its horizon
# is (batches per epoch) x (number of epochs).
num_epochs = 2
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)


In [None]:

# =========================================================
# Step 4: Training loop
# =========================================================
# Training mode is enabled once up front; nothing inside the loop changes it.
model.train()
for epoch in range(num_epochs):
    print(f"\nðŸ§  Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    loop = tqdm(train_loader, leave=False)

    for batch in loop:
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch carries "labels", so the model output exposes the loss.
        outputs = model(**batch)
        loss = outputs.loss
        batch_loss = loss.item()
        total_loss += batch_loss

        # One optimisation step: backprop, update weights, advance the
        # learning-rate schedule, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Keep the progress bar labelled with the epoch and the latest loss.
        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=batch_loss)

    # Mean of the per-batch losses seen during this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"âœ… Epoch {epoch + 1} average loss: {avg_loss:.4f}")

In [None]:
# =========================================================
# Step 5: Save model + tokenizer
# =========================================================
# Relative to the kernel's working directory; created on demand, and an
# already-existing directory is reused rather than treated as an error.
output_dir = "./Distilbert/hate_speech_distilbert"
os.makedirs(output_dir, exist_ok=True)

# Model and tokenizer files go into the same directory.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Record which base checkpoint the fine-tuned model started from.
model_info_path = os.path.join(output_dir, "model_info.pkl")
joblib.dump({"model": MODEL_NAME}, model_info_path)

print(f"\nâœ… Model and tokenizer saved to: {output_dir}")