In [1]:
!pip install -q transformers accelerate datasets scikit-learn pandas

import os
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Use a CUDA GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer passed unchanged to every generator (default 42).

    The CUDA generators are seeded only when ``torch.cuda.is_available()``
    reports a usable GPU.
    """
    # Same order as always: stdlib first, then NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once, up front, with the notebook-wide default.
set_seed(42)

# --- Load the labelled dataset ------------------------------------------------
DATA_FILE = "final_10k_dataset_cleaned.csv"

# Only open the interactive upload widget when the CSV is not already on disk,
# so re-running the cell (or "Run all") does not block waiting for a manual
# upload. Delete the local file to force a fresh upload.
uploaded = {}
if not os.path.exists(DATA_FILE):
    # Imported lazily: `google.colab` only exists inside a Colab runtime.
    from google.colab import files
    uploaded = files.upload()

df = pd.read_csv(DATA_FILE)

# Fail here, with a clear message, rather than later in the training cell:
# "final_category" is used right below and "text" is read when the datasets
# are built.
missing_columns = {"text", "final_category"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{DATA_FILE} is missing required column(s): {sorted(missing_columns)}"
    )

# --- Encode the string categories as contiguous integer ids -------------------
# Sorting makes the label -> id assignment independent of row order.
unique_labels = sorted(df["final_category"].unique())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
df["label_id"] = df["final_category"].map(label2id)
num_labels = len(label2id)

# --- 80/20 train/validation split, stratified on the label id -----------------
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label_id"],
)

class ToxicDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Iterable of raw inputs; each one is passed through ``str()``
            before tokenization.
        labels: Iterable of class ids, one per text; each is passed through
            ``int()`` when an item is fetched.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    # Tokenizer outputs that are forwarded to the model, in output order.
    _ENCODED_KEYS = ("input_ids", "attention_mask")

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Materialise both iterables so they support len() and integer indexing.
        self.texts = list(texts)
        self.labels = list(labels)

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its label tensor."""
        raw_text = str(self.texts[idx])
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes the leading dimension of each returned tensor.
        item = {key: encoded[key].squeeze(0) for key in self._ENCODED_KEYS}
        item["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

# Fine-tuning hyperparameters shared by every checkpoint trained below.
max_length = 256   # forwarded to the tokenizer via ToxicDataset
lr = 2e-5          # learning rate handed to the optimizer
epochs = 5         # number of train + validation passes per checkpoint
batch_size = 16    # batch size for both the train and validation loaders

# Hugging Face checkpoint names to fine-tune, in training order.
model_list = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "microsoft/deberta-v3-base",
]


Saving final_10k_dataset_cleaned.csv to final_10k_dataset_cleaned.csv


In [2]:
def run_epoch(model, loader, optimizer=None, scheduler=None, train=True):
    """Run one pass over ``loader`` and report loss, accuracy and macro-F1.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            the returned object must expose ``.loss`` and ``.logits``.
        loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer stepped once per batch. Required when ``train``
            is true; ignored otherwise.
        scheduler: Optional learning-rate scheduler, stepped once per batch
            right after the optimizer. Ignored when ``train`` is false.
        train: When true, the model is put in training mode and its weights
            are updated; when false, the model is put in eval mode and
            gradient tracking is disabled.

    Returns:
        Tuple ``(avg_loss, accuracy, macro_f1)``. ``avg_loss`` is the batch
        loss weighted by batch size and divided by the number of samples
        actually processed.

    Raises:
        ValueError: If ``train`` is true but no optimizer was given, or if
            ``loader`` yields no samples.
    """
    # Reject the inconsistent default combination up front, before any
    # forward pass has been paid for.
    if train and optimizer is None:
        raise ValueError("run_epoch(train=True) requires an optimizer")

    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    n_samples = 0
    all_labels = []
    all_preds = []

    loop = tqdm(loader)

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        batch_len = input_ids.size(0)

        with torch.set_grad_enabled(train):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits

            if train:
                optimizer.zero_grad()
                loss.backward()
                # Clip the global gradient norm to 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

        # Weight by batch size so a smaller final batch is not over-counted.
        # NOTE(review): assumes outputs.loss is a per-batch mean - confirm.
        total_loss += loss.item() * batch_len
        n_samples += batch_len
        preds = torch.argmax(logits, dim=-1)
        all_labels.extend(labels.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

    if n_samples == 0:
        raise ValueError("run_epoch received a loader that produced no samples")

    # Divide by the samples seen rather than len(loader.dataset): the two
    # differ when the loader drops the last batch or uses a sampler.
    avg_loss = total_loss / n_samples
    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average="macro")

    return avg_loss, acc, macro_f1


# Imported once here instead of on every loop iteration.
import json

# Fine-tune every checkpoint in `model_list` on the same train/validation
# split, print per-epoch metrics, and save the model, tokenizer and label map.
for model_name in model_list:
    print(f"\nTraining model: {model_name}\n")

    # Re-seed before each model so its weight initialisation and batch
    # shuffling do not depend on how much randomness earlier models consumed,
    # and so re-running this cell reproduces the same runs.
    set_seed(42)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = ToxicDataset(
        train_df["text"],
        train_df["label_id"],
        tokenizer,
        max_length=max_length
    )

    val_dataset = ToxicDataset(
        val_df["text"],
        val_df["label_id"],
        tokenizer,
        max_length=max_length
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # The scheduler is stepped once per batch, so its horizon is counted in
    # batches; the first 10% of those steps are warmup.
    total_steps = len(train_loader) * epochs
    warmup_steps = int(0.1 * total_steps)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")

        train_loss, train_acc, train_f1 = run_epoch(
            model, train_loader, optimizer=optimizer, scheduler=scheduler, train=True
        )
        print(f"Train Loss {train_loss:.4f} Acc {train_acc:.4f} F1 {train_f1:.4f}")

        val_loss, val_acc, val_f1 = run_epoch(
            model, val_loader, optimizer=None, scheduler=None, train=False
        )
        print(f"Val Loss {val_loss:.4f} Acc {val_acc:.4f} F1 {val_f1:.4f}")

    # The weights saved are those from the final epoch.
    # "/" in a hub name is replaced so the name is a single directory.
    save_dir = f"exp3_{model_name.replace('/', '_')}"
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    with open(os.path.join(save_dir, "label2id.json"), "w") as f:
        json.dump(label2id, f, indent=2)

    print(f"Saved model to {save_dir}")



Training model: distilbert-base-uncased



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.7657 Acc 0.3814 F1 0.3652


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.2905 Acc 0.5245 F1 0.4997
Epoch 2/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.0747 Acc 0.6191 F1 0.6096


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1417 Acc 0.5950 F1 0.5804
Epoch 3/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.8281 Acc 0.7019 F1 0.6968


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1456 Acc 0.5970 F1 0.5870
Epoch 4/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.6501 Acc 0.7716 F1 0.7689


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1656 Acc 0.5975 F1 0.5940
Epoch 5/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.5472 Acc 0.8109 F1 0.8088


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1790 Acc 0.6060 F1 0.5998
Saved model to exp3_distilbert-base-uncased

Training model: bert-base-uncased



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.8563 Acc 0.3368 F1 0.3225


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.2888 Acc 0.5410 F1 0.5049
Epoch 2/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.0667 Acc 0.6158 F1 0.6074


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1367 Acc 0.5880 F1 0.5801
Epoch 3/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.7732 Acc 0.7192 F1 0.7161


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1313 Acc 0.6040 F1 0.5981
Epoch 4/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.5659 Acc 0.8009 F1 0.7991


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1593 Acc 0.6115 F1 0.6074
Epoch 5/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.4398 Acc 0.8552 F1 0.8544


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1881 Acc 0.6120 F1 0.6068
Saved model to exp3_bert-base-uncased

Training model: microsoft/deberta-v3-base



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.7903 Acc 0.3299 F1 0.3362


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.3336 Acc 0.5155 F1 0.4960
Epoch 2/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.0864 Acc 0.5927 F1 0.5866


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1467 Acc 0.5855 F1 0.5734
Epoch 3/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.8510 Acc 0.6756 F1 0.6714


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.2109 Acc 0.5895 F1 0.5702
Epoch 4/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.6760 Acc 0.7465 F1 0.7435


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1518 Acc 0.6165 F1 0.6101
Epoch 5/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.5582 Acc 0.7960 F1 0.7936


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1833 Acc 0.6105 F1 0.6038
Saved model to exp3_microsoft_deberta-v3-base


In [3]:
# Mount Google Drive so the checkpoints can be copied there in the next cell.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
import shutil

# Destination folder in Google Drive; created on first use.
drive_path = "/content/drive/MyDrive/colab_toxic_model_checkpoints"
os.makedirs(drive_path, exist_ok=True)

print(f"Copying models to: {drive_path}")

# Copy each model's local save directory into Drive, replacing any earlier copy.
for model_name in model_list:
    # Same directory name the training cell saved to.
    source_dir = "exp3_" + model_name.replace("/", "_")
    destination_dir = os.path.join(drive_path, source_dir)

    # Nothing was saved locally for this model: report it and move on.
    if not os.path.exists(source_dir):
        print(f"Source directory {source_dir} does not exist. Skipping copy for this model.")
        continue

    # copytree refuses to write into an existing directory, so clear it first.
    if os.path.exists(destination_dir):
        print(f"Removing existing directory: {destination_dir}")
        shutil.rmtree(destination_dir)

    print(f"Copying {source_dir} to {destination_dir}")
    shutil.copytree(source_dir, destination_dir)
    print(f"Successfully copied {source_dir}")

print("All specified models have been processed.")

Copying models to: /content/drive/MyDrive/colab_toxic_model_checkpoints
Copying exp3_distilbert-base-uncased to /content/drive/MyDrive/colab_toxic_model_checkpoints/exp3_distilbert-base-uncased
Successfully copied exp3_distilbert-base-uncased
Copying exp3_bert-base-uncased to /content/drive/MyDrive/colab_toxic_model_checkpoints/exp3_bert-base-uncased
Successfully copied exp3_bert-base-uncased
Copying exp3_microsoft_deberta-v3-base to /content/drive/MyDrive/colab_toxic_model_checkpoints/exp3_microsoft_deberta-v3-base
Successfully copied exp3_microsoft_deberta-v3-base
All specified models have been processed.
