In [1]:
!pip install -q transformers accelerate datasets scikit-learn pandas

import os
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Args:
        seed: Seed handed to every generator (defaults to 42).

    When CUDA is available, the generators of all CUDA devices are seeded
    with the same value as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only touched when a GPU is actually present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG before any data handling or model construction.
set_seed(42)

from google.colab import files

# Name of the CSV this notebook trains on, relative to the working directory.
DATA_PATH = "final_10k_dataset_cleaned.csv"

# Only prompt for an upload when the CSV is not already on disk, so that
# re-running this cell (or Restart & Run All) does not block on a manual
# upload each time.
if os.path.exists(DATA_PATH):
    uploaded = {}  # nothing uploaded in this run; keeps the name defined
else:
    uploaded = files.upload()  # upload final_10k_dataset_cleaned.csv
    if not os.path.exists(DATA_PATH):
        # Fail with an actionable message instead of a bare read error below.
        raise FileNotFoundError(
            f"Expected {DATA_PATH!r} after upload, but received: {sorted(uploaded)}"
        )

df = pd.read_csv(DATA_PATH)

# Give every category a contiguous integer id, assigned in sorted order so
# the mapping depends only on the set of categories present in the data.
unique_labels = sorted(df["final_category"].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(label2id)

# Integer target column derived from the category names.
df["label_id"] = df["final_category"].map(label2id)

# Hold out 20% of the rows for validation, stratified on the label id, with a
# fixed random_state so the split is repeatable.
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["label_id"], random_state=42
)

# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "microsoft/deberta-v3-base"

class ToxicDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time it is indexed.

    Args:
        texts: Iterable of texts; copied into a list. Each item is passed
            through ``str()`` before tokenization.
        labels: Iterable of labels aligned with ``texts``; each item is
            passed through ``int()`` when accessed.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` and expected to return a
            mapping with "input_ids" and "attention_mask" tensors.
        max_length: Length passed to the tokenizer for truncation/padding.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = list(texts)
        self.labels = list(labels)

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the model inputs and label tensor for example ``idx``."""
        raw_text = str(self.texts[idx])
        target = int(self.labels[idx])

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes the leading size-1 axis of each returned tensor
        # so that the DataLoader can stack per-example tensors itself.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(target, dtype=torch.long)
        return item

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap each split in a ToxicDataset; max_length keeps its default value.
train_dataset = ToxicDataset(train_df["text"], train_df["label_id"], tokenizer)
val_dataset = ToxicDataset(val_df["text"], val_df["label_id"], tokenizer)

# Only the training batches are shuffled; validation keeps the split's order.
batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# Load the pretrained checkpoint with a sequence-classification head sized to
# the label vocabulary, then move the whole model onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
)
model = model.to(device)

# Optimisation hyper-parameters.
epochs = 5
lr = 2e-5

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# The scheduler is stepped once per training batch, so the schedule spans
# every batch of every epoch; the first 10% of those steps are warm-up.
total_steps = epochs * len(train_loader)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)


Saving final_10k_dataset_cleaned.csv to final_10k_dataset_cleaned.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
def run_epoch(model, loader, optimizer=None, scheduler=None, train=True):
    """Run one full pass over ``loader``, optionally updating the model.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``
            and ``.logits``.
        loader: Iterable of batches, each a mapping with "input_ids",
            "attention_mask" and "labels" tensors.
        optimizer: Optimizer stepped once per batch. Required when ``train``
            is true; ignored otherwise.
        scheduler: Optional learning-rate scheduler, stepped once per batch
            right after the optimizer. Ignored when ``train`` is false.
        train: If true, the model is put in training mode and gradients are
            back-propagated (with gradient-norm clipping at 1.0); otherwise
            the model is put in eval mode and gradient tracking is disabled.

    Returns:
        Tuple ``(avg_loss, accuracy, macro_f1)``. ``avg_loss`` is the sum of
        each batch loss weighted by its batch size, divided by the number of
        samples actually processed.

    Raises:
        ValueError: If ``train`` is true but no optimizer was supplied, or if
            the loader yields no samples.
    """
    # Fail before any forward pass rather than with an AttributeError on None.
    if train and optimizer is None:
        raise ValueError("run_epoch(train=True) requires an optimizer")

    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    n_seen = 0
    all_labels = []
    all_preds = []

    loop = tqdm(loader)

    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.set_grad_enabled(train):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss
            logits = outputs.logits

            if train:
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                # The scheduler really is optional: skip it when not given.
                if scheduler is not None:
                    scheduler.step()

        # Weight each batch loss by its size so that a smaller final batch
        # does not skew the epoch average.
        n_batch = input_ids.size(0)
        total_loss += loss.item() * n_batch
        n_seen += n_batch

        preds = torch.argmax(logits, dim=-1)
        all_labels.extend(labels.tolist())
        all_preds.extend(preds.tolist())

    if n_seen == 0:
        raise ValueError("loader yielded no samples; cannot compute epoch metrics")

    # Divide by the samples actually processed rather than len(loader.dataset):
    # the two differ when the loader drops samples, and not every loader has a
    # sized `.dataset`.
    avg_loss = total_loss / n_seen
    acc = accuracy_score(all_labels, all_preds)
    macro_f1 = f1_score(all_labels, all_preds, average="macro")

    return avg_loss, acc, macro_f1


# Fine-tune for `epochs` passes over the training data, printing the training
# and validation metrics after each pass.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")

    # Optimisation pass over the training batches.
    train_loss, train_acc, train_f1 = run_epoch(
        model, train_loader, optimizer=optimizer, scheduler=scheduler, train=True
    )
    print(f"Train Loss {train_loss:.4f} Acc {train_acc:.4f} F1 {train_f1:.4f}")

    # Evaluation pass: optimizer and scheduler keep their None defaults.
    val_loss, val_acc, val_f1 = run_epoch(model, val_loader, train=False)
    print(f"Val Loss {val_loss:.4f} Acc {val_acc:.4f} F1 {val_f1:.4f}")


Epoch 1/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.7774 Acc 0.3385 F1 0.3299


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.2609 Acc 0.5325 F1 0.5104
Epoch 2/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 1.0586 Acc 0.6046 F1 0.5984


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1027 Acc 0.5950 F1 0.5874
Epoch 3/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.8284 Acc 0.6902 F1 0.6862


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1298 Acc 0.6170 F1 0.6079
Epoch 4/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.6617 Acc 0.7563 F1 0.7535


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1665 Acc 0.6250 F1 0.6187
Epoch 5/5


  0%|          | 0/500 [00:00<?, ?it/s]

Train Loss 0.5499 Acc 0.8039 F1 0.8018


  0%|          | 0/125 [00:00<?, ?it/s]

Val Loss 1.1795 Acc 0.6260 F1 0.6189


In [3]:
# Persist the trained model and the tokenizer to two separate local
# directories.
model.save_pretrained("experiment_2_model")
# Kept as the cell's last expression so its return value (the tuple of
# written tokenizer file paths) is displayed as the cell output.
tokenizer.save_pretrained("experiment_2_tokenizer")

('experiment_2_tokenizer/tokenizer_config.json',
 'experiment_2_tokenizer/special_tokens_map.json',
 'experiment_2_tokenizer/spm.model',
 'experiment_2_tokenizer/added_tokens.json',
 'experiment_2_tokenizer/tokenizer.json')