<a href="https://colab.research.google.com/github/Ishq01/cs562/blob/main/562_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
# Runtime sanity check: report whether PyTorch can use a CUDA device, and the
# CUDA version string exposed by this PyTorch build (torch.version.cuda).
print("CUDA Available:", torch.cuda.is_available())
print("PyTorch compiled with CUDA version:", torch.version.cuda)

CUDA Available: True
PyTorch compiled with CUDA version: 12.6


In [None]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and pin the version that this notebook's recorded run used.
%pip install opacus==1.5.4

Collecting opacus
  Downloading opacus-1.5.4-py3-none-any.whl.metadata (8.7 kB)
Downloading opacus-1.5.4-py3-none-any.whl (254 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.4/254.4 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opacus
Successfully installed opacus-1.5.4


In [None]:
# -*- coding: utf-8 -*-
"""
DeBERTa-v3 QA with Differential Privacy (Opacus)
Optimized for CS 562
"""

# 1. Import necessary libraries
import sys

import torch
import random
import numpy as np
import os
import re
import pandas as pd
from typing import List, Dict, Any
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    get_linear_schedule_with_warmup
)
from opacus import PrivacyEngine
from opacus.validators import ModuleValidator
from opacus.utils.batch_memory_manager import BatchMemoryManager

class Config:
    """Central place for the hyperparameters and runtime settings used by this notebook."""

    # Reproducibility / hardware
    SEED = 42
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Checkpoint and tokenization windowing
    MODEL_NAME = "microsoft/deberta-v3-base"
    MAX_LENGTH = 384   # passed to the tokenizer as max_length
    DOC_STRIDE = 128   # passed to the tokenizer as stride

    # Optimization
    EPOCHS = 4
    BATCH_SIZE = 8
    GRAD_ACCUM_STEPS = 4   # micro-batches accumulated per optimizer step
    LEARNING_RATE = 3e-5
    WEIGHT_DECAY = 0.01

    # DP Specifics
    MAX_GRAD_NORM = 1.0
    DELTA = 1e-5


# Single shared settings object read by the rest of the notebook.
config = Config()

# Reproducibility
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with `seed`.

    When CUDA is available, the generators of all CUDA devices are seeded too.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed all RNGs once, before any data shuffling or model initialization happens.
set_seed(config.SEED)
# Report which device and checkpoint the Config selected.
print(f"Running on {config.DEVICE} with {config.MODEL_NAME}")

Running on cuda with microsoft/deberta-v3-base


In [None]:
# --- CELL 4: FAST DATA PROCESSING (Aggressive Filtering) ---
from datasets import Dataset as HFDataset

# Shared tokenizer for the checkpoint named in Config. It is used as a module-level
# global by prepare_train_features (which requests offset mappings and sequence ids)
# and by evaluate_model (to decode predicted spans).
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME, use_fast=True)

def clean_hotpot_data(dataset_split, max_examples=None):
    """Flatten HotpotQA rows into extractive-QA records with character-level answer spans.

    Args:
        dataset_split: An indexable, sized collection of rows. Each row must expose
            ``"id"``, ``"question"``, ``"answer"`` and ``row["context"]["sentences"]``
            (an iterable of sentence lists, one list per paragraph).
        max_examples: If truthy, only the first ``max_examples`` rows are considered
            (the limit is applied before filtering).

    Returns:
        A list of dicts with keys ``id``, ``question``, ``context``, ``answer_text``,
        ``answer_start`` and ``answer_end``. The span ``context[answer_start:answer_end]``
        is the first case-insensitive occurrence of the answer in the flattened context.

    Rows are dropped when the answer is empty, is "yes"/"no" (not extractive), or
    does not occur in the flattened context.
    """
    records = []
    indices = range(len(dataset_split))
    if max_examples:
        indices = indices[:max_examples]

    for i in tqdm(indices, desc="Cleaning Data"):
        ex = dataset_split[i]
        ans = ex["answer"]
        if not ans or ans.lower() in ["yes", "no"]:
            continue

        # Sentences of a paragraph are joined with a space, and paragraphs are
        # joined with a space as well; one join avoids repeated string re-allocation.
        context_str = " ".join(
            " ".join(sentences) for sentences in ex["context"]["sentences"]
        ).strip()

        # Search the ORIGINAL string case-insensitively. Searching a lowercased copy
        # and reusing its indices is wrong whenever lowercasing changes the string
        # length (e.g. "İ".lower() is two code points), which shifts every later offset.
        match = re.search(re.escape(ans), context_str, flags=re.IGNORECASE)
        if match is None:
            continue
        start_idx, end_idx = match.span()

        records.append({
            "id": ex["id"],
            "question": ex["question"],
            "context": context_str,
            "answer_text": ans,
            "answer_start": start_idx,
            "answer_end": end_idx
        })
    return records

def prepare_train_features(examples):
    """Tokenize a batch of cleaned QA records into windows labelled with answer token positions.

    Args:
        examples: A batch (dict of columns) with ``question``, ``context``,
            ``answer_start``, ``answer_end`` (character offsets into ``context``)
            and ``id``.

    Returns:
        A dict of parallel lists: ``input_ids``, ``attention_mask``,
        ``start_positions``, ``end_positions`` and ``example_id``. One entry is
        emitted per tokenized window that fully contains the answer span; windows
        that do not contain it are dropped.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=config.MAX_LENGTH,
        stride=config.DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Window index -> index of the example it was cut from, and per-token char offsets.
    window_to_example = encoded.pop("overflow_to_sample_mapping")
    all_offsets = encoded.pop("offset_mapping")

    columns = ("input_ids", "attention_mask", "start_positions", "end_positions", "example_id")
    out = {column: [] for column in columns}

    for window, offsets in enumerate(all_offsets):
        source = window_to_example[window]
        answer_start = examples["answer_start"][source]
        answer_end = examples["answer_end"][source]
        segment_ids = encoded.sequence_ids(window)

        # Locate the first and last token of the second sequence (segment id 1).
        ctx_first = 0
        while segment_ids[ctx_first] != 1:
            ctx_first += 1
        ctx_last = ctx_first
        while ctx_last + 1 < len(segment_ids) and segment_ids[ctx_last + 1] == 1:
            ctx_last += 1

        # Skip windows whose context slice does not cover the whole answer span.
        if offsets[ctx_first][0] > answer_start or offsets[ctx_last][1] < answer_end:
            continue

        # Start label: last token in the leading run whose start offset is <= answer start.
        start_token = ctx_first - 1
        for pos in range(ctx_first, ctx_last + 1):
            if offsets[pos][0] > answer_start:
                break
            start_token = pos

        # End label: first token in the trailing run whose end offset is >= answer end.
        end_token = ctx_last + 1
        for pos in range(ctx_last, ctx_first - 1, -1):
            if offsets[pos][1] < answer_end:
                break
            end_token = pos

        out["input_ids"].append(encoded["input_ids"][window])
        out["attention_mask"].append(encoded["attention_mask"][window])
        out["start_positions"].append(start_token)
        out["end_positions"].append(end_token)
        out["example_id"].append(examples["id"][source])

    return out

# Load the "distractor" configuration of HotpotQA via the `datasets` library.
raw_dataset = load_dataset("hotpot_qa", "distractor")

print("1. Cleaning Text Data...")
# max_examples=None keeps every training row; validation is limited to its first
# 2000 rows (the limit is applied before clean_hotpot_data filters rows out).
train_records = clean_hotpot_data(raw_dataset["train"], max_examples=None) # Full Dataset
val_records = clean_hotpot_data(raw_dataset["validation"], max_examples=2000)

print(f"2. Converting to HF Datasets (Train: {len(train_records)})...")
# Wrap the plain record lists so that .map() can be used for batched tokenization.
train_hf = HFDataset.from_list(train_records)
val_hf = HFDataset.from_list(val_records)

print("3. Tokenizing & Caching (Speed Optimization)...")
# remove_columns drops the raw text columns: prepare_train_features returns a
# different number of rows (one per kept window) than it receives.
train_ds = train_hf.map(
    prepare_train_features,
    batched=True,
    remove_columns=train_hf.column_names,
    desc="Processing Train (Dropping Empty Windows)"
)

# NOTE(review): validation goes through the same function, so validation windows
# that do not contain the gold answer are dropped too. Scores computed on val_ds
# are therefore measured only on answer-bearing windows.
val_ds = val_hf.map(
    prepare_train_features,
    batched=True,
    remove_columns=val_hf.column_names,
    desc="Processing Val"
)

# Have the datasets return torch tensors so they can be fed to a DataLoader.
train_ds.set_format("torch")
val_ds.set_format("torch")

print(f"Final Optimized Training Features: {len(train_ds)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

distractor/train-00000-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

distractor/train-00001-of-00002.parquet:   0%|          | 0.00/166M [00:00<?, ?B/s]

distractor/validation-00000-of-00001.par(…):   0%|          | 0.00/27.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

1. Cleaning Text Data...


Cleaning Data: 100%|██████████| 90447/90447 [00:29<00:00, 3065.01it/s]
Cleaning Data: 100%|██████████| 2000/2000 [00:00<00:00, 2940.15it/s]


2. Converting to HF Datasets (Train: 84933)...
3. Tokenizing & Caching (Speed Optimization)...


Processing Train (Dropping Empty Windows):   0%|          | 0/84933 [00:00<?, ? examples/s]

Processing Val:   0%|          | 0/1875 [00:00<?, ? examples/s]

Final Optimized Training Features: 114291


Evaluation Metric

In [None]:
def normalize_text(s):
    """Lowercase `s`, strip punctuation, drop the articles a/an/the, and collapse whitespace."""
    import string, re

    lowered = s.lower()
    # Punctuation is deleted (not replaced by a space) before articles are removed.
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punct)
    return ' '.join(no_articles.split())

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a gold answer after normalization.

    Overlap is counted as a multiset intersection, so a token that occurs k times
    in both strings contributes k matches. Counting with plain sets would score
    identical answers that contain a repeated token below 1.0.

    Returns:
        1 or 0 when either side normalizes to no tokens (1 only if both are empty),
        0 when nothing overlaps, otherwise the harmonic mean of precision and recall.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the minimum of the two counts.
    num_same = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    if num_same == 0:
        return 0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

def compute_exact(prediction, truth):
    """Return 1 if prediction and truth are identical after normalization, else 0."""
    return int(normalize_text(prediction) == normalize_text(truth))

def evaluate_model(model, eval_dataset, dataset_clean, max_answer_len=30):
    """Compute exact-match and F1 of `model` over tokenized evaluation windows.

    Args:
        model: A QA model whose output exposes ``start_logits`` and ``end_logits``.
            It is switched to eval mode and left in eval mode on return.
        eval_dataset: Tokenized windows providing ``input_ids``, ``attention_mask``
            and ``example_id``; iterated in order (no shuffling).
        dataset_clean: Iterable of records with ``id`` and ``answer_text`` used as
            the gold answers.
        max_answer_len: Largest allowed gap ``end_index - start_index`` (in tokens)
            for a predicted span. Defaults to 30, the previously hard-coded value.

    Returns:
        ``(mean exact match, mean F1)`` over examples that have at least one window
        in `eval_dataset` and a gold answer in `dataset_clean`.

    For every window the arg-max start and end tokens are taken independently; if
    the end precedes the start or the span exceeds `max_answer_len`, the span
    collapses to the single start token. Per example, the window with the highest
    start+end logit sum supplies the prediction.
    """
    device = config.DEVICE
    model.eval()

    id_to_truth = {ex["id"]: ex["answer_text"] for ex in dataset_clean}
    all_predictions = {}

    eval_loader = DataLoader(eval_dataset, batch_size=config.BATCH_SIZE * 2, shuffle=False)

    # Windows are visited in dataset order (shuffle=False), so a running counter
    # is enough to recover which example each window came from.
    eval_example_ids = eval_dataset["example_id"]

    feature_idx = 0
    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="Evaluating"):
            # Keep the host copy for decoding so that each prediction does not
            # slice a device tensor and copy it back one feature at a time.
            host_input_ids = batch['input_ids']
            input_ids = host_input_ids.to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            start_logits = outputs.start_logits.cpu().numpy()
            end_logits = outputs.end_logits.cpu().numpy()

            for i in range(len(input_ids)):
                ex_id = eval_example_ids[feature_idx]
                feature_idx += 1

                start_log = start_logits[i]
                end_log = end_logits[i]

                s_idx = np.argmax(start_log)
                e_idx = np.argmax(end_log)

                # Invalid or over-long span: fall back to the single start token.
                if e_idx < s_idx or (e_idx - s_idx) > max_answer_len:
                     e_idx = s_idx

                score = start_log[s_idx] + end_log[e_idx]
                pred_ids = host_input_ids[i][s_idx : e_idx + 1]
                pred_text = tokenizer.decode(pred_ids, skip_special_tokens=True)

                # Keep only the best-scoring window per example.
                if ex_id not in all_predictions or score > all_predictions[ex_id][0]:
                    all_predictions[ex_id] = (score, pred_text)

    f1s, ems = [], []
    for ex_id, (_, pred_text) in all_predictions.items():
        if ex_id in id_to_truth:
            truth = id_to_truth[ex_id]
            f1s.append(compute_f1(pred_text, truth))
            ems.append(compute_exact(pred_text, truth))

    return np.mean(ems), np.mean(f1s)

Baseline

In [None]:
def train_optimized_base():
    """Fine-tune the non-private baseline QA model and evaluate it after every epoch.

    Reads the module-level ``config``, ``train_ds``, ``val_ds`` and ``val_records``.
    Training uses AdamW with a linear warmup/decay schedule, gradient accumulation
    over ``config.GRAD_ACCUM_STEPS`` micro-batches, gradient-norm clipping, and
    fp16 autocast with loss scaling when running on CUDA.

    Returns:
        The trained model. It is left in eval mode by the final evaluation.
    """
    print(f"\n=== Training Optimized Base Model on {config.DEVICE} ===")

    model = AutoModelForQuestionAnswering.from_pretrained(
        config.MODEL_NAME,
        use_safetensors=True
    ).to(config.DEVICE)

    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=config.WEIGHT_DECAY)

    train_loader = DataLoader(train_ds, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=0)

    accum_steps = config.GRAD_ACCUM_STEPS
    num_batches = len(train_loader)
    # Ceiling division: the trailing partial accumulation window of each epoch
    # also produces an optimizer step (see the boundary check in the loop).
    steps_per_epoch = (num_batches + accum_steps - 1) // accum_steps
    total_steps = steps_per_epoch * config.EPOCHS

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    use_cuda = (config.DEVICE == "cuda")
    # With enabled=False the scaler is a pass-through (scale/unscale_/update do
    # nothing and step() just calls optimizer.step()), so a single code path
    # serves both CPU and CUDA.
    scaler = torch.amp.GradScaler('cuda', enabled=use_cuda)

    # Clipping threshold for the baseline. Kept as its own constant rather than
    # reusing Config.MAX_GRAD_NORM, which Config groups under the DP settings.
    clip_norm = 1.0

    for epoch in range(config.EPOCHS):
        model.train()
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

        window_loss = 0.0     # sum of unscaled micro-batch losses in this window
        window_batches = 0    # number of micro-batches in this window

        for step, batch in enumerate(loop):
            input_ids = batch['input_ids'].to(config.DEVICE)
            attention_mask = batch['attention_mask'].to(config.DEVICE)
            start_positions = batch['start_positions'].to(config.DEVICE)
            end_positions = batch['end_positions'].to(config.DEVICE)

            with torch.amp.autocast(device_type=config.DEVICE, dtype=torch.float16, enabled=use_cuda):
                outputs = model(input_ids, attention_mask=attention_mask,
                                start_positions=start_positions, end_positions=end_positions)
                # Divide so the accumulated gradient is an average over the window.
                loss = outputs.loss / accum_steps

            scaler.scale(loss).backward()

            window_loss += outputs.loss.item()
            window_batches += 1

            # Step on every full window AND on the epoch's last micro-batch, so
            # leftover gradients are applied now instead of leaking into the next
            # epoch (or being discarded after the final epoch). A trailing partial
            # window is still divided by accum_steps, so it is slightly down-weighted.
            at_window_end = (step + 1) % accum_steps == 0
            at_epoch_end = (step + 1) == num_batches
            if at_window_end or at_epoch_end:
                # Clip on true (unscaled) gradients; applies on CPU as well.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
                scaler.step(optimizer)
                scaler.update()

                scheduler.step()
                optimizer.zero_grad()

                # Show the mean micro-batch loss of the window just applied.
                loop.set_postfix(loss=window_loss / window_batches)
                window_loss = 0.0
                window_batches = 0

        em, f1 = evaluate_model(model, val_ds, val_records)
        print(f"Epoch {epoch+1} | EM: {em:.4f} | F1: {f1:.4f}")

    return model

# Run it!
# Trains the baseline for config.EPOCHS epochs and keeps the returned model in
# `baseline_model`. In the recorded run each epoch took about 21 minutes.
baseline_model = train_optimized_base()


=== Training Optimized Base Model on cuda ===


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForQuestionAnswering were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 14287/14287 [21:12<00:00, 11.23it/s, loss=2.44]
Evaluating: 100%|██████████| 157/157 [00:18<00:00,  8.55it/s]


Epoch 1 | EM: 0.6107 | F1: 0.7402


Epoch 2: 100%|██████████| 14287/14287 [21:13<00:00, 11.22it/s, loss=1.56]
Evaluating: 100%|██████████| 157/157 [00:18<00:00,  8.64it/s]


Epoch 2 | EM: 0.6283 | F1: 0.7488


Epoch 3: 100%|██████████| 14287/14287 [21:13<00:00, 11.22it/s, loss=2.09]
Evaluating: 100%|██████████| 157/157 [00:18<00:00,  8.62it/s]


Epoch 3 | EM: 0.6416 | F1: 0.7598


Epoch 4: 100%|██████████| 14287/14287 [21:11<00:00, 11.23it/s, loss=0.931]
Evaluating: 100%|██████████| 157/157 [00:18<00:00,  8.64it/s]

Epoch 4 | EM: 0.6517 | F1: 0.7657



