# 🧠 Deep Learning Project 2: LoRA + RoBERTa on AGNEWS

## Step 1: Install Required Libraries

In [None]:
!pip install -q transformers datasets peft accelerate torchsummary
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3



## Step 2: Imports & Seed Setup

In [None]:
import torch
import random
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from torchsummary import summary
from sklearn.metrics import accuracy_score
import pandas as pd
import pickle

# Reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed (int): Value handed to Python's ``random``, NumPy, and PyTorch
            (both the default generator and all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)

# Seed all RNGs with the default seed before any data or model work happens.
set_seed()
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


## Step 3: Load AGNEWS Dataset

In [None]:
# Tokenizer matching the "roberta-base" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="roberta-base")
# AG News from the Hugging Face Hub; the "train" and "test" splits are used below.
dataset = load_dataset(path="ag_news")

def preprocess_function(examples):
    """Tokenize a batch of rows for the model.

    Args:
        examples: A batch mapping that exposes the raw strings under ``"text"``.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 256 tokens.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize every split in batches, then drop the raw "text" column.
tokenized_datasets = dataset.map(preprocess_function, batched=True).remove_columns(["text"])
# Have indexing return PyTorch tensors.
tokenized_datasets.set_format("torch")
train_dataset, test_dataset = tokenized_datasets["train"], tokenized_datasets["test"]


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

## Step 4: Set Up RoBERTa with LoRA (AdaLoRA)

In [None]:
from peft import AdaLoraConfig

# RoBERTa-base with a freshly initialised 4-way classification head, moved to `device`.
base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4).to(device)

# Number of optimizer steps AdaLoRA plans its rank budget over.
# The literals mirror the Trainer settings in Step 5 (train batch size 32, 6 epochs);
# keep them in sync if those change.
total_step = len(train_dataset) // 32 * 6

# NOTE(review): AdaLoRA only re-allocates/prunes ranks when
# `update_and_allocate(global_step)` is called during training (see the PEFT AdaLoRA
# docs). Nothing in this notebook calls it, and the stock `Trainer` does not appear to
# either — confirm. If it is never called, every adapted matrix stays at `init_r` and
# tinit / tfinal / deltaT / target_r have no effect.
lora_config = AdaLoraConfig(
    task_type=TaskType.SEQ_CLS,
    # `r` is deliberately not passed: AdaLoRA takes its starting rank from `init_r`.
    init_r=8,                           # initial rank of each adapted matrix
    target_r=1,                         # target average rank after budget allocation
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["query", "value"],  # adapt the attention query/value projections
    beta1=0.85,                         # EMA coefficient for sensitivity smoothing (not an optimizer beta)
    beta2=0.95,                         # EMA coefficient for uncertainty quantification
    tinit=200,                          # initial warmup steps before any rank adaptation
    tfinal=1000,                        # steps of final fine-tuning once the budget is fixed
    deltaT=10,                          # interval (in steps) between two budget allocations
    orth_reg_weight=0.5,                # coefficient of the orthogonality regularizer
    total_step=total_step,
    rank_pattern={},                    # per-matrix ranks recorded by the rank allocator; empty at start
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 888,772 || all params: 125,537,504 || trainable%: 0.7080


## Step 5: Train the Model

In [None]:
from transformers import EarlyStoppingCallback

# Training configuration: evaluate and checkpoint once per epoch, keep only the
# checkpoint with the best eval accuracy, and reload it when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    learning_rate=2e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    save_total_limit=1,
    report_to="wandb",              # prompts for a W&B API key on first use
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=50,
    # A PeftModel hides the base model's forward signature, so Trainer cannot infer
    # the label argument on its own and warns; name it explicitly.
    label_names=["labels"],
)

def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into an accuracy score.

    Args:
        eval_pred: A ``(logits, labels)`` pair produced by the evaluation loop.

    Returns:
        dict: ``{"accuracy": <fraction of examples whose arg-max class equals the label>}``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    score = accuracy_score(labels, predicted_classes)
    return {"accuracy": score}

# Stop early once the monitored metric has gone 2 evaluations without improving.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# NOTE(review): `eval_dataset` is the AG News *test* split, so early stopping and
# best-checkpoint selection are both driven by test accuracy. Consider carving a
# validation split out of the training data for an unbiased test score.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run fine-tuning; as the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlz3258[0m ([33mlz3258-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2502,0.219553,0.926711
2,0.2138,0.199607,0.931316
3,0.198,0.192919,0.936184
4,0.1827,0.189883,0.936447
5,0.1699,0.187477,0.938158
6,0.1795,0.18412,0.938684


TrainOutput(global_step=22500, training_loss=0.21581315845913357, metrics={'train_runtime': 5263.3076, 'train_samples_per_second': 136.796, 'train_steps_per_second': 4.275, 'total_flos': 9.570461810688e+16, 'train_loss': 0.21581315845913357, 'epoch': 6.0})

## Step 6: Final Evaluation

In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Run batched inference with a model and optionally score it with accuracy.

    Args:
        inference_model: The model to evaluate. It is moved to CUDA when available
            (CPU otherwise) and switched to eval mode.
        dataset: The dataset to run inference on. Every batch must be a mapping of
            tensors that can be passed to the model as keyword arguments.
        labelled (bool): If True, each batch must carry its targets under "labels"
            (a "label" entry is accepted and renamed) and accuracy is computed.
            If False, only predictions are returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the DataLoader's
            default collate_fn is used.

    Returns:
        If labelled is True, a tuple (metrics, predictions); otherwise predictions.
        ``predictions`` is a 1-D CPU tensor of arg-max class indices.

    Raises:
        KeyError: If labelled is True and a batch has neither "labels" nor "label".
    """
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    metric = evaluate.load('accuracy') if labelled else None
    all_predictions = []

    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # Datasets often store the target under "label", while the model call and the
        # metric bookkeeping below use "labels". Normalise so both spellings work.
        if "label" in batch and "labels" not in batch:
            batch["labels"] = batch.pop("label")
        if labelled and "labels" not in batch:
            # Fail before the forward pass, with a message that names the problem.
            raise KeyError('labelled=True but the batch has no "labels" (or "label") entry')

        with torch.no_grad():
            outputs = inference_model(**batch)

        # Copy to CPU once and reuse for both the result list and the metric.
        batch_predictions = outputs.logits.argmax(dim=-1).cpu()
        all_predictions.append(batch_predictions)

        if labelled:
            metric.add_batch(
                predictions=batch_predictions.numpy(),
                references=batch["labels"].cpu().numpy()
            )

    # Concatenate predictions from all batches.
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    return all_predictions

In [None]:
# Score the trainer's eval dataset with the model the trainer currently holds.
eval_results = trainer.evaluate()
final_accuracy = eval_results["eval_accuracy"]
print("Final Evaluation Accuracy:", final_accuracy)



Final Evaluation Accuracy: 0.9386842105263158


## Step 7: Verify Trainable Parameters

In [None]:
# Count the parameters that require gradients and enforce the stated budget
# of fewer than 1 million trainable parameters (previously only printed).
MAX_TRAINABLE_PARAMS = 1_000_000
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total Trainable Parameters: {trainable_params}")
assert trainable_params < MAX_TRAINABLE_PARAMS, (
    f"{trainable_params} trainable parameters exceeds the {MAX_TRAINABLE_PARAMS} budget"
)


Total Trainable Parameters: 888772


## Step 8: Save Model

In [None]:
# Persist the fine-tuned PEFT model and the tokenizer into the same directory.
save_dir = "./lora_roberta_agnews"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./lora_roberta_agnews/tokenizer_config.json',
 './lora_roberta_agnews/special_tokens_map.json',
 './lora_roberta_agnews/vocab.json',
 './lora_roberta_agnews/merges.txt',
 './lora_roberta_agnews/added_tokens.json',
 './lora_roberta_agnews/tokenizer.json')

In [None]:
# Colab-only: open the browser upload widget. The inference cell below expects
# "test_unlabelled.pkl" to be present in the working directory afterwards.
from google.colab import files
uploaded = files.upload()

Saving test_unlabelled.pkl to test_unlabelled.pkl


In [None]:
from datasets import Dataset
from torch.utils.data import DataLoader

# SECURITY: unpickling can execute arbitrary code — only load a file from a trusted source.
unlabelled_data = pd.read_pickle("test_unlabelled.pkl")

# Rebuild a plain Hugging Face Dataset that holds only the raw text column.
# A new name is used so the AG News `test_dataset` from Step 3 is not clobbered.
unlabelled_dataset = Dataset.from_dict({"text": unlabelled_data["text"]})

# Reuse the Step 3 `preprocess_function` (max_length=256) instead of redefining it with
# a shorter max_length: inference inputs are then truncated exactly as in training.
tokenized_test_dataset = unlabelled_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Create a PyTorch DataLoader for batching.
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)

# Prediction loop: arg-max class index per example, collected as plain Python ints.
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(preds.cpu().tolist())

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
import os

# Write the submission into the output directory (it was previously created but unused)
# and report the path that was actually written.
output_dir = "inference_output"
os.makedirs(output_dir, exist_ok=True)
submission_path = os.path.join(output_dir, "submission1.csv")

# One row per prediction; IDs are the 0-based positions in prediction order.
df = pd.DataFrame({
    "ID": list(range(len(all_predictions))),
    "label": all_predictions
})
df.to_csv(submission_path, index=False)
print(f"✅ Batched predictions complete. Saved to {submission_path}.")
# `files` is the google.colab helper imported in the upload cell.
files.download(submission_path)

✅ Batched predictions complete. Saved to submission.csv.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import shutil

from google.colab import files

# Zip the saved model folder: ./lora_roberta_agnews => lora_roberta_agnews.zip
shutil.make_archive(
    base_name="lora_roberta_agnews",
    format="zip",
    root_dir="./lora_roberta_agnews",
)

# Download the archive to the local machine.
files.download("lora_roberta_agnews.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>