In [None]:
#Team Name - SalaarTheRiser
#Team Members - Hashmmath Shaik, D Vivek Reddy, Snigdha Srivastva
#Net-Id - hs5544, vd2438, ss19776

!pip install transformers datasets evaluate accelerate peft trl bitsandbytes  # install Transformers, Datasets, PEFT and related dependencies
!pip install nvidia-ml-py3  # install NVIDIA GPU monitoring library

import os
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model, PeftModel, TaskType  # PEFT (LoRA) adapters
from datasets import load_dataset, Dataset  # Hugging Face Datasets library
from sklearn.metrics import accuracy_score  # accuracy metric for evaluation

# Checkpoint identifier shared by the tokenizer and the classification model
base_model = "roberta-base"

# RoBERTa tokenizer for that checkpoint, followed by the AG News training split
tokenizer = RobertaTokenizer.from_pretrained(base_model)
dataset = load_dataset(path="ag_news", split="train")

# Preprocessing function for tokenization
def preprocess(examples):
    """Tokenize a batch of examples, truncating each one to at most 512 tokens.

    Args:
        examples: Batch mapping whose "text" entry holds the raw strings.

    Returns:
        Whatever the module-level `tokenizer` returns for that batch. The
        sequences are left unpadded.
    """
    # Padding is deliberately not applied here: inside a batched `map`, padding=True
    # would pad every row to the longest sequence of the whole map batch. The
    # DataCollatorWithPadding built in this notebook pads each training/eval batch
    # dynamically instead, which keeps batches as short as possible.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize in batches, discard the raw text column, and expose the targets under
# "labels" (the column name the Trainer expects)
tokenized_dataset = (
    dataset
    .map(preprocess, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)

# Class metadata is read from the label feature of the untokenized dataset
label_feature = dataset.features["label"]
num_labels, class_names = label_feature.num_classes, label_feature.names  # expected: 4 classes for AG News
print(f"Number of labels: {num_labels}")
print(f"Label names: {class_names}")

# Hold out 640 examples for validation; the fixed seed keeps the split reproducible
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset, eval_dataset = split_datasets["train"], split_datasets["test"]

# Collator that pads every batch on the fly and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# This code installs the necessary libraries, loads and tokenizes the AG News training data
# using a RoBERTa tokenizer (truncating inputs to at most 512 tokens), renames the label column for
# compatibility with Hugging Face’s Trainer API, prints the number and names of the classes,
# splits the tokenized dataset into a large training set and a 640‑sample validation set for
# early stopping and hyperparameter tuning, and finally configures a data collator that
# dynamically pads each batch and returns properly formatted PyTorch tensors for downstream
# LoRA‑based fine‑tuning.

In [None]:
# Pre-trained RoBERTa-base with a classification head sized to the dataset's label count
model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=num_labels)

# Switch off gradients for every parameter of the RoBERTa encoder (frozen backbone)
model.roberta.requires_grad_(False)

# LoRA hyperparameters: rank-2 adapters on the attention query and value projections
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,         # sequence classification task
    target_modules=["query", "value"],  # projections that receive adapters
    r=2,                                # low-rank dimension
    lora_alpha=4,                       # scaling factor for adapter updates
    lora_dropout=0.05,                  # dropout rate on adapter layers
    bias="none",                        # bias terms are not adapted
)

# Wrap the model with the LoRA adapters, then print the trainable-parameter summary
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

# Count trainable vs. total parameters and enforce the < 1e6 trainable-parameter budget
MAX_TRAINABLE_PARAMS = 1_000_000  # upper bound on parameters that receive gradients
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)  # only parameters requiring gradients
total_params = sum(p.numel() for p in peft_model.parameters())  # every parameter, frozen or not
print(f"Total trainable parameters: {trainable_params}")
print(f"Total model parameters: {total_params}")

# Fail loudly instead of silently training a model that exceeds the budget
assert trainable_params < MAX_TRAINABLE_PARAMS, (
    f"Trainable parameter budget exceeded: {trainable_params} >= {MAX_TRAINABLE_PARAMS}"
)

# This part of the code loads a RoBERTa‑base sequence classification model with the appropriate
# number of labels and then freezes its entire backbone so that only newly introduced
# adapter parameters and the classification head can be trained. It configures a
# LoRA adapter via LoraConfig (rank 2, alpha 4, dropout 0.05, no bias adaptation)
# targeting the “query” and “value” projection layers, applies these adapters to the model
# with get_peft_model, and prints a summary of how many parameters remain trainable. Finally, it computes
# and displays the total number of trainable parameters versus the overall parameter count
# to verify that the adapter‑only fine‑tuning stays well below one million trainable weights.

In [None]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments  # importing Trainer API and early stopping callback

# Accuracy metric handed to the Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, argmax-ed over
            the last axis) and `label_ids` (reference class indices).

    Returns:
        dict: A single-entry mapping ``{"accuracy": <score>}``.
    """
    predicted_classes = eval_pred.predictions.argmax(axis=-1)  # highest-scoring class per row
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_classes)}

# Setting up training arguments
output_dir = "results_lora"  # directory to save checkpoints and logs
training_args = TrainingArguments(
    output_dir=output_dir,               # where to store model outputs
    # The string "none" is what turns reporting integrations off. Passing None is treated
    # as "not set" and falls back to the library default, which in Transformers v4 means
    # every installed integration (e.g. WandB).
    report_to="none",                    # disabling external logging integrations
    eval_strategy="epoch",               # running evaluation at end of each epoch
    save_strategy="epoch",               # saving model checkpoint at end of each epoch
    learning_rate=5e-6,                  # initial learning rate for AdamW
    num_train_epochs=1,                  # maximum number of training epochs
    per_device_train_batch_size=32,      # batch size per GPU/CPU for training
    per_device_eval_batch_size=64,       # batch size per GPU/CPU for evaluation
    dataloader_num_workers=4,            # number of subprocesses for data loading
    load_best_model_at_end=True,         # after training, load checkpoint with best eval metric
    metric_for_best_model="eval_accuracy",  # metric to compare for best checkpoint
    greater_is_better=True,              # higher eval_accuracy is better
    logging_steps=100,                   # log training metrics every 100 steps
    optim="adamw_torch",                 # use PyTorch's AdamW implementation
    weight_decay=0.01,                   # weight decay coefficient for regularization
    push_to_hub=False                    # do not push model to the Hugging Face Hub
)

# Early-stopping hook: halts training after 3 evaluations without improvement.
# NOTE(review): with num_train_epochs=1 and per-epoch evaluation only one evaluation
# runs during training, so this patience cannot be exhausted - confirm intent.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Trainer wired to the LoRA-wrapped model, both data splits, and the accuracy metric
trainer = Trainer(
    model=peft_model,                 # LoRA-adapted model to optimise
    args=training_args,               # hyperparameters defined above
    data_collator=data_collator,      # batches and pads the examples
    train_dataset=train_dataset,      # training split
    eval_dataset=eval_dataset,        # validation split
    compute_metrics=compute_metrics,  # accuracy computation for evaluation
    callbacks=[early_stopping],
)

# This code cell configures and initializes the Hugging Face Trainer for
# LoRA‑based fine‑tuning: it defines an accuracy metric, specifies all the training
# hyperparameters (learning rate, batch sizes, checkpointing, early stopping, etc.)
# via TrainingArguments, and then instantiates Trainer with the LoRA‑adapted model, datasets,
# data collator, metric function, and an EarlyStoppingCallback set to halt training if
# validation accuracy does not improve for three consecutive evaluations.

In [None]:
# Run fine-tuning with the Trainer built in the previous cell; the value returned by
# train() is kept in train_result so it can be inspected afterwards.
train_result = trainer.train()


In [None]:
# Evaluating the model on the validation set (eval_dataset is the 640-example hold-out
# carved from the training split); the returned metrics dict is reused by later cells.
eval_metrics = trainer.evaluate(eval_dataset=eval_dataset)
# "eval_accuracy" is the accuracy produced by compute_metrics, under the Trainer's "eval_" prefix
print(f"Validation Accuracy: {eval_metrics['eval_accuracy']:.4f}")

# This code invokes the Trainer’s built‑in evaluation routine on the held‑out eval_dataset,
# computing all configured metrics (here, accuracy) without further training, and then
# prints out the validation accuracy in a formatted string to four decimal places.

In [None]:
# eval_metrics was produced by trainer.evaluate on the held-out validation split, so the
# figure is reported as validation accuracy rather than test accuracy.
print(f"Final Validation Accuracy: {eval_metrics['eval_accuracy']:.4f}")

# This code line prints out the final evaluation accuracy obtained from the last call to
# trainer.evaluate, retrieving the "eval_accuracy" value from the eval_metrics dictionary
# and formatting it to four decimal places, thereby summarizing the model’s validation
# performance in a clear, human‑readable form.

In [None]:
# Loading unlabeled test data
# NOTE(review): unpickling can execute arbitrary code embedded in the file - only load
# test_unlabelled.pkl when it comes from a trusted source.
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")  # loading the pickled test data

# If it's a pandas DataFrame, convert to Hugging Face Dataset for easy batch processing
if isinstance(unlabelled_dataset, pd.DataFrame):
    test_dataset = Dataset.from_pandas(unlabelled_dataset)  # wrapping the DataFrame in HF Dataset
else:
    test_dataset = unlabelled_dataset  # assumed to already be a Dataset

# Preprocessing the test dataset with the same tokenization used for training
test_dataset = test_dataset.map(preprocess, batched=True, remove_columns=["text"])

# Running inference in batches through the Trainer (uses its data collator for padding)
predictions = trainer.predict(test_dataset)  # model outputs on the test set
pred_labels = predictions.predictions.argmax(axis=-1)  # predicted class index per example

# Preparing submission dataframe
df_submission = pd.DataFrame({
    "ID": range(len(pred_labels)),  # sequential IDs in dataset order
    "Label": pred_labels            # predicted labels
})

# Saving to CSV (no index, just two columns). Create the output directory first so the
# write cannot fail if no checkpoint has been written there yet.
os.makedirs(output_dir, exist_ok=True)
submission_path = os.path.join(output_dir, "Project-2-Prediction-7.csv")  # defining file path
df_submission.to_csv(submission_path, index=False)  # writing to submission file

print(f"Inference complete. Predictions saved to {submission_path}")

# This part of the code loads the pickled test dataset (converting it to a Hugging Face Dataset if necessary),
# applies the same tokenization pipeline used during training, runs batch inference with the trained
# LoRA‑adapted model to obtain logits and convert them to class predictions, constructs a pandas DataFrame
# with sequential IDs and predicted labels, writes the submission CSV in the designated output directory,
# and finally prints a confirmation message indicating where the predictions were saved.