### Matthew Poncini
**CSE 590 Special Topics: GenAI**
**Homework 2**

### Step 0: Environment Setup and Library Installation


Setting `PYTORCH_CUDA_ALLOC_CONF` to `expandable_segments:True` helps prevent memory fragmentation issues when loading large models or training with quantized weights.

In [None]:
import os

# Ask the PyTorch CUDA allocator to use expandable segments.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

Install Libraries

In [None]:
!pip install peft

In [None]:
!pip install trl

In [None]:
!pip install rouge_score

In [None]:
!pip install evaluate

In [None]:
!pip install bitsandbytes

In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install -U transformers

Check whether a GPU is available

In [None]:
import torch

# The cell output shows whether PyTorch can use CUDA in this runtime.
torch.cuda.is_available()

Log-in to HuggingFace

In [None]:
HF_TOKEN = "insert_hugging_face_token"
!huggingface-cli login --token $HF_TOKEN

### Step 1:  Choose a downstream task and select a suitable dataset for fine-tuning. From the dataset, sample 1,000 examples for training and a separate, non-overlapping 300 examples for testing.

Chosen dataset for your fine-tuning task: finance-alpaca

https://huggingface.co/datasets/gbharti/finance-alpaca

In [None]:
# Hub identifier of the dataset. NOTE(review): the next cell rebinds `dataset` to the loaded data.
dataset = "gbharti/finance-alpaca"

In [None]:
from datasets import load_dataset

# Load the dataset; its "train" split is converted to a DataFrame in the next cell.
dataset = load_dataset("gbharti/finance-alpaca")

Make dataset viewable

In [None]:
import pandas as pd

# Keep only the first 1,300 rows of the train split: 1,000 for training plus 300 for testing.
df = dataset["train"].to_pandas().head(1300)

df

Select 300 test samples from the dataset

In [None]:
# Rows 1000-1299 of the 1,300-row frame: 300 held-out examples that do not overlap the training rows.
test_sample = df.iloc[1000:1300]

Select 1,000 training samples from dataset

In [None]:
# Rows 0-999 of the frame: the 1,000 training examples.
train_sample = df.iloc[0:1000]

### Step 2: Select a pre-trained language model to work with.

Chosen pre-trained language model: meta-llama/Llama-2-7b-hf

In [None]:
# Hub identifier used by both the tokenizer and the model loading cells below.
model_name = "meta-llama/Llama-2-7b-hf"

Sets up a configuration for loading a quantized version of a large language model using the BitsAndBytes library from Hugging Face.

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit NF4 weights with double quantization enabled and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

Loads and configures the tokenizer associated with meta-llama/Llama-2-7b-hf.

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding, and pad on the left so every
# prompt in a batch ends at the same position.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loads a quantized large language model onto a GPU using Hugging Face Transformers and BitsAndBytes, ready for fine-tuning or inference, while minimizing memory usage.

In [None]:
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig

# Baseline model, loaded with the quantization settings held in `bnb_config`.
model  = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    # NOTE(review): this permits code shipped with the checkpoint to run; presumably not required for this model — confirm.
    trust_remote_code=True,
    use_cache=True
)

Low-Rank Adaptation configuration, fine-tunes only a small number of parameters while freezing the rest of the large model. This significantly reduces memory and compute requirements.

In [None]:
from peft import LoraConfig

# LoRA settings. NOTE(review): nothing in the baseline evaluation uses this object;
# an identical config is rebuilt in Step 4, where it is handed to the trainer.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    # Only the attention query and value projections receive adapters.
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

### Step 3: Evaluate the pre-trained model on the selected dataset to establish a baseline using an appropriate metric.

Evaluate Model Function

In [None]:
import pandas as pd
from evaluate import load
from tqdm import tqdm
import torch

def extract_response(text):
    """Return the answer portion of a decoded generation.

    Everything up to and including the last "### Response:" marker is
    discarded; text without the marker is returned unchanged. Surrounding
    whitespace is stripped in both cases.
    """
    # rpartition yields the full text as its last element when the marker is absent.
    _, _, answer = text.rpartition("### Response:")
    return answer.strip()

def evaluate_model(model, tokenizer, prompts, references, batch_size=1, max_new_tokens=64):
    """Generate an answer for every prompt and score the answers with ROUGE.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        prompts: Sliceable sequence of prompt strings.
        references: Reference answers, aligned index-by-index with ``prompts``.
        batch_size: Number of prompts generated per ``generate`` call.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        A tuple ``(scores, predictions, results_df)``: the computed ROUGE
        scores, the list of cleaned predictions, and a DataFrame with one
        row per example (columns "prompt", "reference", "prediction").

    Note:
        Generation uses sampling, so scores vary between runs unless the
        caller seeds the random number generators.
    """
    pretrained_metric = load("rouge")
    predictions = []
    records = []

    for i in tqdm(range(0, len(prompts), batch_size)):
        batch_prompts = prompts[i:i+batch_size]
        batch_refs = references[i:i+batch_size]

        # Tokenizes input prompts with truncation and padding
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(model.device)

        # Produce text responses (no gradients are computed, which saves memory)
        with torch.no_grad():
            outputs = model.generate(
              input_ids=inputs["input_ids"],
              attention_mask=inputs["attention_mask"],
              do_sample=True,
              temperature=0.7,
              top_p=0.9,
              # Honor the caller's limit instead of a hard-coded value.
              max_new_tokens=max_new_tokens,
              pad_token_id=tokenizer.eos_token_id
            )

        # Converts model token outputs back into strings and keeps only the answer part.
        batch_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        batch_preds = [extract_response(p) for p in batch_preds]

        # Metric calculation and result collection
        for prompt_text, pred, ref in zip(batch_prompts, batch_preds, batch_refs):
            pretrained_metric.add(prediction=pred, reference=ref)
            predictions.append(pred)
            records.append({
                "prompt": prompt_text,
                "reference": ref,
                "prediction": pred
            })

    pretrained_results_df = pd.DataFrame(records)
    return pretrained_metric.compute(), predictions, pretrained_results_df


Evaluate Execution:

In [None]:
# Build the prompts with the same template the fine-tuning step uses: the
# "### Input:" section is included whenever the example provides an input.
prompts = [
    (
        f"### Instruction:\n{row['instruction']}\n\n### Input:\n{row['input']}\n\n### Response:"
        if row["input"]
        else f"### Instruction:\n{row['instruction']}\n\n### Response:"
    )
    for _, row in test_sample.iterrows()
]
references = list(test_sample["output"])

baseline_scores, baseline_outputs, pretrained_results_df = evaluate_model(
    model,
    tokenizer,
    prompts,
    references
)

In [None]:
# Header line followed by one "name: value" line per ROUGE score, four decimals each.
print("ROUGE Evaluation:")
for metric in baseline_scores:
    score = baseline_scores[metric]
    print(f"{metric}: {score:.4f}")

In [None]:
# Per-example table with "prompt", "reference" and "prediction" columns for the baseline model.
pretrained_results_df

### Step 4: Fine-tune the model on the chosen dataset.

Memory Cleanup and Re-initialization

In [None]:
import gc
import torch

# The baseline model can only be released once nothing refers to it, so drop
# the notebook-level name first (no error if it is already gone).
globals().pop("model", None)

gc.collect()
torch.cuda.empty_cache()

Re-loads the tokenizer and re-downloads the dataset and selects the first 1,000 samples from the training set for fine-tuning.

(Colab Notebook is restarted)

In [None]:
import os
from transformers import AutoTokenizer
from datasets import load_dataset

model_name = "meta-llama/Llama-2-7b-hf"

# The first 1,000 rows of the train split form the fine-tuning set.
data = load_dataset("gbharti/finance-alpaca")
train_sample = data["train"].select(range(1000))

# Tokenizer with left padding, reusing the end-of-sequence token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

Reconfigure Quantization for Training

In [None]:
# torch must be imported here: after the kernel restart no earlier cell in
# this section has brought it into scope, and the config refers to torch.bfloat16.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
from transformers import AutoTokenizer

# NOTE(review): repeats the tokenizer setup from the reload cell above with the same settings.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers import AutoModelForCausalLM

# Quantized base model for fine-tuning; the generation cache is turned off for training.
foundation_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_cache=False,
)

In [None]:
from peft import LoraConfig

# Adapter settings handed to the trainer: rank-16 adapters on the attention
# query and value projections, no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

Formats each training example into a structured prompt-response pair

In [None]:
def build_prompt_completion(example):
    """Turn one dataset row into a prompt/completion pair.

    The prompt always carries the instruction and ends with the
    "### Response:" marker; an "### Input:" section is inserted only when the
    row's "input" field is non-empty. The completion is the row's "output".
    """
    has_context = bool(example["input"])
    prompt = (
        f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
        if has_context
        else f"### Instruction:\n{example['instruction']}\n\n### Response:"
    )
    return dict(prompt=prompt, completion=example["output"])

In [None]:
# Apply the formatter to every row, producing the "prompt" and "completion" fields.
train_sample = train_sample.map(build_prompt_completion)

Prepares the dataset for training by tokenizing each example's prompt which includes both instruction and expected response.

Truncates sequences longer than 512 tokens and pads shorter ones to that length for consistent batch shapes

In [None]:
def tokenize_fn(example):
    """Tokenize a single example into fixed-length causal-LM features.

    The text is the example's "prompt" followed by its "completion" when one
    is present, truncated or padded to 512 tokens. Labels mirror the input
    ids, except that padding positions are set to -100 so the loss ignores
    them.

    Args:
        example: Mapping with a "prompt" string and, optionally, a
            "completion" string (one example, not a batch).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    text = example["prompt"]
    if "completion" in example:
        # Train on instruction and expected answer together.
        text = f"{text} {example['completion']}"

    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    # Padding is marked by attention_mask == 0; those positions must not be learned.
    tokenized["labels"] = [
        token_id if attended else -100
        for token_id, attended in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

In [None]:
import os

# Training output folder. NOTE(review): `output_directory` is reassigned to a different path in the save cell after training.
output_directory = os.path.join(".", "peft_lab_outputs")

Defines the key hyperparameters and behaviors for fine-tuning the model using Hugging Face's Trainer or SFTTrainer

In [None]:
from transformers import TrainingArguments

# Reuse `output_directory` from the previous cell rather than repeating the path literal.
training_args = TrainingArguments(
    output_dir=output_directory,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=1,
    # No intermediate checkpoints; the adapter is saved explicitly after training.
    save_strategy="no"
)

Creates a data collator that dynamically batches tokenized examples for training.

In [None]:
from transformers import DataCollatorForLanguageModeling


# mlm=False selects causal (next-token) language modeling instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

SFTTrainer is a wrapper around Hugging Face’s Trainer that supports parameter-efficient fine-tuning (PEFT) via LoRA

In [None]:
from trl import SFTTrainer
from transformers import DataCollatorForLanguageModeling

# Fine-tune the quantized base model with the LoRA config on the prompt/completion dataset.
# NOTE(review): `tokenizer` is not passed to the trainer — presumably it loads its own
# from the model; confirm the pad-token and padding-side settings above still apply.
trainer = SFTTrainer(
    model=foundation_model,
    args=training_args,
    train_dataset=train_sample,
    peft_config=lora_config,
    data_collator=data_collator
)

In [None]:
# Run the fine-tuning loop configured by `training_args`.
trainer.train()

In [None]:
# Folder that receives the fine-tuned artifacts
output_directory = "./peft_outputs"
peft_model_path = os.path.join(output_directory, "lora_model")

# Write the trained model first, then the tokenizer, into the same folder.
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(peft_model_path)

### Step 5: Evaluate the fine-tuned model’s performance using the same metric.

In [None]:
import gc
import torch

# The trainer and the base model are no longer needed once the adapter is
# saved; drop their names so the memory they hold can actually be reclaimed.
for _stale_name in ("trainer", "foundation_model"):
    globals().pop(_stale_name, None)

gc.collect()
torch.cuda.empty_cache()

In [None]:
import os
# NOTE(review): torch was already imported in the previous cell; presumably the allocator
# reads this setting lazily — confirm it still takes effect when set at this point.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
from transformers import BitsAndBytesConfig

# NOTE(review): a later cell reassigns `bnb_config` before the model is loaded,
# so the settings below are never used.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

In [None]:
# Folder the fine-tuned model and tokenizer were saved to after training.
peft_model_path = "./peft_outputs/lora_model"

In [None]:
from transformers import AutoTokenizer

# Tokenizer saved next to the adapter; pad on the left with the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Use the same 4-bit settings as the baseline evaluation and the training run
# (NF4, double quantization, bfloat16 compute) so that the before/after ROUGE
# scores are not confounded by a different quantization setup.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=False  # no CPU offloading is used
)


In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Rebuild the fine-tuned model from the saved adapter folder, for inference only.
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    is_trainable=False,
)

In [None]:
from datasets import load_dataset

test_sample = load_dataset("gbharti/finance-alpaca", split="train[1000:1300]")

# Prompt the fine-tuned model with the exact template it was trained on:
# "### Instruction:", an optional "### Input:" section, then "### Response:".
prompts = [
    (
        f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:"
        if ex["input"]
        else f"### Instruction:\n{ex['instruction']}\n\n### Response:"
    )
    for ex in test_sample
]
references = [ex["output"] for ex in test_sample]

In [None]:
from evaluate import load
from tqdm import tqdm
import pandas as pd

metric = load("rouge")  # or use rouge_scorer if preferred
results = []
all_outputs = []

batch_size = 4
# Same generation budget as the baseline evaluation, so both ROUGE scores are comparable.
max_new_tokens = 64
loaded_model.eval()

for i in tqdm(range(0, len(prompts), batch_size)):
    batch_prompts = prompts[i:i+batch_size]
    batch_refs = references[i:i+batch_size]

    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(loaded_model.device)
    # With left padding every prompt occupies the first `prompt_length` positions of its row.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = loaded_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id
        )

    # Score only the newly generated tokens: the returned sequences start with
    # the prompt, which must not count towards the overlap with the reference.
    generated_tokens = outputs[:, prompt_length:]
    decoded_outputs = [
        # Mirror the baseline post-processing: keep the text after the last response marker.
        text.split("### Response:")[-1].strip()
        for text in tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    ]
    all_outputs.extend(decoded_outputs)

    for prompt_text, pred, ref in zip(batch_prompts, decoded_outputs, batch_refs):
        metric.add(prediction=pred, reference=ref)
        results.append({
            "prompt": prompt_text,
            "reference": ref,
            "prediction": pred
        })

# Create final DataFrame
results_df = pd.DataFrame(results)

# Compute ROUGE scores
rouge_scores = metric.compute()

In [None]:
# Per-example table with "prompt", "reference" and "prediction" columns for the fine-tuned model.
results_df

In [None]:
# ROUGE scores computed over all fine-tuned predictions.
rouge_scores