# DAT6004 Week 6
# Activity-2: Using the Fine-Tuned LLM

## The objective of this worksheet is to demonstrate how to use a fine-tuned model.

In [1]:
# !pip install --upgrade evaluate datasets huggingface_hub

## 1. Import libraries

In [2]:
import pandas as pd

In [3]:
# Used to import tools from Hugging Face to load models and tokenizers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [4]:
# Import class to load PEFT (e.g. LoRA) adapter on a base model
from peft import PeftModel

In [5]:
# Used for tensor manipulation and model training
import torch

## 2. Load dataset

In [6]:
# Load the CSV of question-answer pairs - each row has two columns: "question" and "answer"
df = pd.read_csv('QA_Dataset_Test.csv')

# Fail fast with a clear message if the columns read by the later cells are absent
assert {"question", "answer"} <= set(df.columns), (
    "QA_Dataset_Test.csv must contain 'question' and 'answer' columns"
)

## 3. Load tokenizer, base model, and LoRA adapter

In [7]:
# Model id of the base model; used below to load both the tokenizer and the base weights
base_model_name = "google/gemma-2b"

In [8]:
# Local path handed to PeftModel.from_pretrained below as the LoRA adapter location
lora_adapter_path = "./gemma-lora-qa/lora_adapter"

In [9]:
# Tokenizer directory under ./gemma-lora-qa; not used by default - the tokenizer cell
# loads from base_model_name and only suggests trying this path as an alternative
tokenizer_path = "./gemma-lora-qa/tokenizer"

In [10]:
# Load the tokenizer from the base model id.
# Exercise: replace base_model_name with tokenizer_path to load the tokenizer
# directory under ./gemma-lora-qa instead, and compare the answers.
tokenizer = AutoTokenizer.from_pretrained(base_model_name) # also try with tokenizer_path

In [11]:
# Load the base causal language model, requesting float32 weights
# (the model is run on the CPU later in this notebook)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float32
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Wrap the base model with the LoRA adapter stored at lora_adapter_path
# NOTE(review): torch_dtype is not one of the documented PeftModel.from_pretrained
# parameters - confirm whether it has any effect here or can be dropped
model = PeftModel.from_pretrained(
    base_model,
    lora_adapter_path,
    torch_dtype=torch.float32
)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


In [13]:
# Set the model to evaluation mode (disables dropout, etc.)
model.eval()

# Move the model to CPU for inference; as the last expression of the cell,
# the returned model is also displayed as the module tree shown below
model.to("cpu")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

## 4. Set up the Generator Pipeline

In [14]:
# Seed PyTorch's global random number generator: the pipeline samples its output
# (do_sample=True), so without a seed every run produces different answers and
# different metric values. With the seed, a top-to-bottom re-run in the same
# environment repeats the same generations.
torch.manual_seed(42)

# Text-generation pipeline around the LoRA-adapted model
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=64,        # upper bound on the length of each generated answer
    do_sample=True,           # sample tokens instead of greedy decoding
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1.1,
    device=-1                 # -1 runs the pipeline on the CPU
)

Device set to use cpu


## 5. Define QA function

In [15]:
def answer_question(question):
    """Generate the model's answer to a single question.

    The question is placed into the instruction / "### Question:" /
    "### Answer:" prompt template and sent to the text-generation pipeline.
    Only the newly generated text is returned; the prompt itself is left out
    because the pipeline is called with ``return_full_text=False``.
    """
    instruction = "You are a helpful assistant. Please respond with the answer to the given question only and avoid explanation etc.\n"
    prompt = instruction + f"### Question:\n{question}\n\n### Answer:"
    outputs = generator(prompt, return_full_text=False)
    # Use the generated text of the first returned item
    return outputs[0]['generated_text']

## 6. Test the model with a single question

In [16]:
# Try the pipeline on one hand-written question before running the whole test set
answer = answer_question("What is the capital of Pakistan?")

In [18]:
# Print the generated text as returned, including any leading blank lines
print(answer)


Answer:
Islamabad

<strong>Note:</strong> <em>The question was taken from one of my old assignments that I have uploaded on this website in past</em>.


In [19]:
# Try using the original tokenizer from the gemma-2b model and compare the results with the fine-tuned tokenizer

## 7. Generate predictions

In [20]:
# Will hold the raw model answer for each test question, in dataset order
predictions = []

In [21]:
# Generate one answer per test question, in dataset order.
# The list is built in a single expression instead of appending to an existing
# list, so this cell is idempotent: re-running it cannot leave extra entries
# that would no longer line up with the rows of df.
predictions = [answer_question(q) for q in df['question']]

In [22]:
# Store the raw generations next to their questions as a new column
df['predicted_answer'] = predictions

In [35]:
# Display the test set together with the model outputs.
# Note: the saved output below also contains clean_pred, which is only created in
# section 8 - this cell was last executed (In [35]) after the cleaning step. On a
# fresh top-to-bottom run it shows the columns up to predicted_answer.
df

Unnamed: 0,question,answer,predicted_answer,clean_pred
0,What is the capital of France?,Paris,\nParis,Paris
1,What is the capital of Walse?,Cardiff,\nWalse's capital is <strong>Palghar</strong>....,Walse's capital is <strong>Palghar</strong>.
2,Who is Imran Khan?,Former Pakistani Cricket Captan,"\nImran Khan, born in Lahore, Pakistan, is one...","Imran Khan, born in Lahore, Pakistan, is one o..."
3,Which country London is in?,United Kingdom,\na) Belgium\nb) France\nc) Switzerland\nd) Un...,a) Belgium


## 8. Evaluate performance using BLEU, ROUGE and BERTScore

In [23]:
#!pip install rouge_score

In [24]:
#!pip install bert-score

In [25]:
# !pip install evaluate

In [26]:
import evaluate

In [27]:
# Load the BLEU metric through the evaluate library
bleu = evaluate.load("bleu")

In [28]:
# Load the ROUGE metric through the evaluate library
rouge = evaluate.load("rouge")

In [29]:
# Clean model prediction by stripping whitespace and returning only the first line

def clean_pred(text):
    """Reduce a raw model generation to its first line of text.

    Surrounding whitespace and newlines are removed, then everything after the
    first remaining newline is discarded. The kept line is trimmed again so
    that trailing spaces or a carriage return (Windows-style line endings) do
    not end up in the text handed to the metrics.

    Args:
        text: Raw generated string. ``None`` is treated as an empty prediction.

    Returns:
        The first line of ``text`` without surrounding whitespace, or an empty
        string when there is no text.
    """
    if text is None:
        return ""
    # Remove leading/trailing whitespace so the first line is never blank
    stripped = text.strip()
    # Keep only what precedes the first newline; one split is enough
    first_line = stripped.split('\n', 1)[0]
    # strip() above trims the ends of the whole string only, not the end of this line
    return first_line.rstrip()

In [30]:
# Apply cleaning to each predicted answer and store in new column

# Run clean_pred over every raw generation; the results form the clean_pred column
df['clean_pred'] = df['predicted_answer'].map(clean_pred)

In [31]:
# Cleaned predictions as a plain list of strings for the metric calls below.
# Note: this rebinds `predictions`, which until now held the raw generations.
predictions = df["clean_pred"].tolist()

In [32]:
# Wrap each gold answer in its own one-element list, giving a list of lists of reference strings
references = [[gold_answer] for gold_answer in df["answer"]]

In [33]:
# Score the cleaned predictions against the reference answers with BLEU.
# NOTE(review): the 0.0000 printed below is presumably because the answers are only a
# few tokens long while BLEU's default n-gram order is 4 - confirm against the metric docs
bleu_result = bleu.compute(predictions=predictions, references=references)

In [34]:
# Print the 'bleu' entry of the result to four decimals
print(f"BLEU score: {bleu_result['bleu']:.4f}")

BLEU score: 0.0000


In [36]:
# Score the cleaned predictions against the gold answers with ROUGE.
# Plain Python lists (not pandas Series) are passed so the inputs have the same
# form as in the BLEU and BERTScore cells and do not depend on the DataFrame index.
rouge_result = rouge.compute(
    predictions=df['clean_pred'].tolist(),
    references=df['answer'].tolist(),
)


In [37]:
# Print the rouge1, rouge2 and rougeL entries of the result to four decimals
print(f"ROUGE-1 F1: {rouge_result['rouge1']:.4f}")
print(f"ROUGE-2 F1: {rouge_result['rouge2']:.4f}")
print(f"ROUGE-L F1: {rouge_result['rougeL']:.4f}")

ROUGE-1 F1: 0.2635
ROUGE-2 F1: 0.0000
ROUGE-L F1: 0.2635


In [38]:
# Load the BERTScore metric through the evaluate library
bertscore = evaluate.load("bertscore")

In [39]:
# Compute BERTScore for English text (lang="en").
# As the log below shows, this run loads roberta-large as the scoring model.
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# bertscore_result contains precision, recall, and F1 lists
# The 'f1' list is averaged here into one summary number
# (raises ZeroDivisionError if the list is empty)
average_f1 = sum(bertscore_result['f1']) / len(bertscore_result['f1'])


In [41]:
# Print the averaged BERTScore F1 to four decimals
print(f"BERTScore F1: {average_f1:.4f}")

BERTScore F1: 0.8572
