# DAT6004 Week 6
# Activity-1: Fine-tuning an LLM using LoRA on a CPU

## The objective of this worksheet is to demonstrate how large pre-trained language models (LLMs) can be fine-tuned using LoRA to perform a specific downstream task, such as Question Answering (Q-A), using limited computational resources (CPU only).

In [None]:
# !pip install -U peft

In [None]:
# from google.colab import files

## 1. Import libraries

In [1]:
import pandas as pd

In [2]:
# Used to convert data into a Hugging Face Dataset format
from datasets import Dataset

In [3]:
# Used to import tools from Hugging Face to load models, tokenizers, and training utilities
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [4]:
# Used to import LoRA-related tools
from peft import LoraConfig, get_peft_model, TaskType

In [5]:
# Used for tensor manipulation and model training
import torch

## 2. Load dataset

In [6]:
# Loads a CSV file containing question-answer pairs - each row has two columns: "question" and "answer"
df = pd.read_csv('QA_Dataset.csv') 

## 3. Create prompt text column

In [7]:
# Combines question and answer into a single prompt format - mimicing how generative models learn from prompts
df["text"] = "### Question:\n" + df["question"] + "\n\n### Answer:\n" + df["answer"]

## 4. Convert to Hugging Face Dataset

In [8]:
# Converts the pandas DataFrame into a Hugging Face-compatible Dataset object - keeps only the "text" column
dataset = Dataset.from_pandas(df[["text"]])

## 5. Load tokenizer and model

In [9]:
model_name = "google/gemma-2b"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
# Sets the padding token to be the same as the End-of-Sequence (EOS) token for compatibility
tokenizer.pad_token = tokenizer.eos_token 

In [12]:
# Add padding to the end of the input, not the beginning
tokenizer.padding_side = "right"

## 6. Tokenize function

In [13]:
# Tokenizes the text into input IDs and attention masks - ensures all inputs are the same length (max_length=512)

def tokenize(batch):
    tokenized = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512, add_special_tokens=True)
    # Initialize an empty list to store the labels for each tokenized input
    labels = []           
    # Iterate through each tokenized sequence of input IDs
    for input_ids in tokenized["input_ids"]:
        # Create a label sequence where each token ID remains the same, except for padding tokens which are replaced with -100
        label = [(token if token != tokenizer.pad_token_id else -100) for token in input_ids]
        labels.append(label)
        tokenized["labels"] = labels
    return tokenized

## 7. Tokenize the dataset

In [14]:
# Applies the tokenizer to each batch of examples - batched=True enables faster vectorized processing
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=8)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 20
})

## 8. Clean & format dataset

In [16]:
# Removes original text column to save memory
tokenized_dataset = tokenized_dataset.remove_columns("text")

In [17]:
# Converts the dataset format to PyTorch tensors, ready for training
tokenized_dataset.set_format("torch")

## 9. Load full model

In [18]:
# Loads the full causal language model for text generation - this is the model we will fine-tune using LoRA
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
for name, module in model.named_modules():
    print(name)


model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.self_attn.o_proj
model.layers.2.mlp
model.layers.2.mlp.gate_proj
model.l

## 10. Prepare for LoRA (CPU version, no 4-bit)

In [29]:
lora_config = LoraConfig(
    r=8,                                  # Rank of the low-rank adapters
    lora_alpha=16,                        # Scaling factor for LoRA updates
    target_modules=["q_proj", "v_proj"],  # Specific layers to apply LoRA to - module-names are model specific
    lora_dropout=0.1,                     # Applies dropout to only the LoRA weights to control regularization
    bias="none",                          # Controls whether the bias terms in the model are also adapted
    task_type=TaskType.CAUSAL_LM          # Tells PEFT the task type - determines which parts of the model to freeze/unfreeze
)

In [30]:
# Adds LoRA adapters to the model without changing the original weights
model = get_peft_model(model, lora_config)

In [31]:
model.print_trainable_parameters()

trainable params: 921,600 || all params: 2,507,094,016 || trainable%: 0.0368


## 11. Data collator and training arguments

In [22]:
# Assembles and formats batches during training - pads sequences, aligns labels, formats inputs etc.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [23]:
# Hyper parameter sapce
training_args = TrainingArguments(
    output_dir="./gemma-lora-qa",   # Where to save model checkpoints
    per_device_train_batch_size=1,  # Controls how many examples are included in a single training step
    gradient_accumulation_steps=2,  # Controls how many batches the model processes before updating the weights
    learning_rate=0.0001,           # Step size for weight updates during training
    num_train_epochs=1,             # Controls how many times to go over the full training dataset
    logging_steps=1,                # Controls logging training loss and other metrics every N steps
    save_steps=10,                  # Controls saving a model checkpoint every N steps
    save_total_limit=1,             # Keeps only the most recent N checkpoints to save disk space
    remove_unused_columns=False,    # Prevents Hugging Face from automatically dropping any unused columns in the dataset
    save_safetensors=False,         # Whether to save model weights in .safetensors format - set True for safe serialization
    report_to="none"                # Whether or not log to experiment tracking systems
)

## 12. Train the model

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# downgrade numpy if you get an error in Colab
# !pip install numpy==1.26.4

In [25]:
trainer.train()



Step,Training Loss
1,2.1957
2,2.8579
3,2.1227
4,2.4223
5,2.1555
6,2.5994
7,2.6479
8,2.8246
9,2.2366
10,2.4757


TrainOutput(global_step=10, training_loss=2.453838038444519, metrics={'train_runtime': 351.6965, 'train_samples_per_second': 0.057, 'train_steps_per_second': 0.028, 'total_flos': 121823601623040.0, 'train_loss': 2.453838038444519, 'epoch': 1.0})

## 13. Save the trained model

In [26]:
model.save_pretrained("./gemma-lora-qa/lora_adapter")

In [27]:
tokenizer.save_pretrained("./gemma-lora-qa/tokenizer")

('./gemma-lora-qa/tokenizer\\tokenizer_config.json',
 './gemma-lora-qa/tokenizer\\special_tokens_map.json',
 './gemma-lora-qa/tokenizer\\tokenizer.model',
 './gemma-lora-qa/tokenizer\\added_tokens.json',
 './gemma-lora-qa/tokenizer\\tokenizer.json')