In [1]:
# pip installs
# !pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
# !pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib wandb

In [2]:
## imports

import math
import os
import re
from datetime import datetime

import matplotlib.pyplot as plt
import torch
import transformers
import wandb
from datasets import load_dataset, Dataset, DatasetDict
from google.colab import userdata
from huggingface_hub import login
from peft import LoraConfig
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer, SFTConfig

%matplotlib inline

In [3]:
## Model and project identifiers
HF_USER = "12thilina"                         # Hugging Face namespace used to build the Hub repo id
PROJECT_NAME = "thilina-llama-3.1-tuned"      # shared prefix for the W&B project and the run/repo name
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"   # checkpoint id the tokenizer and base model are loaded from

In [4]:
## Dataset
# Hub id of the dataset passed to load_dataset(). A plain string literal is
# enough here: the previous f-string had no placeholders.
DATASET_NAME = "12thilina/sample_amazon_reviews"
# Passed to SFTConfig as max_seq_length.
MAX_SEQUENCE_LENGTH = 182

In [5]:
## Names derived from the current timestamp, used for the W&B run and the Hub repo
# The timestamp is taken once, when this cell runs; re-running the cell yields new names.
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
PROJECT_RUN_NAME = "-".join([PROJECT_NAME, RUN_NAME])
HUB_MODEL_NAME = "/".join([HF_USER, PROJECT_RUN_NAME])

In [6]:
## QLoRA hyperparameters (consumed by LoraConfig and the quantization cell)
QUANT_4_BIT = True       # True -> 4-bit quantization config, False -> 8-bit
LORA_R = 8               # passed to LoraConfig as `r`
LORA_ALPHA = 16          # passed to LoraConfig as `lora_alpha`
LORA_DROPOUT = 0.1       # dropout on the LoRA layers, to reduce overfitting
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]  # module names that receive LoRA adapters

In [7]:
## Training hyperparameters (consumed by SFTConfig)
EPOCHS = 1
LEARNING_RATE = 1e-4
LR_SCHEDULER_TYPE = "cosine"
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"
# Per-device batch size depends on the accelerator. The original note says only
# 4 fits on the "TPU 4" runtime (possibly a T4 GPU was meant -- TODO confirm).
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 1

In [8]:
## Logging / checkpointing settings
LOG_TO_WANDB = True   # master switch for the Weights & Biases integration
STEPS = 50            # logging interval, passed to SFTConfig as logging_steps
SAVE_STEPS = 2000     # checkpoint interval; each save is also uploaded to the hub

In [9]:
# Display the Hub repo id ("<HF_USER>/<PROJECT_NAME>-<timestamp>") built for this run.
HUB_MODEL_NAME

'12thilina/thilina-llama-3.1-tuned-2025-09-11_07.10.37'

# More on Optimizers

https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one#optimizer-choice

The most common optimizer is Adam or AdamW (Adam with Weight Decay).  
Adam achieves good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory footprint of the order of the number of model parameters.


In [10]:
## Authenticate with the Hugging Face Hub
# The token is fetched from Colab `userdata` instead of being hard-coded in the notebook.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [11]:
## Log in to Weights & Biases
# The API key is fetched from Colab `userdata` and exported through the
# WANDB_API_KEY environment variable; wandb.login() is then called without
# arguments.
wandb_api_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_api_key
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mlatjkansa[0m ([33mlatjkansa-uoc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [12]:
## Configure Weights & Biases to record against our project
# These environment variables are read by the transformers W&B integration.
os.environ["WANDB_PROJECT"] = PROJECT_NAME
# Upload checkpoints as W&B artifacts only when W&B logging is enabled.
# "false" disables model upload entirely; the previous fallback of "end" still
# requested an upload at the end of training even with LOG_TO_WANDB off.
os.environ["WANDB_LOG_MODEL"] = "checkpoint" if LOG_TO_WANDB else "false"
os.environ["WANDB_WATCH"] = "gradients"

In [39]:
## Load both splits; fine-tune on only the first 250 training examples
dataset = load_dataset(DATASET_NAME)
train = dataset['train'].select(range(250))
test = dataset['test']  # the test split is kept in full (no subsetting)

In [40]:
# Sanity check: number of training examples after subsetting.
len(train)

250

In [41]:
# Sanity check: number of examples in the (unsubsetted) test split.
len(test)

2000

In [14]:
# Start a W&B run named after this session, but only when W&B logging is enabled.
if LOG_TO_WANDB:
  wandb.init(project=PROJECT_NAME, name=RUN_NAME)

## Quantization

In [15]:
# Build the bitsandbytes quantization config used when loading the base model.
if QUANT_4_BIT:
  # 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
  )
else:
  # 8-bit loading. BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option
  # (the previous keyword was ignored as an unused kwarg), so only the
  # load flag is passed.
  quant_config = BitsAndBytesConfig(
    load_in_8bit=True
  )

In [16]:
## Load the Tokenizer and the Model
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally -- confirm it is actually needed for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model with the quantization config built earlier;
# device_map="auto" leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config = quant_config,
    device_map = "auto",
)
# Keep generation padding consistent with the tokenizer's pad token.
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Report the loaded model's memory footprint (bytes / 1e6, shown as MB).
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e6:.1f} MB")

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Memory footprint: 5591.5 MB


# Data Collator

* It's important that we ensure during Training that we are not trying to train the model to predict the description of products; only their price.

* We need to tell the trainer that everything up to "Price is $" is there to give context to the model to predict the next token, but does not need to be learned.

* The trainer needs to teach the model to predict the token(s) after "Price is $".

* There is a complicated way to do this by setting Masks, but luckily HuggingFace provides a super simple helper class to take care of this for us.

In [42]:
# Inspect one training example: its 'text' ends with the answer after "Price is $".
train[0]

{'text': 'How much does this cost to the nearest dollar?\n\nand Replacement Range Cooktop Drip Pans fit GE, Hotpoint - Two 6 Inch and Two 8 Inch Pans (4 pieces)\nContents 2 x (6 inches) and 2 x (8 inches) bowls, 4 drip bowls total Compatibility This replacement kit works with GE, Hotpoint, Moffat, Monogram (GE), Profile (GE), RCA (GE), and Roper models prior to 1996. replaces 65975, replaces and 65974, 770169 Premium quality Drip bowls are made of durable high-quality material. It features a chrome finish, well-tested by the manufacturer. Durable, stick-free, easy to clean, and dishwasher safe. Ensure long-lasting and effective performance Easy to install Shut off electrical power, tilt the coil\n\nPrice is $12.00',
 'price': 11.99}

In [43]:
# Inspect one test example: its 'text' stops right after "Price is $".
test[0]

{'text': "How much does this cost to the nearest dollar?\n\nSetpower Insulated Protective Cover for AJ30 Portable Refrigerator Freezer, suitable for AJ30 Only\nInsulation & Waterproof well-made insulation could save battery power and improve cooling efficiency by preventing cold air from flowing away. Durable and Foldable with its oxford cloth outer layer, it's durable and protects your portable refrigerator from scratches and dust. Expanded Bag for Accessories two expanded bags on its side, expand space to store the other accessories. Great Ventilation a hollowed design for positions of vents doesn't affect the ventilation. Attention this insulated cover is ONLY suitable for SetPower AJ30 portable refrigerator. FIT TO AJ30 ONLY. Brand Name Setpower, Model Info AJ30 COVER, model number AJ30 COVER, Installation Type Freestanding, Part AJ30 cover, Special Features Portable, Color\n\nPrice is $",
 'price': 65.99}

In [44]:
## Completion-only data collator
# DataCollatorForCompletionOnlyLM is imported in the notebook's import cell.
# Everything up to and including the response template is context only; the
# trainer learns to predict just the tokens that follow "Price is $".
response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

## Configuration for Training

Two objects need to be created:

* A LoraConfig object with our hyperparameters for LoRA

* An SFTConfig with our overall Training parameters

In [45]:
## LoRA adapter configuration, built from the QLoRA hyperparameters above
lora_parameters = LoraConfig(
    task_type = "CAUSAL_LM",
    target_modules = TARGET_MODULES,
    r = LORA_R,
    lora_alpha = LORA_ALPHA,
    lora_dropout = LORA_DROPOUT,
    bias = "none",
)

## Specify the general configuration parameters for training
train_parameters = SFTConfig(
    output_dir = PROJECT_RUN_NAME,
    num_train_epochs = EPOCHS,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = 1,
    eval_strategy = "no",
    gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
    optim = OPTIMIZER,
    save_steps = SAVE_STEPS,
    save_total_limit = 10,
    logging_steps = STEPS,
    learning_rate = LEARNING_RATE,
    weight_decay = 0.001,
    fp16 = False,
    bf16 = True,
    max_grad_norm = 0.3,
    max_steps = -1,  # -1: run length is governed by num_train_epochs
    warmup_ratio = WARMUP_RATIO,
    group_by_length = True,
    lr_scheduler_type = LR_SCHEDULER_TYPE,
    # Use the string "none" to disable reporting. Passing None makes
    # transformers 4.x fall back to "all" installed integrations, which would
    # still log to W&B when LOG_TO_WANDB is False.
    report_to = "wandb" if LOG_TO_WANDB else "none",
    run_name = RUN_NAME,
    max_seq_length = MAX_SEQUENCE_LENGTH,
    dataset_text_field = "text",
    save_strategy = "steps",
    # Every checkpoint save is also pushed to the private Hub repo below.
    hub_strategy = "every_save",
    push_to_hub = True,
    hub_model_id = HUB_MODEL_NAME,
    hub_private_repo = True
)

## Supervised Fine Tuning
# Seed the RNGs before the trainer is built so that the randomly initialised
# LoRA adapter weights are reproducible across Restart & Run All. The seed is
# the one already carried by the training config.
set_seed(train_parameters.seed)

# The trainer wraps the quantized base model with the LoRA config and uses the
# completion-only collator to build batches.
fine_tuning = SFTTrainer(
    model = base_model,
    train_dataset = train,
    peft_config = lora_parameters,
    args = train_parameters,
    data_collator = collator
)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [46]:
## Fine-tune the model
# Runs the training loop configured above; the returned TrainOutput is shown as the cell output.
fine_tuning.train()



Step,Training Loss
50,1.5977


[34m[1mwandb[0m: Adding directory to artifact (thilina-llama-3.1-tuned-2025-09-11_07.10.37/checkpoint-63)... Done. 4.7s


TrainOutput(global_step=63, training_loss=1.5548209538535467, metrics={'train_runtime': 668.5669, 'train_samples_per_second': 0.374, 'train_steps_per_second': 0.094, 'total_flos': 2002660041867264.0, 'train_loss': 1.5548209538535467, 'epoch': 1.0})

In [47]:
## Push the fine-tuned adapter to Hugging Face
# Use the fully qualified repo id (HF_USER/PROJECT_RUN_NAME), the same id the
# trainer is configured with via hub_model_id. This keeps the final push in
# the same repo as the checkpoint uploads, whichever account is logged in.
fine_tuning.model.push_to_hub(HUB_MODEL_NAME, private=True)
print(f"Saved to the hub: {HUB_MODEL_NAME}")

README.md:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ..._07.10.37/adapter_model.safetensors: 100%|##########| 27.3MB / 27.3MB            

No files have been modified since last commit. Skipping to prevent empty commit.


Saved to the hub: thilina-llama-3.1-tuned-2025-09-11_07.10.37


In [48]:
# Close the W&B run started earlier in this notebook (only when W&B logging is enabled).
if LOG_TO_WANDB:
  wandb.finish()

0,1
train/epoch,▁█
train/global_step,▁█
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
total_flos,2002660041867264.0
train/epoch,1.0
train/global_step,63.0
train/grad_norm,4.88342
train/learning_rate,1e-05
train/loss,1.5977
train_loss,1.55482
train_runtime,668.5669
train_samples_per_second,0.374
train_steps_per_second,0.094
