In [1]:
%%capture
# Upgrade the fine-tuning stack in the kernel's environment; %%capture hides pip's output.
# NOTE(review): versions are unpinned, so a re-run may pull different releases — consider pinning.
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
# kagglehub is installed with its hf-datasets extra (no -U on this one).
%pip install kagglehub[hf-datasets]

In [4]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting jupyterlab_widgets~=3.0.15
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.6/216.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting widgetsnbextension~=4.0.14
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
Successfully installed ipywidgets-8.1.7 jupyterlab_widgets-3.0.15 widgetsnbextension-4.0.14
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[3

In [5]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; in the notebook this renders a login widget.
# NOTE(review): new_session=False presumably reuses an already-saved token
# instead of asking again — confirm against the huggingface_hub docs.
login(new_session=False)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 1. Import relevant libraries

Import the libraries needed for the fine-tuning task.

In [6]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.
E0000 00:00:1751648882.490982      10 common_lib.cc:612] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:230


In [7]:
# Hub identifier of the checkpoint; used below to load both the tokenizer and the model.
model_id = "google/gemma-2-2b"

In [8]:
# Load the tokenizer that belongs to model_id.
# NOTE(review): add_eos_token=True presumably makes the tokenizer append EOS to encoded text — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [9]:
# This tokenizer is aware of the start of the turn and the end of the turn:
# each marker in the string below should come back as one single token.
text = "<start_of_turn><end_of_turn>"
tokens = tokenizer.tokenize(text)
print(tokens)

['<start_of_turn>', '<end_of_turn>']


In [10]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Read the recipe CSV into a DataFrame and keep a copy of its first 10,000 rows.
df = pd.read_csv("/kaggle/input/explore-recipe-nlg-dataset/recipe_df.csv")
df_subset = df.iloc[:10000].copy()

# Seeded, shuffled split: 2000 rows are held out for testing, the rest is
# used for training. Both parts get a fresh 0..n-1 index.
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df_subset, test_size=2000, random_state=42, shuffle=True)
)

# Wrap each DataFrame in a Hugging Face Dataset tagged with its split name.
train_ds, test_ds = (
    Dataset.from_pandas(frame, split=split_name)
    for frame, split_name in ((train_df, "train"), (test_df, "test"))
)

### 2. Make a prompt template 

Apply the prompt template to the dataset

Then remove the unnecessary columns from the dataset

In [12]:
import ast

def generate_prompt(data_point):
    """Build a Gemma-style prompt/completion pair from one recipe row.

    Args:
        data_point: Mapping with the keys "NER", "ingredients", "directions"
            and "title". The three list fields may hold real lists/tuples or
            their string representation (e.g. "['salt', 'milk']").

    Returns:
        dict with two strings:
            "prompt": the user turn listing the entries of "NER".
            "completion": the model turn with title, ingredients and directions.

    Each list field is handled on its own: a field that is missing,
    unparsable, or not a list/tuple yields an empty section without
    affecting the other two fields.
    """

    def _as_lines(field):
        # Return the field's items as a list of strings, or [] if unusable.
        try:
            raw = data_point[field]
            if isinstance(raw, str):
                raw = ast.literal_eval(raw)
        except Exception as e:
            # Best effort, as before: report the problem and keep going.
            print("Parsing error:", e)
            return []
        # None/NaN or a scalar literal cannot be joined line by line, and a
        # bare string would otherwise be split into single characters.
        if not isinstance(raw, (list, tuple)):
            return []
        # Coerce items so that numbers or None inside a list do not break join().
        return [str(item) for item in raw]

    input_ingredients = "\n".join(_as_lines("NER"))
    ingredients_text = "\n".join(_as_lines("ingredients"))
    directions_text = "\n".join(_as_lines("directions"))

    prompt = (
        "<start_of_turn>user\n"
        "Below is a list of ingredients. Write a complete recipe using them.\n\n"
        f"{input_ingredients}\n"
        "<end_of_turn>\n"
    )

    completion = (
        "<start_of_turn>model\n"
        f"Title: {data_point['title']}\n"
        f"Ingredients:\n{ingredients_text}\n"
        f"Directions:\n{directions_text}\n"
        "<end_of_turn>"
    )

    return {"prompt": prompt, "completion": completion}

In [13]:
# Run generate_prompt over every row of both splits (train first, then
# test), which adds the "prompt" and "completion" columns to each.
train_dataset, test_dataset = (
    split.map(generate_prompt) for split in (train_ds, test_ds)
)

Map: 100%|██████████| 8000/8000 [00:01<00:00, 5047.91 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 5215.97 examples/s]


In [14]:
# Summaries of both mapped datasets: test split first, then train split.
print(test_dataset, train_dataset, sep="\n")

Dataset({
    features: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER', 'prompt', 'completion'],
    num_rows: 2000
})
Dataset({
    features: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER', 'prompt', 'completion'],
    num_rows: 8000
})


In [15]:
# Inspect the prompt built for the first training example.
train_dataset[0]["prompt"]

'<start_of_turn>user\nBelow is a list of ingredients. Write a complete recipe using them.\n\nwhite bread\ncream cheese\nvanilla\nmilk\nbutter\nsugar\ncinnamon\n<end_of_turn>\n'

### 3. Load the model for training 

Load the model for fine-tuning and quantize it to 4-bit while loading.

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Quantization settings: 4-bit loading with the "nf4" quant type, double
# quantization switched on, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the causal LM for model_id with that quantization config;
# device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00,  3.30s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [01:26<00:00, 28.98s/it]


### 4. Apply LoRA (Low-Rank Adaptation)

LoRA is a parameter-efficient fine-tuning technique that reduces the number of trainable parameters during the fine-tuning process. 

Instead of updating all the weights of the original model, LoRA adds small, trainable matrices (low-rank matrices) to the existing model layers.

In [17]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

# Turn on gradient checkpointing, then hand the quantized model to PEFT's
# k-bit training preparation; the prepared model replaces `model`.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [18]:
import bitsandbytes as bnb

def find_all_linear_names(model, linear_cls=None):
    """Collect the names of the linear layers that LoRA adapters can target.

    A linear layer's weight matrix is what LoRA augments with a pair of
    low-rank matrices, so every module of the given linear class is a
    candidate.

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.
        linear_cls: Class (or tuple of classes) to treat as a linear layer.
            Defaults to ``bnb.nn.Linear4bit``, which is what this function
            used before the parameter existed.

    Returns:
        Sorted list of the unique last components of the matching module
        names (e.g. "q_proj"), with "lm_head" left out.
    """
    if linear_cls is None:
        # Resolved lazily so callers supplying their own class do not need bitsandbytes.
        linear_cls = bnb.nn.Linear4bit

    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is excluded from the targets (needed for 16-bit).
    lora_module_names.discard('lm_head')

    # Sorted so the result does not depend on set iteration order.
    return sorted(lora_module_names)

In [19]:
# Names of the matching linear layers; passed to LoraConfig as target_modules below.
modules = find_all_linear_names(model)
print(modules)

['o_proj', 'gate_proj', 'down_proj', 'k_proj', 'v_proj', 'q_proj', 'up_proj']


In [20]:
# add the peft to the found layers of transformer model
from peft import LoraConfig, get_peft_model

# LoRA settings for a causal-LM task: rank 64, alpha 32, 5% dropout, no
# bias training, applied to every layer name collected in `modules`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=modules,
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the LoRA configuration.
model = get_peft_model(model, lora_config)

In [21]:
# Show the device that holds the model's first parameter.
print(next(model.parameters()).device)

cpu


In [22]:
# Report the number of trainable parameters, the total parameter count,
# and the trainable share as a percentage.
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 83066880 | total: 2697408768 | Percentage: 3.0795%


### 5. Train the model using the SFT trainer

For training, the trainer is pointed at the dataset's "prompt" column through the following setting:

dataset_text_field="prompt",

In [None]:
import transformers
from trl import SFTConfig
from trl import SFTTrainer
from transformers import DataCollatorForLanguageModeling


# Fall back to EOS for padding only when the tokenizer has no pad token of
# its own. DataCollatorForLanguageModeling(mlm=False) excludes pad-token
# positions from the loss, so reusing EOS as the pad token would also
# exclude every genuine EOS and the model would not learn to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

# SFTConfig is used in place of plain TrainingArguments.
sft_config = SFTConfig(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=10,  # total number of optimizer steps (used instead of an epoch count)
    learning_rate=2e-4,
    logging_steps=1,
    save_strategy="epoch",
    optim="paged_adamw_8bit",  # If using bitsandbytes or quantization
    dataset_text_field="prompt",
    # The deprecated `no_cuda=False` was dropped: False is already the default.
    label_names=["labels"],
)

# `model` was already wrapped with the LoRA adapters by get_peft_model in an
# earlier cell, so peft_config is not passed a second time here.
# NOTE(review): how SFTTrainer treats an already-wrapped model differs
# between TRL releases — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=sft_config,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

E0000 00:00:1751649178.125011      10 common_lib.cc:621] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:232
Adding EOS to train dataset: 100%|██████████| 8000/8000 [00:00<00:00, 8180.65 examples/s]
Tokenizing train dataset: 100%|██████████| 8000/8000 [00:08<00:00, 984.76 examples/s] 
Truncating train dataset: 100%|██████████| 8000/8000 [00:00<00:00, 164220.08 examples/s]


In [None]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
# Start fine-tuning with the trainer built in the previous cell.
trainer.train()

In [None]:
new_model = "gemma2-Recipe-Instruct-Finetune"