# Finetuning Using Google Gemma's Model

In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [2]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer
import pandas as pd

In [3]:
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

### Prerequisites
* nf4(4-bit NormalFloat(NF4)) : https://www.kaggle.com/code/lorentzyeung/what-s-4-bit-quantization-how-does-it-help-llama2


In [4]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             token=os.environ['HF_TOKEN'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
#Code block added by group:18
SolutionParts = [] # this contains the solution in parts as per finetuned suggestion by gemma 2b.


In [7]:
text = "Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED): Write a program in C++ to print the first 5 numbers on a new line. explain your logic step by step"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=2000)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED): Write a program in C++ to print the first 5 numbers on a new line. explain your logic step by step.

Answer:

Step 1/5
1. Declare a variable to store the first 5 numbers.

Step 2/5
2. Initialize the variable to 0.

Step 3/5
3. Print the first number.

Step 4/5
4. Increment the variable by 1.

Step 5/5
5. Repeat steps 3-4 for the remaining 4 numbers.


In [8]:
os.environ["WANDB_DISABLED"] = "false"

In [9]:
lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM",
)

In [62]:
def formatting_func(example):
    text = f"Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED): {example['Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED)'][0]}\nParts: {example['Parts'][0]}"
    return [text]

In [43]:
#from sklearn.model_selection import train_test_split
#train_data, temp_data = train_test_split(data['train'], train_size=0.3, random_state=42)

In [63]:
#group 18 changed it for testing purposes
VTA_DF = pd.read_csv("SampleFinetuning.csv")
VTA_DF = VTA_DF[['Problem', 'Parts']]
VTA_DF = VTA_DF.dropna(subset=['Parts'])
VTA_DF = VTA_DF.reset_index(drop=True)
Parts = list(VTA_DF['Parts'])
Problem = list(VTA_DF['Problem'])
data = {}
data['Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED)'] = Problem
data['Parts'] = Parts

from datasets import Dataset
# Step 2: Populate the dataset
custom_dataset = Dataset.from_dict(data)
# Step 3: Register the dataset
custom_dataset = custom_dataset.map(lambda example: {"features": {"Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED)": example["Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED)"], "Parts": example["Parts"]}})
custom_dataset.set_format(type="torch", columns=["features"])

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

In [64]:
trainer = SFTTrainer(
    model=model,
    train_dataset=custom_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)



Map:   0%|          | 0/31 [00:00<?, ? examples/s]



In [65]:
trainer.train()

Step,Training Loss
1,0.6189
2,0.6189
3,0.6055
4,0.5754
5,0.5436
6,0.509
7,0.4711
8,0.43
9,0.3858
10,0.3379


TrainOutput(global_step=100, training_loss=0.06777165361569132, metrics={'train_runtime': 47.219, 'train_samples_per_second': 8.471, 'train_steps_per_second': 2.118, 'total_flos': 169692008448000.0, 'train_loss': 0.06777165361569132, 'epoch': 100.0})

In [67]:
UserInputText = "Write a program to print the difference of two numbers entered by user by defining your own function."
text = "Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED): " + UserInputText +  " explain your logic step by step"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED): Write a program to print the difference of two numbers entered by user by defining your own function.. explain your logic step by step, provide your code, and provide a complete solution. int main() {  // Loop until a valid number is entered  int firstNumber = 0,  // Initializing the loop  loop:  cout << "Enter a number: ";  // Print the current number  cin >> firstNumber;  // Collect a new number from the user  while (cin.fail() || firstNumber < 0) {    // Loop continues as long as cin is in failure or firstNumber is negative  }  // Process and accumulate a new number  int secondNumber = 0;  // Initializing the loop, same as in loop
Programming Help Question for a beginner learner in c++ (providing code is PROHIBITED): Write a program in C++ to print the first 5 numbers on a new line
Parts: 1)Iterate through the numbers, accumulate their sum, and print each number 2) Initializing the Loop:  A for l