# Fine-tuning Gemma 2B on Google Colab

This notebook is designed to fine-tune the Gemma 2B Instruct model using LoRA (Low-Rank Adaptation) on Google Colab. It uses the Unsloth library for efficient training. This lighter model reduces training time and resource requirements compared to larger models.

## Prerequisites:
- Google Colab account with GPU enabled (Runtime > Change runtime type > T4 GPU)
- Hugging Face account and token
- Your dataset in JSON format

## Steps:
1. Upload your dataset.json to /content/ in Colab
2. Set your Hugging Face username and dataset name
3. Login to Hugging Face
4. Install required packages
5. Run the cells sequentially

## Important Notes:
- Gemma 2B is much lighter than Llama3 8B, requiring less GPU memory and training time.
- Training should complete in 10-30 minutes on a T4 GPU with 3 epochs.
- Ensure your dataset is in the correct format: list of dicts with 'instruction', 'input', 'output' keys.
- Monitor GPU usage; this model should fit comfortably in Colab's free tier.

In [None]:
# Install required packages
# The leading "!" is a notebook shell escape: each line below runs pip in the
# runtime's shell rather than as Python, so this cell only works in a
# notebook environment such as Colab.
!pip install unsloth
!pip install transformers datasets accelerate peft trl bitsandbytes
!pip install huggingface_hub

# Import libraries
import torch
import json
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer
from huggingface_hub import login
from datasets import load_dataset

# Login to Hugging Face
# The token is taken from the HF_TOKEN environment variable, or typed at a
# hidden prompt, so the secret is never saved inside the notebook file.
from getpass import getpass

_hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
# Keep the token in the process environment so any later code that reads
# HF_TOKEN can use it without asking again.
os.environ["HF_TOKEN"] = _hf_token
login(_hf_token)

In [None]:
# Set your configurations
from getpass import getpass

huggingface_user = "FieryXcalibur"  # Your HF username
dataset_name = "your_dataset_name"  # Name for your dataset on HF
# Your HF token: read from the HF_TOKEN environment variable, otherwise typed
# at a hidden prompt. Do not paste the token into this cell -- it would be
# stored in the notebook and shared with anyone who receives the file.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Login to HF
login(hf_token)

In [None]:
# Check GPU availability
!nvidia-smi
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

In [None]:
# Reuse the values already set earlier in the session instead of overwriting
# them: re-assigning the literals here would silently undo any edit made to
# the username or dataset name in an earlier cell. The literals below are
# only fallbacks for when this cell is run on its own.
huggingface_user = globals().get("huggingface_user", "FieryXcalibur")
dataset_name = globals().get("dataset_name", "your_dataset_name")  # Set your dataset name

class GemmaInstructDataset:
    """Turn instruction-tuning records into Gemma chat-formatted prompts.

    Each record is a mapping with 'instruction' and 'output' keys and an
    optional 'input' key. All prompts are built once, at construction time,
    and stored in ``self.prompts`` in the same order as ``data``.
    """

    def __init__(self, data):
        """Store ``data`` (an iterable of records) and build every prompt."""
        self.data = data
        self.prompts = []
        self.create_prompts()

    def create_prompt(self, row):
        """Return the training prompt for a single record.

        The user turn holds the instruction followed, on a new line, by the
        input. A missing, ``None`` or blank 'input' is left out entirely, so
        such records neither raise ``KeyError`` nor leave a dangling newline
        before ``<end_of_turn>``.

        Raises:
            KeyError: if 'instruction' or 'output' is absent from ``row``.
        """
        user_lines = [str(row['instruction'])]
        extra_input = row.get('input')
        if extra_input is not None and str(extra_input).strip():
            user_lines.append(str(extra_input))
        user_turn = "\n".join(user_lines)
        return (
            f"<start_of_turn>user\n{user_turn}<end_of_turn>\n"
            f"<start_of_turn>model\n{row['output']}<end_of_turn>"
        )

    def create_prompts(self):
        """Append one prompt per record in ``self.data`` to ``self.prompts``."""
        self.prompts.extend(self.create_prompt(row) for row in self.data)

    def get_dataset(self):
        """Return the prompts as a DataFrame with a single 'prompt' column."""
        return pd.DataFrame({'prompt': self.prompts})

def create_dataset_hf(dataset):
    """Wrap a pandas DataFrame in a DatasetDict with a single "train" split.

    The index is reset on a new DataFrame rather than in place, so the
    DataFrame passed in by the caller is left unmodified.

    Args:
        dataset: the pandas DataFrame to convert.

    Returns:
        A DatasetDict whose "train" entry is built from ``dataset``.
    """
    train_frame = dataset.reset_index(drop=True)
    return DatasetDict({"train": Dataset.from_pandas(train_frame)})

# Load and process dataset
# Read as UTF-8 explicitly so non-ASCII text does not depend on the runtime's
# default encoding.
with open('/content/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fail early with a clear message: a JSON object or an empty list would
# otherwise surface later as a confusing error or an empty dataset on the Hub.
if not isinstance(data, list) or not data:
    raise ValueError(
        "/content/dataset.json must contain a non-empty list of records "
        "with 'instruction', 'input' and 'output' keys"
    )

# Build one chat-formatted prompt per record
dataset = GemmaInstructDataset(data)
df = dataset.get_dataset()

processed_data_path = 'processed_data'
os.makedirs(processed_data_path, exist_ok=True)

# Keep a local copy on disk, then publish the same dataset to the Hub under
# "<huggingface_user>/<dataset_name>"
gemma_dataset = create_dataset_hf(df)
gemma_dataset.save_to_disk(os.path.join(processed_data_path, "gemma_dataset"))
gemma_dataset.push_to_hub(f"{huggingface_user}/{dataset_name}")

# **Step 5.** LoRA Fine-tuning Configurations
- "finetuned_model" sets your model's name on HF
- "num_train_epochs" sets the number of epochs for training

    (epoch = 1 pass through your entire dataset)

In [None]:
# Defining the configuration for the base model, LoRA and training
config = {
    "hugging_face_username": huggingface_user,
    "model_config": {
        "base_model": "unsloth/gemma-2b-it-bnb-4bit",  # Lighter 2B parameter model for faster training
        "finetuned_model": f"{huggingface_user}/gemma-2b-accessassist-finetuned",  # The finetuned model
        "max_seq_length": 1024,  # Reduced sequence length for faster training
        # None lets Unsloth choose the dtype for the current GPU. Forcing
        # float16 here conflicts with bf16=True in the training config on
        # GPUs that support bfloat16.
        "dtype": None,
        "load_in_4bit": True,  # Load the model in 4-bit
    },
    "lora_config": {
        "r": 8,  # Reduced LoRA rank for faster training
        "target_modules": [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],  # The target modules
        "lora_alpha": 8,  # Adjusted alpha
        "lora_dropout": 0,  # The dropout value for LoRA
        "bias": "none",  # The bias for LoRA
        "use_gradient_checkpointing": True,  # Use gradient checkpointing
        "use_rslora": False,  # Use RSLora
        "use_dora": False,  # Use DoRa
        "loftq_config": None,  # The LoFTQ configuration
    },
    "training_dataset": {
        "name": f"{huggingface_user}/{dataset_name}",  # The dataset name (huggingface/datasets)
        "split": "train",  # The dataset split
        "input_field": "prompt",  # The input field
    },
    "training_config": {
        "per_device_train_batch_size": 4,  # Increased batch size for smaller model
        "gradient_accumulation_steps": 2,  # Reduced accumulation steps
        "warmup_steps": 5,  # The warmup steps
        # -1 is the documented "not set" value: the run length then comes
        # from num_train_epochs. A positive value would override the epochs.
        "max_steps": -1,
        "num_train_epochs": 3,  # Reduced epochs for faster training
        "learning_rate": 2e-4,  # The learning rate
        # Exactly one of fp16 / bf16 is enabled, depending on GPU support
        "fp16": not torch.cuda.is_bf16_supported(),
        "bf16": torch.cuda.is_bf16_supported(),
        "logging_steps": 1,  # The logging steps
        "optim": "adamw_8bit",  # The optimizer
        "weight_decay": 0.01,  # The weight decay
        "lr_scheduler_type": "linear",  # The learning rate scheduler
        "seed": 42,  # The seed
        "output_dir": "outputs",  # The output directory
    },
}

# **Step 6.** Load Gemma 2B, QLoRA & Trainer Model

In [None]:
# Pull each config section out once so the calls below stay readable
model_cfg = config.get("model_config")
lora_cfg = config.get("lora_config")
data_cfg = config.get("training_dataset")
train_cfg = config.get("training_config")

# One seed, taken from the training config, drives both the LoRA
# initialisation and the trainer, so changing it in `config` takes effect
# everywhere. Falls back to 42 if the key is absent.
seed = train_cfg.get("seed", 42)

# Loading the model and the tokenizer for the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_cfg.get("base_model"),
    max_seq_length = model_cfg.get("max_seq_length"),
    dtype = model_cfg.get("dtype"),
    load_in_4bit = model_cfg.get("load_in_4bit"),
)

# Setup for QLoRA/LoRA peft of the base model
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_cfg.get("r"),
    target_modules = lora_cfg.get("target_modules"),
    lora_alpha = lora_cfg.get("lora_alpha"),
    lora_dropout = lora_cfg.get("lora_dropout"),
    bias = lora_cfg.get("bias"),
    use_gradient_checkpointing = lora_cfg.get("use_gradient_checkpointing"),
    random_state = seed,
    use_rslora = lora_cfg.get("use_rslora"),
    use_dora = lora_cfg.get("use_dora"),
    loftq_config = lora_cfg.get("loftq_config"),
)

# Loading the training dataset
dataset_train = load_dataset(data_cfg.get("name"), split = data_cfg.get("split"))

# Setting up the trainer for the model
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = data_cfg.get("input_field"),
    max_seq_length = model_cfg.get("max_seq_length"),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = train_cfg.get("per_device_train_batch_size"),
        gradient_accumulation_steps = train_cfg.get("gradient_accumulation_steps"),
        warmup_steps = train_cfg.get("warmup_steps"),
        max_steps = train_cfg.get("max_steps"),
        num_train_epochs = train_cfg.get("num_train_epochs"),
        learning_rate = train_cfg.get("learning_rate"),
        fp16 = train_cfg.get("fp16"),
        bf16 = train_cfg.get("bf16"),
        logging_steps = train_cfg.get("logging_steps"),
        optim = train_cfg.get("optim"),
        weight_decay = train_cfg.get("weight_decay"),
        lr_scheduler_type = train_cfg.get("lr_scheduler_type"),
        seed = seed,
        output_dir = train_cfg.get("output_dir"),
    ),
)

# **Step 7.** Train Your Finetuned Model

In [None]:
# Run the fine-tuning loop; the value returned by trainer.train() is kept so
# it can be written to disk in the next step.
trainer_stats = trainer.train()

# **Step 8.** Save Trainer Stats

In [None]:
# trainer.train() returns a named tuple; dumping it directly writes an
# unlabeled JSON array. Converting it to a dict first keeps the field names
# in the saved file. Anything without _asdict() is written unchanged.
stats_payload = trainer_stats._asdict() if hasattr(trainer_stats, "_asdict") else trainer_stats
with open("trainer_stats.json", "w", encoding="utf-8") as f:
    json.dump(stats_payload, f, indent=4)

# **Step 9.** Save Finetuned Model & Push to HF Hub

In [None]:
# Save the fine-tuned model
# The same identifier is used as the local output directory and as the Hub
# repository; both calls export with the "merged_16bit" save method.
finetuned_target = config.get("model_config").get("finetuned_model")
model.save_pretrained_merged(finetuned_target, tokenizer, save_method="merged_16bit")
model.push_to_hub_merged(finetuned_target, tokenizer, save_method="merged_16bit")

# Optional: Save as GGUF for local inference
# model.save_pretrained_gguf(config.get("model_config").get("finetuned_model"), tokenizer, quantization_method="q4_k_m")
# model.push_to_hub_gguf(config.get("model_config").get("finetuned_model"), tokenizer, quantization_method="q4_k_m")

# **Step 10.** Test your fine-tuned model in Colab

In [None]:
# Loading the fine-tuned model and the tokenizer for inference
inference_cfg = config.get("model_config")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = inference_cfg.get("finetuned_model"),
    max_seq_length = inference_cfg.get("max_seq_length"),
    dtype = inference_cfg.get("dtype"),
    load_in_4bit = inference_cfg.get("load_in_4bit"),
)

# Using FastLanguageModel for fast inference
FastLanguageModel.for_inference(model)

system_prompt = "You are an AI task automator. You will take a users prompt and use first principle reasoning to break the prompt into tasks that you must accomplish within another chat. RESPOND TO THIS MESSAGE ONLY WITH A PYTHON FORMATTED LIST OF TASKS THAT YOU MUST COMPLETE TO TRUTHFULLY AND INTELLIGENTLY ACCOMPLISH THE USERS REQUEST. ASSUME YOU CAN SEARCH THE WEB, WRITE CODE, RUN CODE, DEBUG CODE, AND AUTOMATE ANYTHING ON THE USERS COMPUTER TO ACCOMPLISH THE PROMPT. CORRECT RESPONSE FORMAT: ['task 1', 'task 2', 'task 3']"

# Read the request and wrap it in the same turn markers used for the
# training prompts, ending on an open model turn for the model to complete
prompt = input('TYPE PROMPT TO GEMMA: ')
chat_text = f"<start_of_turn>user\n{system_prompt}\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

# Tokenizing the input and generating the output
inputs = tokenizer([chat_text], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
# Left as the cell's final expression so the notebook displays the decoded text
tokenizer.batch_decode(outputs, skip_special_tokens = True)