In [None]:
# Fine-tuning Gemma Model with Transformers and custom local dataset
# This script is designed to run in a Google Colab environment.

# !pip uninstall bitsandbytes peft trl accelerate transformers datasets
# The above line is commented out, but it's often used to ensure a clean environment
# before installing specific versions of libraries, especially in environments like Colab.

# Install necessary libraries for fine-tuning
!pip3 install bitsandbytes # For 8-bit or 4-bit quantization, reducing memory usage
!pip3 install peft # Parameter-Efficient Fine-Tuning library (e.g., LoRA)
!pip3 install trl # Transformer Reinforcement Learning, useful for SFT (Supervised Fine-Tuning)
!pip3 install accelerate # Speeds up model training on different hardware setups
!pip3 install datasets # Hugging Face's library for easily loading and processing datasets
!pip3 install transformers # Core Hugging Face library for pre-trained models and training tools

In [None]:
import os # For interacting with the operating system, like setting environment variables
import transformers # Hugging Face Transformers library
import torch # PyTorch library, essential for deep learning operations
from datasets import load_dataset, Dataset # For handling datasets
from trl import SFTTrainer # Trainer specifically designed for Supervised Fine-Tuning
from peft import LoraConfig, PeftModel, PeftConfig, get_peft_model # PEFT configurations and utilities
from transformers import AutoTokenizer, AutoModelForCausalLM # For loading tokenizer and language model
from transformers import BitsAndBytesConfig, GemmaTokenizer # Specific configurations for quantization and Gemma tokenizer

In [None]:
# Authenticate against the Hugging Face Hub. Gated models can only be
# downloaded after logging in; paste your Hugging Face access token into the
# prompt that this call opens.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Hub identifier of the checkpoint to fine-tune (Gemma 3, 1B, instruction-tuned).
model_id = "google/gemma-3-1b-it"

# 4-bit quantization settings for bitsandbytes. Loading the weights in 4-bit
# shrinks the memory footprint, leaving room for larger models or batches.
#   bnb_4bit_compute_dtype - dtype used for the actual computation
#   bnb_4bit_quant_type    - "nf4" selects the NormalFloat 4-bit scheme
#   load_in_4bit           - store the model weights in 4-bit precision
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Causal language model, quantized on load with `bnb_config`.
# device_map="auto" lets the library place the model on the available
# devices (e.g., GPUs) automatically.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Disable Weights & Biases (WandB) logging, which is not needed for this run.
# The variable is a "disabled" flag, so "true" turns WandB OFF; the previous
# value "false" left it enabled despite the stated intent.
# To track experiments with WandB instead, set this to "false" or remove the line.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# LoRA (Low-Rank Adaptation) setup for parameter-efficient fine-tuning.

# Names of the modules (layers) that receive LoRA adapters: the attention
# projections and the feed-forward projections.
lora_target_modules = {
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
}

lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # the task is causal language modeling
    target_modules=lora_target_modules,
    r=8,  # rank of the update matrices; smaller r => fewer trainable parameters
)

In [None]:
from datasets import Dataset # Import Dataset class
import json # For handling JSON data

# Read the JSONL (JSON Lines) file manually.
# In this format every non-empty line is one standalone JSON object.
file_path = "<user_dataset>.jsonl" # Placeholder for your dataset file path
data = []

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (commonly a trailing newline at end of file) are not
            # records; json.loads("") would raise, so skip them.
            continue
        data.append(json.loads(line)) # Parse the line as a JSON object and add it to the list

# Fail early with a clear message instead of an opaque IndexError at datasett[0].
if not data:
    raise ValueError(f"No records found in '{file_path}'")

# Convert the list of dictionaries into a HuggingFace Dataset object
datasett = Dataset.from_list(data)

# Optional: Check a sample from the dataset to verify content
print(datasett[0])

In [None]:
# Formatting helper for Supervised Fine-Tuning (SFT).
# The formatted example is exposed under a single 'text' key, which is the
# column the trainer is given as model input.
def format_for_sft(example):
    """Build the training text for one dataset record.

    Args:
        example: mapping that provides the keys 'instruction' and 'context'.

    Returns:
        A dict with one key, 'text', holding the instruction/response pair
        rendered as "Instruction:<instruction>\\nResponse:<context>".
    """
    instruction = example["instruction"]
    response = example["context"]
    template = "Instruction:{}\nResponse:{}"
    return {"text": template.format(instruction, response)}

# Run every record through format_for_sft so each one carries the 'text' field.
formatted_dataset = datasett.map(function=format_for_sft)

In [None]:
# Hyper-parameters and bookkeeping options for the training run.
# NOTE(review): fp16 is used here while bnb_config computes in bfloat16 —
# confirm this combination is intended for the target GPU.
training_args = transformers.TrainingArguments(
    output_dir="<output_save_dir>",    # where checkpoints and logs are written
    num_train_epochs=1,                # full passes over the training data
    # max_steps=300,                   # alternative: train for a fixed number of steps
    per_device_train_batch_size=8,     # samples per batch on each device
    gradient_accumulation_steps=16,    # accumulate gradients to simulate a larger batch
    learning_rate=2e-4,                # initial learning rate
    warmup_steps=2,                    # learning-rate warmup steps
    optim="paged_adamw_8bit",          # paged 8-bit AdamW for memory efficiency
    fp16=True,                         # float16 mixed-precision training
    logging_steps=1,                   # log metrics every step
)

# Supervised fine-tuning trainer: wires together the model, its tokenizer,
# the formatted dataset, the training arguments and the LoRA configuration.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    processing_class=tokenizer,
    peft_config=lora_config,
)

In [None]:
# Run the fine-tuning loop. The result is kept in a name and then echoed as
# the cell's last expression, so it is still shown as the cell output.
train_output = trainer.train()
train_output

In [None]:
# Save the fine-tuned PEFT (LoRA) model weights and tokenizer
trainer.model.save_pretrained("<output_saved_dir>")     # Saves the LoRA adapters
# The trainer was built with `processing_class=tokenizer`, so read the
# tokenizer back through the same attribute; `trainer.tokenizer` is the
# deprecated accessor for it.
trainer.processing_class.save_pretrained("<output_saved_dir>") # Saves the tokenizer

In [None]:
# Re-imported here so the cell reads on its own.
from peft import PeftModel, PeftConfig, get_peft_model
from transformers import AutoModelForCausalLM

# Locations: where the LoRA adapters were saved, and where the merged model goes.
peft_model_id = "<output_saved_dir>"
dir_to_save = "<merged_model_dir>"

# The saved adapter configuration records the base checkpoint it was trained
# on in `base_model_name_or_path`.
config = PeftConfig.from_pretrained(peft_model_id)

# Reload the base model, this time without a quantization config because the
# adapters are about to be merged into its weights.
base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)

# Attach the LoRA adapters to the base model, then fold them into the base
# weights. The result is a single consolidated model that can be used for
# inference without the PEFT library.
model = PeftModel.from_pretrained(base_model, peft_model_id)
merged_model = model.merge_and_unload()

# Persist the merged model together with the tokenizer.
merged_model.save_pretrained(dir_to_save)
tokenizer.save_pretrained(dir_to_save)

print(f"✅ Merged full model saved at '{dir_to_save}'")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory holding the merged model (the same location as `dir_to_save`
# from the merge step).
model_id = "<merged_model_dir>"

# Reload the merged model and its tokenizer from that directory for inference.
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Pick the GPU when CUDA is available and fall back to the CPU otherwise,
# then move the model there.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model = model.to(device)

In [None]:
# Define a prompt for testing the fine-tuned model.
# It follows the training template produced by format_for_sft exactly
# ("Instruction:<text>\nResponse:", with no space after the colon), so the
# model sees the same layout at inference time as during fine-tuning.
prompt = "Instruction:What is the history of internet\nResponse:"
# Tokenize the prompt and move the tensors to whichever device the model is
# on. A hard-coded "cuda" would fail on a CPU-only runtime, where the model
# was placed on the CPU.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
print(inputs) # Print the tokenized inputs

In [None]:
# Let the fine-tuned model continue the prompt, producing at most 50 new tokens.
outputs = model.generate(max_new_tokens=50, **inputs)

# Turn the first generated sequence back into readable text (special tokens
# are dropped) and show it.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

##### <b><i>The part below is only for those who want to quantize the model.</i></b>

In [None]:
# Steps to convert the Hugging Face model to GGUF format using llama.cpp
# This is useful for running the model on a CPU or with llama.cpp's optimizations.

# ✅ Step 1: Go to the root directory in Colab's file system
%cd /content

# ✅ Step 2: Clone llama.cpp repository (fresh clone to ensure latest version)
!rm -rf llama.cpp # Remove any existing llama.cpp directory
!git clone https://github.com/ggerganov/llama.cpp.git # Clone the repository
%cd llama.cpp # Change directory into the cloned llama.cpp

# ✅ Step 3: Build llama.cpp with CMake
!mkdir -p build # Create a build directory
%cd build # Change into the build directory
!cmake .. # Configure the project with CMake
!make -j # Compile the project using all available CPU cores
%cd .. # Go back to the llama.cpp root directory

# ✅ Step 4: Convert merged Hugging Face model to GGUF format (specific for Gemma)
# This script converts the Hugging Face model (which is in PyTorch format) to the GGUF format.
# Ensure that '/content/merged_model/' contains the saved merged model files (safetensors, config.json, etc.).
# Change the "merged_model" with your merged model directory.
!python3 convert_hf_to_gguf.py /content/merged_model --outfile /content/gemma3.gguf 

# ✅ Step 5: Quantize the GGUF model to 4-bit (q4_0 quantization)
# This further reduces the model size and memory usage, suitable for CPU inference.
# './build/bin/llama-quantize' is the executable compiled in Step 3.
!./build/bin/llama-quantize /content/gemma3.gguf /content/gemma3-q4.gguf q4_0 # Change gemma3-q4.gguf to your desired output file name

# ✅ Step 6: Verify the output by listing the file size of the quantized GGUF model
!ls -lh /content/gemma3-q4.gguf


In [None]:
# Final step: download the quantized model from the Colab file browser.
# The instructions are printed as one readable message. A bare triple-quoted
# string would be echoed as an escaped repr, and part of the sentence used to
# sit in a separate comment that was never shown.
print(
    "After running this notebook, you can find 'gemma3-q4.gguf' (or the file "
    "with your given name) in the Colab file browser, usually on the left "
    "sidebar, under the '/content/' directory. You can then download it."
)
# Thank You! 🙏