In [None]:
# Qwen2-VL Fine-tuning with LoRA on Google Colab

# This notebook fine-tunes the Qwen2-VL-2B-Instruct model using LoRA (Low-Rank Adaptation) on your first principles dataset.

# **Prerequisites:**
# - Enable GPU runtime: Runtime → Change runtime type → Hardware accelerator → GPU
# - Upload your dataset file to Colab or mount Google Drive


In [None]:
## 1. Setup and Installation


In [1]:
# Install required packages
!uv pip install -q transformers datasets accelerate peft trl bitsandbytes wandb torch torchvision
!uv pip install -q qwen-vl-utils

# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


CUDA available: True
GPU: Tesla T4
GPU Memory: 15.8 GB


In [None]:
## 2. Authentication


In [2]:
# Login to Hugging Face and Weights & Biases
from huggingface_hub import login
import wandb

# HuggingFace login
login()

# WandB login
wandb.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

[34m[1mwandb[0m: Currently logged in as: [33mkhushal-mandavia72[0m ([33mkhushal-mandavia72-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
## 3. Upload Dataset

# Upload your `first_principles_dataset.json` file using the file upload widget below, or mount Google Drive if your dataset is stored there.


In [3]:
from google.colab import files
import json
import os

# Option 1: Upload dataset file
print("Upload your first_principles_dataset.json file:")
uploaded = files.upload()

# Get the uploaded file name
dataset_file = list(uploaded.keys())[0]
print(f"Dataset uploaded: {dataset_file}")

# Verify dataset format
with open(dataset_file, 'r') as f:
    data = json.load(f)
    print(f"Dataset contains {len(data)} examples")
    print("Sample entry:", data[0])


Upload your first_principles_dataset.json file:


Saving structured_dataset.json to structured_dataset (1).json
Dataset uploaded: structured_dataset (1).json
Dataset contains 600 examples
Sample entry: {'messages': [{'role': 'system', 'content': 'You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon. Use a storytelling tone and follow a step by step explanation style:'}, {'role': 'user', 'content': 'Why do objects fall to the ground when dropped?'}, {'role': 'assistant', 'content': "Okay, let’s imagine you have a stretched rubber sheet and you place a heavy ball in the middle. The sheet bends downwards, right? Now, if you roll a smaller ball nearby, it will start rolling toward the heavier ball because of the dip. This is a simple way to picture how gravity works. Gravity is like the Earth making a 'dip' in space that pulls things toward it.\n\nWhen you let go of an object, it falls because the Earth is pulling it toward its ce

In [None]:
# Option 2: Mount Google Drive (alternative to file upload)
# Uncomment the lines below if you prefer to use Google Drive

# from google.colab import drive
# drive.mount('/content/drive')
# dataset_file = '/content/drive/MyDrive/path/to/your/first_principles_dataset.json'


In [None]:
## 4. Model and Training Setup


In [4]:
from datasets import Dataset
from trl import SFTConfig, SFTTrainer
import torch
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    Qwen2VLForConditionalGeneration,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import json

!pip show transformers


Name: transformers
Version: 4.52.4
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers, trl


In [5]:
# Configuration
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
output_dir = "./Qwen2.5-1.5B-Instruct-Sft-results"
hub_model_id = "KhushalM/Qwen2.5-1.5BSFT"  # Change this to your desired model name

# Initialize WandB
wandb.init(
    project="qwen2.5-1.5b-sft-colab",
    config={
        "model": model_name,
        "dataset": dataset_file,
        "lora_r": 32,
        "batch_size": 1,
        "learning_rate": 2e-4,
        "platform": "Google Colab"
    }
)


In [6]:
# 4-bit quantization configuration for GPU memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

print("Quantization config created")


Quantization config created


In [7]:
# Load model with quantization
print("Loading model...")
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
model.to("cuda")
print("Model loaded successfully!")
print(f"Model device: {next(model.parameters()).device}")


Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model loaded successfully!
Model device: cuda:0


In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
# Try Gemma's official chat template
# tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_of_turn>user\n' + message['content'] + '<end_of_turn>\n' }}{% elif message['role'] == 'assistant' %}{{ '<start_of_turn>model\n' + message['content'] + '<end_of_turn>\n' }}{% elif message['role'] == 'system' %}{{ '<start_of_turn>system\n' + message['content'] + '<end_of_turn>\n' }}{% endif %}{% endfor %}"
print("Tokenizer loaded successfully!")


Tokenizer loaded successfully!


In [9]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
print("Model prepared for k-bit training")


Model prepared for k-bit training


In [10]:
# LoRA configuration
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to model
model = get_peft_model(model, peft_config)
print("LoRA configuration applied")
print(f"Trainable parameters: {model.num_parameters(only_trainable=True):,}")
print(f"Total parameters: {model.num_parameters():,}")


LoRA configuration applied
Trainable parameters: 8,716,288
Total parameters: 1,552,430,592


In [None]:
## 5. Dataset Preparation


In [11]:
# Load and prepare dataset
with open(dataset_file, 'r') as f:
    dataset_json = json.load(f)

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(dataset_json)
print(f"Dataset loaded with {len(dataset)} examples")
print("Sample entry:", dataset[0])

dataset = dataset.train_test_split(test_size=0.10, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"Train examples: {len(train_dataset)}")
print(f"Eval examples: {len(eval_dataset)}")


Dataset loaded with 600 examples
Sample entry: {'messages': [{'content': 'You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon. Use a storytelling tone and follow a step by step explanation style:', 'role': 'system'}, {'content': 'Why do objects fall to the ground when dropped?', 'role': 'user'}, {'content': "Okay, let’s imagine you have a stretched rubber sheet and you place a heavy ball in the middle. The sheet bends downwards, right? Now, if you roll a smaller ball nearby, it will start rolling toward the heavier ball because of the dip. This is a simple way to picture how gravity works. Gravity is like the Earth making a 'dip' in space that pulls things toward it.\n\nWhen you let go of an object, it falls because the Earth is pulling it toward its center, similar to how the heavy ball makes the rubber sheet dip. This pull is what we call gravitational force. It's a force th

In [12]:
# Dataset formatting function
def format_dataset(sample):
    """Format the dataset for chat template"""
    # Set a default chat template if not already set
    #if tokenizer.chat_template is None:
        #tokenizer.chat_template = "{% for message in messages %}{{ message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}"

    return tokenizer.apply_chat_template(
        sample["messages"],
        tokenize=False,
        add_generation_prompt=False
    )

# Test the formatting function
sample_formatted = format_dataset(train_dataset[509])
print("Formatted sample (first 300 chars):")
print(sample_formatted[:1200] + "...")
print(f"Size of formatted sample: {len(sample_formatted)} characters") # Add this line

Formatted sample (first 300 chars):
<|im_start|>system
You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon. Use a storytelling tone and follow a step by step explanation style:<|im_end|>
<|im_start|>user
Why do some sports players perform better under pressure?<|im_end|>
<|im_start|>assistant
Imagine you’re playing a game where the crowd is cheering loudly. Some players feel energized, like the noise gives them power, while others might feel nervous and freeze. Performing well under pressure depends on how a player’s brain handles stress. Some athletes train their minds to focus only on the task, blocking out distractions and fear. It’s like a mental muscle that helps them stay calm and use their skills even when stakes are high. Others might have more experience or confidence, which helps them trust themselves. So, it’s a mix of mental preparation, practice, and how the brain

In [None]:
## 6. Training Configuration and Training


In [None]:
# from transformers import TrainerCallback

# class CustomWandbCallback(TrainerCallback):
#     # Removed manual_eval_steps as we will rely on the trainer's eval_steps
#     def __init__(self):
#         super().__init__()
#         print("Callback initialized.")

#     # Removed on_init_end as _trainer is no longer needed
#     # def on_init_end(self, args, state, control, **kwargs):
#     #      pass

#     # Removed on_step_end as manual evaluation is removed
#     # def on_step_end(self, args, state, control, **kwargs):
#     #     pass

#     def on_log(self, args, state, control, logs=None, **kwargs):
#         # Log training metrics as before
#         if state.is_local_process_zero and logs:
#             wandb_logs = {
#                 "train/loss": logs.get("loss", None),
#                 "learning_rate": logs.get("learning_rate", None),
#                 "epoch": logs.get("epoch", None)
#             }
#             # Only log if there are actual training logs (not evaluation logs)
#             if logs.get("loss", None) is not None:
#                  wandb.log(wandb_logs, step=state.global_step)

#     # Add on_evaluate to explicitly log evaluation metrics if needed,
#     # but SFTTrainer should handle this when report_to="wandb" and eval_steps is set.
#     # This is kept as a fallback or for custom logging.
#     def on_evaluate(self, args, state, control, metrics=None, **kwargs):
#         if state.is_local_process_zero and metrics:
#              wandb_logs = {
#                 "eval/loss": metrics.get("eval_loss", None),
#                 "eval/runtime": metrics.get("eval_runtime", None),
#                 "eval/samples_per_second": metrics.get("eval_samples_per_second", None),
#                 "eval/steps_per_second": metrics.get("eval_steps_per_second", None),
#             }
#              wandb.log(wandb_logs, step=state.global_step)

In [13]:
# Training arguments
training_args = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    report_to="wandb",
    save_strategy="epoch",
    save_total_limit=2,
    #eval_strategy = "epoch",
    #eval_steps=20,                # Evaluate every 50 steps
    logging_steps=10,             # Log every 10 steps
    save_only_model=True,
    log_on_each_node=True,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    max_length=512,
    packing=True,
    bf16=True if torch.backends.mps.is_available() else torch.cuda.is_bf16_supported(),
    fp16=False,  # Disable on Apple Silicon
    optim="adamw_torch",
    dataloader_num_workers=2,
    label_names=[]
)


# Define eval_steps separately for the callback (manual evaluation trigger)
# Removed manual_eval_steps as we are removing manual evaluation trigger from callback
# manual_eval_steps = 50

# Create the callback instance *before* creating the trainer
# Removed manual_eval_steps argument
# wandb_callback = CustomWandbCallback()


print("SFTConfig configured")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


SFTConfig configured
Effective batch size: 4


In [14]:
# Add to trainer
# Try Gemma's official chat template
#tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_of_turn>user\n' + message['content'] + '<end_of_turn>\n' }}{% elif message['role'] == 'assistant' %}{{ '<start_of_turn>model\n' + message['content'] + '<end_of_turn>\n' }}{% elif message['role'] == 'system' %}{{ '<start_of_turn>system\n' + message['content'] + '<end_of_turn>\n' }}{% endif %}{% endfor %}"
trainer = SFTTrainer(
    model=model,
    # Pass sft_config directly to args if it's intended to work that way
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Ensure eval_dataset is passed here
    peft_config=peft_config,
    # formatting_func and other SFTTrainer specific args are now in sft_config
    # callbacks=[wandb_callback], # Removed the custom callback
    formatting_func=format_dataset,
    processing_class=tokenizer,


)

# Removed setting _trainer explicitly as it's not needed in the modified callback
# wandb_callback._trainer = trainer


print("Trainer created successfully!")
print(f"Number of training examples: {len(trainer.train_dataset)}")
print(f"Number of evaluation examples: {len(trainer.eval_dataset)}")



Applying formatting function to train dataset:   0%|          | 0/540 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/540 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/540 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Trainer created successfully!
Number of training examples: 242
Number of evaluation examples: 27


In [15]:
import torch
print(f"Allocated: {torch.cuda.memory_allocated()/1e9:.2f} GB")
print(f"Reserved : {torch.cuda.memory_reserved()/1e9:.2f} GB")


Allocated: 1.66 GB
Reserved : 2.78 GB


In [16]:
# Start training
print("Starting training...")
print("This may take 1-3 hours depending on your dataset size and GPU.")

trainer.train()

print("Training completed!")


Starting training...
This may take 1-3 hours depending on your dataset size and GPU.




Step,Training Loss
10,1.811
20,1.4601
30,1.1511
40,1.0806
50,1.0256
60,0.9949
70,0.9358
80,0.925
90,0.8993
100,0.9237




Training completed!


In [None]:
## 7. Save and Test the Model


In [17]:
# Save the final model
final_model_path = f"{output_dir}/final_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Model saved to {final_model_path}")
print("Model files:")
!ls -la {final_model_path}


Model saved to ./Qwen2.5-1.5B-Instruct-Sft-results/final_model
Model files:
total 49632
drwxr-xr-x 2 root root     4096 Jun 27 15:23 .
drwxr-xr-x 5 root root     4096 Jun 27 15:22 ..
-rw-r--r-- 1 root root      810 Jun 27 15:23 adapter_config.json
-rw-r--r-- 1 root root 34895152 Jun 27 15:23 adapter_model.safetensors
-rw-r--r-- 1 root root      605 Jun 27 15:23 added_tokens.json
-rw-r--r-- 1 root root     2507 Jun 27 15:23 chat_template.jinja
-rw-r--r-- 1 root root  1671853 Jun 27 15:23 merges.txt
-rw-r--r-- 1 root root     5100 Jun 27 15:22 README.md
-rw-r--r-- 1 root root      496 Jun 27 15:23 special_tokens_map.json
-rw-r--r-- 1 root root     4683 Jun 27 15:23 tokenizer_config.json
-rw-r--r-- 1 root root 11421896 Jun 27 15:23 tokenizer.json
-rw-r--r-- 1 root root     5752 Jun 27 15:23 training_args.bin
-rw-r--r-- 1 root root  2776833 Jun 27 15:23 vocab.json


In [25]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Assuming quantization_config is already defined in a previous cell
# If not, you might need to add it here or ensure the previous cell is run

# Test the fine-tuned model
finetuned_model_path = f"{output_dir}/final_model" # Use the local saved model path
orginal_model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Load tokenizers separately for each model
print("Loading tokenizer for fine-tuned model...")
finetuned_tokenizer = AutoTokenizer.from_pretrained(
    finetuned_model_path,
    trust_remote_code=True,
)
finetuned_tokenizer.pad_token = finetuned_tokenizer.eos_token
# Ensure the chat template is set for the tokenizer used by the pipeline


print("Loading tokenizer for original model...")
original_tokenizer = AutoTokenizer.from_pretrained(
    orginal_model_name,
    trust_remote_code=True,
)
original_tokenizer.pad_token = original_tokenizer.eos_token
# Ensure the chat template is set for the tokenizer used by the pipeline


def test_model_with_pipeline(prompt, model_path, tokenizer, model_name, max_new_tokens=128):
    print(f"\nTesting model: {model_name}")

    # Determine if loading from local files
    #is_local = (model_path == finetuned_model_path)

    # Load the model using pipeline
    # Pass local_files_only as a direct argument when loading from a local path
    generator = pipeline(
        "text-generation",
        model=model_path,
        tokenizer=tokenizer, # Pass the tokenizer with the chat template
        # Removed device="cuda"
        model_kwargs={"quantization_config": quantization_config, "torch_dtype": torch.bfloat16}, # Removed trust_remote_code
        #local_files_only=is_local # Pass as a direct argument
    )

    messages = [
        {"role": "system", "content": "You are an expert educator who explains concepts from first principles like Richard Feynman. Start with fundamental truths, use simple analogies, and avoid jargon."},
        {"role": "user", "content": prompt}
    ]

    # The pipeline will automatically apply the chat template if set on the tokenizer
    output = generator(
        messages,
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id, # Set pad_token_id for generation
        do_sample=True, # Ensure sampling is enabled if temperature is set
        temperature=0.7,
    )

    if output and output[0] and "generated_text" in output[0]:
        # The pipeline with return_full_text=False returns only the new tokens
        # However, sometimes it might still include parts of the prompt depending on the model/tokenizer
        # Let's try to clean up the response to only get the assistant part
        generated_text = output[0]["generated_text"].strip()
        # Simple check to remove prompt if it's still included
        if generated_text.startswith("<start_of_turn>user"):
             # Find the start of the assistant's turn if the full conversation is returned
             assistant_start_index = generated_text.find("<start_of_turn>model")
             if assistant_start_index != -1:
                  generated_text = generated_text[assistant_start_index:].strip()


        # Remove the start/end turn tokens if they are present
        if generated_text.startswith("<start_of_turn>model"):
             generated_text = generated_text[len("<start_of_turn>model\n"):].strip()
        if generated_text.endswith("<end_of_turn>"):
             generated_text = generated_text[:-len("<end_of_turn>")].strip()


        return generated_text
    else:
        return "Could not generate response."

# Test with the specific prompt
test_question = "Explain how GRPO (Group Relative Proximal Optimization) works in Reinforcement LEarning a LLM"

print("\nTesting the models with the specific question:")

# Test original model
response_original = test_model_with_pipeline(test_question, orginal_model_name, original_tokenizer, orginal_model_name)
print(f"Original Model Response:\n{response_original}")

print("-" * 80)

# Test fine-tuned model
response_finetuned = test_model_with_pipeline(test_question, finetuned_model_path, finetuned_tokenizer, finetuned_model_path)
print(f"Finetuned Model Response:\n{response_finetuned}")

print("-" * 80)

Loading tokenizer for fine-tuned model...
Loading tokenizer for original model...

Testing the models with the specific question:

Testing model: Qwen/Qwen2.5-1.5B-Instruct


Device set to use cuda:0


Original Model Response:
Great question! Let's break down the concept of Group Relative Proximal Optimization (GRPO) and its application in Reinforcement Learning for Large Language Models.

### Understanding GRPO

**Group Relative Proximal Optimization (GRPO)** is a method used to solve optimization problems that arise in machine learning, particularly in training large language models. It is designed to handle non-convex objectives efficiently by breaking them into smaller, more manageable sub-problems. The term "proximal" refers to a technique where we minimize a proximal operator, which can be thought of as a smoothing or regularization step.

### Context: Reinforcement Learning
--------------------------------------------------------------------------------

Testing model: ./Qwen2.5-1.5B-Instruct-Sft-results/final_model


Device set to use cuda:0


Finetuned Model Response:
Okay, imagine you’re playing a game where you try to learn the best way to win by getting rewards or punishments for different actions.

GRPO is like having many smaller learning systems that work together. Each one focuses on just part of the big task but helps improve the whole.

In AI reinforcement learning, these smaller systems can quickly adapt and find good strategies faster than traditional methods.

So, GRPO speeds up learning by combining multiple small learners instead of one big one. Does this help you see how it simplifies complex problems?
--------------------------------------------------------------------------------


In [20]:
# Finish WandB run
wandb.finish()
print("Training complete! Check your WandB dashboard for training metrics.")


0,1
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
train/grad_norm,▆▅▄▃▁▆▂▄▄▅▄▄▃▄▆█▇▆
train/learning_rate,▄██▇▇▆▆▅▅▅▄▄▃▃▂▂▁▁
train/loss,█▆▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁
train/mean_token_accuracy,▁▃▅▆▆▆▇▇▇▇▇▇▇█▇▇▇▇█
train/num_tokens,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██

0,1
total_flos,2606770362037248.0
train/epoch,3.0
train/global_step,183.0
train/grad_norm,0.66887
train/learning_rate,0.0
train/loss,0.8311
train/mean_token_accuracy,0.77149
train/num_tokens,329373.0
train_loss,0.99594
train_runtime,1666.7865


Training complete! Check your WandB dashboard for training metrics.


In [None]:
## 8. Download Your Model (Optional)

If you want to download the trained model to your local machine:


In [22]:
# Create a zip file of the trained model
import shutil

# Zip the final model
shutil.make_archive('qwen2.5-1.5B-sft_finetuned', 'zip', final_model_path)

# Download the model
from google.colab import files
files.download('qwen2.5-1.5B-sft_finetuned.zip')

print("Model downloaded! You can now use this model locally.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded! You can now use this model locally.


In [None]:
## 🎉 Training Complete!

### What happened:
- ✅ Loaded Qwen2-VL-2B-Instruct model with 4-bit quantization
- ✅ Applied LoRA for efficient fine-tuning
- ✅ Trained on your first principles dataset
- ✅ Saved the model with adapters
- ✅ Tested the fine-tuned model

### Next steps:
1. **Test more extensively**: Try various prompts to evaluate performance
2. **Push to Hub**: Your model is automatically pushed to HuggingFace Hub
3. **Use the model**: Load it in your applications or continue training
4. **Iterate**: Adjust hyperparameters and retrain if needed

### Model usage:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
# Load your fine-tuned adapters
model = PeftModel.from_pretrained(base_model, "KhushalM/Qwen2-VL-2B-Instruct-SFT")
tokenizer = AutoTokenizer.from_pretrained("KhushalM/Qwen2-VL-2B-Instruct-SFT")
```
