<a href="https://colab.research.google.com/github/NathanScarrott/phi2-qlora-taskmodel/blob/main/notebooks/fine_tuning_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Clone Repository (Dataset)

In [2]:
!git clone https://github.com/NathanScarrott/phi2-qlora-taskmodel.git

Cloning into 'phi2-qlora-taskmodel'...
remote: Enumerating objects: 82, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 82 (delta 35), reused 65 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (82/82), 73.02 KiB | 545.00 KiB/s, done.
Resolving deltas: 100% (35/35), done.


## Load, Validate, and Split Dataset

In [20]:
import json
from datasets import Dataset

# Load your JSONL dataset
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        list: The parsed objects, in file order. Blank or whitespace-only
        lines (e.g. a trailing newline at the end of the file) are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    records = []
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so non-ASCII text loads the same way on every machine.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # json.loads('') would raise; empty lines carry no record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Keep the exception type, but say where the bad line is.
                raise json.JSONDecodeError(
                    f"{file_path}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
    return records

from collections import Counter

# Read the raw records from the JSONL file inside the cloned repository
dataset_path = 'phi2-qlora-taskmodel/data/task_dataset.jsonl'
data = load_jsonl(dataset_path)

# Wrap the list of records in a Hugging Face Dataset
dataset = Dataset.from_list(data)

# Sanity check: how many records there are and what one looks like
print(f"Total examples: {len(dataset)}")
print(f"Example structure: {dataset[0]}")

# Count how many examples carry each intent label
intents = [record['output']['intent'] for record in data]
intent_counts = Counter(intents)
print(f"\nIntent distribution: {dict(intent_counts)}")

# Hold out 20% for validation; the fixed seed keeps the split repeatable.
# From here on `dataset` has a 'train' and a 'test' split.
dataset = dataset.train_test_split(
    test_size=0.2,
    seed=42,
)

print(f"\nTraining examples: {len(dataset['train'])}")
print(f"Validation examples: {len(dataset['test'])}")

Total examples: 1200
Example structure: {'input': "Add 'pick up dry cleaning' to my to-do list for tomorrow morning", 'output': {'body': None, 'datetime': None, 'intent': 'add_task', 'location': None, 'recipient': None, 'schedule': 'tomorrow morning', 'subject': None, 'task': 'pick up dry cleaning'}}

Intent distribution: {'add_task': 424, 'get_weather': 297, 'send_email': 479}

Training examples: 960
Validation examples: 240


## Load 4-Bit Model & Tokenizer

In [4]:
# Force reinstall bitsandbytes with specific CUDA support
!pip install -q --upgrade pip
!pip install -q --no-cache-dir bitsandbytes
!pip install -q --upgrade transformers accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m303.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m232.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m235.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m489.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m283.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m309.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import torch

# Report whether torch can see a GPU and which CUDA version it was built with
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

# The 4-bit model load further down needs bitsandbytes, so surface an
# import problem here rather than in the middle of model loading.
try:
    import bitsandbytes as bnb
except ImportError as err:
    print("❌ bitsandbytes import failed:", err)
else:
    print("✅ bitsandbytes imported successfully")

CUDA available: True
CUDA version: 12.4
✅ bitsandbytes imported successfully


In [6]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hub id of the checkpoint to fine-tune
modelpath = "microsoft/phi-2"

# Quantization settings: 4-bit NF4 weights, bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model; device placement is left to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # FlashAttention-2 stays disabled: it does not work yet
    # attn_implementation="flash_attention_2",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint as the model. `modelpath` is
# set in the model-loading cell, so the two cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably the checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

## Prepare LoRA Adapters

In [8]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Get the quantized model ready for k-bit training, with gradient checkpointing on
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
)

# LoRA hyperparameters and the modules they apply to
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # Projection layers that receive LoRA adapters
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    # Modules listed here are handled via PEFT's modules_to_save option
    modules_to_save=["lm_head", "embed_tokens"],
)

# Wrap the prepared model with the adapter configuration
model = get_peft_model(model, lora_config)

# Disable the generation cache on the model config.
# NOTE(review): commonly done alongside gradient checkpointing — confirm intent.
model.config.use_cache = False

## Format Dataset

In [16]:
def format_prompt(sample):
    """Render one dataset record as a single instruction-style training string.

    Args:
        sample: Mapping with an 'input' entry (the user request text) and an
            'output' entry (a JSON-serializable object with the target fields).

    Returns:
        dict: ``{"text": ...}`` holding the instruction line, a ``User:`` turn
        with the request, and an ``Assistant:`` turn with the JSON-encoded
        output, separated by blank lines and ending with a newline.
    """
    request_text = sample['input']
    target_json = json.dumps(sample['output'])

    # The three sections are joined by a blank line; the last one carries
    # the trailing newline.
    sections = (
        "Convert the following user request into structured JSON format:",
        f"User: {request_text}",
        f"Assistant: {target_json}\n",
    )
    return {"text": "\n\n".join(sections)}

In [21]:
# Run format_prompt over every split, keeping only the generated 'text' column
dataset_new = dataset.map(
    format_prompt,
    remove_columns=['input', 'output'],
)

# Preview the first training example (at most its first 2000 characters)
print(
    "Formatted example:",
    dataset_new['train'][0]['text'][:2000] + "...",
    sep="\n",
)

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Formatted example:
Convert the following user request into structured JSON format:

User: Send an email to my sister, title 'Party details', message 'Let me know if you can make it.'

Assistant: {"body": "Let me know if you can make it.", "datetime": null, "intent": "send_email", "location": null, "recipient": "sister", "schedule": null, "subject": "Party details", "task": null}
...
