# MediRep Voice AI - Fine-Tuning with Unsloth

**Before running:**
1. Have `train.jsonl` and `val.jsonl` from `voice/data/final/` ready; Cell 4 prompts you to upload them.
2. In Colab, open Runtime > Change runtime type and select a T4 GPU (free) or an A100 (faster).

**Training data:** 1872 conversations with tool-calling patterns

In [None]:
# Cell 1: Install Unsloth
# Install from PyPI (stable) - NOT from git (git main requires unreleased unsloth_zoo)
# NOTE: lines starting with "!" are notebook shell escapes, so this cell only
# runs inside an IPython/Colab kernel. "-q" keeps pip's output quiet.
!pip install unsloth -q
# Pin unsloth_zoo to one exact release, installed after unsloth itself.
!pip install "unsloth_zoo==2026.1.4" -q
# --no-deps installs these four packages without pulling in their dependencies
# (presumably so pip does not replace versions already installed - TODO confirm).
!pip install --no-deps trl peft accelerate bitsandbytes -q

# Verify install: the import fails loudly if the installation is broken,
# and the printed version shows which release was actually picked up.
import unsloth
print(f"Unsloth version: {unsloth.__version__}")

In [None]:
# Cell 2: Load Base Model
from unsloth import FastLanguageModel
import torch

# Base checkpoint: Llama-3.2-3B-Instruct - fits in a T4 GPU (16GB), good quality.
model_name = "unsloth/Llama-3.2-3B-Instruct"

# Loader options are collected in one mapping and unpacked into the call below.
load_options = dict(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,         # None = auto-detect
    load_in_4bit=True,  # 4-bit weights, reduces memory 4x
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

print(f"Loaded {model_name}")

In [None]:
# Cell 3: Add LoRA Adapters
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded base model; rank and alpha are both 32, no dropout, no bias terms.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print("LoRA adapters added (r=32)")

In [None]:
# Cell 4: Upload and Load Dataset
from google.colab import files
from datasets import load_dataset

# Ask the user for the two JSONL files through Colab's upload widget.
print("Upload train.jsonl and val.jsonl from voice/data/final/")
uploaded = files.upload()

# Map each split name to the file that backs it, then load both splits at once.
split_files = {"train": "train.jsonl", "validation": "val.jsonl"}
dataset = load_dataset("json", data_files=split_files)

# Report the row count of each split.
train_rows = len(dataset["train"])
val_rows = len(dataset["validation"])
print(f"Train: {train_rows} | Val: {val_rows}")

In [None]:
# Cell 5: Format Dataset - KEEP tool_result so model learns to read them
import json
from datasets import load_dataset

# Load both splits from the JSONL files in the working directory, so the
# formatting below always starts from the raw, unformatted records.
jsonl_by_split = {"train": "train.jsonl", "validation": "val.jsonl"}
dataset = load_dataset("json", data_files=jsonl_by_split)

# Noisy fields to strip from tool results
NOISE_FIELDS = [
    "id", "scheme_id", "created_at", "updated_at",
    "procedure_name_normalized", "data_source"
]


def _render_tool_result(content):
    """Return the text of the user turn that stands in for one tool result.

    A payload that parses as a JSON object is re-serialized without the
    NOISE_FIELDS keys (top-level keys only). Any other payload - invalid JSON,
    or valid JSON that is not an object (list, string, number, null) - is
    passed through unparsed, cut to its first 500 characters.
    """
    try:
        data = json.loads(content)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError. Only parse failures are
        # handled here; anything else (e.g. KeyboardInterrupt) propagates.
        data = None

    if not isinstance(data, dict):
        return f"[Tool Result]: {content[:500]}"

    for key in NOISE_FIELDS:
        data.pop(key, None)
    # NOTE(review): json.dumps escapes non-ASCII characters by default; keep this
    # in step with however tool results are serialized at inference time.
    return f"[Tool Result]: {json.dumps(data)}"


def format_chat(example):
    """Render one conversation into a single training string.

    Args:
        example: a dataset row whose "conversations" value is a list of
            messages, each with a "role" and a "content" entry.

    Returns:
        A dict {"text": ...} holding the conversation rendered by the
        tokenizer's chat template, without a trailing generation prompt.

    Messages with role "tool_result" are kept, so the model learns to read
    them, but are re-labelled as "user" turns prefixed with "[Tool Result]: "
    so the chat template accepts them. All other messages pass through as-is.
    """
    cleaned = []
    for m in example["conversations"]:
        if m["role"] == "tool_result":
            cleaned.append({"role": "user", "content": _render_tool_result(m["content"])})
        else:
            cleaned.append(m)

    text = tokenizer.apply_chat_template(cleaned, tokenize=False, add_generation_prompt=False)
    return {"text": text}

# Add the rendered "text" column to every row of both splits.
dataset = dataset.map(format_chat)
print("Dataset formatted WITH tool results")

# Row counts per split, then a preview of the first formatted training example.
train_rows = len(dataset["train"])
val_rows = len(dataset["validation"])
print(f"Train: {train_rows} | Val: {val_rows}")
first_text = dataset["train"][0]["text"]
print(f"Sample:\n{first_text[:800]}...")

In [None]:
# Cell 6: Training Configuration
# Switch off Weights & Biases logging before the trainer is built.
# NOTE(review): both environment variables are set before `import transformers`
# below; keep that order unless it is confirmed to be unnecessary.
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
import transformers
# Replace the library's wandb availability probe with one that always answers
# False. This patches a module attribute in place, so it only affects code that
# looks the function up through this module after this line has run.
transformers.integrations.integration_utils.is_wandb_available = lambda: False

from trl import SFTTrainer
from transformers import TrainingArguments

# Mixed precision: use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Supervised fine-tuning on the pre-rendered "text" column of each split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 2 sequences per step x 4 accumulation steps = 8 sequences per optimizer update.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=20,
        num_train_epochs=5,
        learning_rate=1e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # Evaluate every 50 steps and checkpoint every 100 steps.
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=100,
        # Documented switch for turning off every logging integration (wandb,
        # tensorboard, ...); does not rely on environment variables being set.
        report_to="none",
    ),
)

print("Trainer configured (5 epochs, lr=1e-4, r=32)")

In [None]:
# Cell 7: Train!
# Run the full training loop, then report the training loss from the returned stats.
print("Starting training...")
trainer_stats = trainer.train()
final_loss = trainer_stats.training_loss
print(f"Training complete! Loss: {final_loss:.4f}")

In [None]:
# Cell 8: Test the Model
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

test_messages = [
    {"role": "user", "content": "What is the PM-JAY rate for knee replacement?"}
]

# return_dict=True yields input_ids AND attention_mask, so generate() receives
# an explicit mask instead of having to guess one.
inputs = tokenizer.apply_chat_template(
    test_messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    do_sample=True,
)

# generate() returns prompt + continuation; drop the prompt tokens so that only
# the model's answer is printed under "Test response:".
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print("Test response:")
print(response)

In [None]:
# Cell 9: Export to GGUF (for local deployment)
export_dir = "medirep-voice"

print("Exporting to GGUF format...")
# q4_k_m: good balance of size/quality.
model.save_pretrained_gguf(export_dir, tokenizer, quantization_method="q4_k_m")
print(f"Exported to {export_dir}/")

In [None]:
# Cell 10: Download the Model
!zip -r medirep-voice.zip medirep-voice/
files.download("medirep-voice.zip")
print("Download complete! Extract and use with llama.cpp or llama-cpp-python")