In [None]:
import os
import wandb
import builtins
builtins.wandb = wandb  # Make wandb globally available (workaround for trl bug)

# Credentials and W&B settings come from the environment; nothing is hard-coded here.
HF_USERNAME   = os.environ.get("HF_USERNAME")
HF_TOKEN      = os.environ.get("HF_TOKEN")
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
WANDB_PROJECT = os.environ.get("WANDB_PROJECT")
WANDB_WATCH   = os.environ.get("WANDB_WATCH")

# Fail fast on the first required variable that is missing or empty (checked in this order).
for _env_name, _env_value in (
    ("HF_USERNAME", HF_USERNAME),
    ("HF_TOKEN", HF_TOKEN),
    ("WANDB_API_KEY", WANDB_API_KEY),
):
    if not _env_value:
        raise ValueError(f"Set the environment variable {_env_name}")

# Authenticate with Weights & Biases using the key read above.
wandb.login(key=WANDB_API_KEY)
print("Wandb: logged in ✓")

In [None]:
# ================== EXPERIMENT CONFIG ==================

# Name of the experiment to run; must be one of the EXPERIMENT_CONFIG keys:
# "1B_QLORA", "1B_LORA", "3B_QLORA"
EXPERIMENT = "1B_LORA"

# One entry per experiment. The two 1B variants share a Hub repo and are told
# apart by their "revision"; the 3B variant has its own repo.
EXPERIMENT_CONFIG = {
    "1B_QLORA": dict(
        model_size="1B",
        model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
        load_in_4bit=True,
        hf_repo=f"{HF_USERNAME}/llama3-1b-finetome",
        run_name="1B_QLoRA_FineTome_N1000",
        revision="1B_QLoRA_N1000",
    ),
    "1B_LORA": dict(
        model_size="1B",
        model_name="unsloth/Llama-3.2-1B-Instruct",
        load_in_4bit=False,
        hf_repo=f"{HF_USERNAME}/llama3-1b-finetome",
        run_name="1B_LoRA_FineTome_N1000",
        revision="1B_LoRA_N1000",
    ),
    "3B_QLORA": dict(
        model_size="3B",
        model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
        load_in_4bit=True,
        hf_repo=f"{HF_USERNAME}/llama3-3b-finetome",
        run_name="3B_QLoRA_FineTome_N1000",
        revision="3B_QLoRA_N1000",
    ),
}

# Settings of the selected experiment (KeyError if EXPERIMENT is not a known key).
CFG = EXPERIMENT_CONFIG[EXPERIMENT]

# Expose the selected settings as module-level constants for the cells below.
MODEL_NAME, HF_REPO, RUN_NAME, REVISION, LOAD_IN_4BIT, MODEL_SIZE = (
    CFG[field]
    for field in (
        "model_name",
        "hf_repo",
        "run_name",
        "revision",
        "load_in_4bit",
        "model_size",
    )
)

print(
    f"Experiment: {EXPERIMENT}",
    f"Model name: {MODEL_NAME}",
    f"HF repo:    {HF_REPO}",
    f"Revision:   {REVISION}",
    sep="\n",
)


In [None]:
import torch

# Report the PyTorch build and whether it can see a CUDA device.
print("PyTorch version:", torch.__version__)
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)

if not cuda_available:
    print("⚠️ No GPU detected by PyTorch: please check your drivers and installation.")
else:
    # Device index 0 is the first visible GPU.
    print("GPU name:", torch.cuda.get_device_name(0))


In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import tqdm as notebook_tqdm

# Load Base Model


In [None]:
# Maximum sequence length passed to both the model loader and the trainer;
# a trade-off between training speed and quality.
max_seq_length = 2048

print("Loading model:", MODEL_NAME)

# Load the base model and its tokenizer. LOAD_IN_4BIT comes from the selected
# experiment and decides whether the weights are loaded 4-bit quantized.
load_kwargs = dict(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=LOAD_IN_4BIT,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the base model with LoRA adapters (rank 16, alpha 16, no dropout, no bias terms).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print("Model ready.")


# Prepare FineTome Dataset


In [None]:
# === DATASET FineTome-100k ===

# Number of rows kept from the start of the train split for this run.
DATASET_SAMPLE_SIZE = 1000

# Alpaca-style template with three positional slots, in order:
# instruction, input, response. formatting_prompts_func always fills the
# input slot with an empty string.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token of the loaded tokenizer; appended to every formatted example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render each conversation's first user/assistant exchange as Alpaca text.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict with a "conversations" column. Each conversation
            is a list of turn dicts carrying the message under "value" and,
            optionally, the speaker under "from".

    Returns:
        ``{"text": [...]}`` with one formatted string (terminated by
        ``EOS_TOKEN``) per usable conversation. Malformed conversations are
        skipped, so the list may be shorter than the input batch.

    Raises:
        KeyError: If the batch has no "conversations" column.
    """
    texts = []
    for convo in examples["conversations"]:
        try:
            # Drop system turns so that positions 0 and 1 really are the user
            # prompt and the assistant reply; otherwise a leading system turn
            # would be used as the instruction and the user text as the answer.
            turns = [turn for turn in convo if turn.get("from") != "system"]
            user_msg = turns[0]["value"]  # user
            asst_msg = turns[1]["value"]  # model
        except (AttributeError, IndexError, KeyError, TypeError):
            # Malformed row (not a list of dicts, fewer than two turns, or a
            # turn without "value"): skip it.
            continue
        # Formatting stays outside the try so that genuine errors (e.g. a
        # missing EOS token) surface instead of silently emptying the dataset.
        texts.append(alpaca_prompt.format(user_msg, "", asst_msg) + EOS_TOKEN)
    return {"text": texts}

# Fetch the FineTome-100k train split.
print("Loading FineTome-100k dataset...")
raw_dataset = load_dataset("mlabonne/FineTome-100k", split="train")
print("Original size:", len(raw_dataset))

# Keep only the first DATASET_SAMPLE_SIZE rows.
sampled_dataset = raw_dataset.select(range(DATASET_SAMPLE_SIZE))
print("After subset:", len(sampled_dataset))

# Replace every original column with the single formatted "text" column.
dataset = sampled_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=sampled_dataset.column_names,
)
print(dataset[0])

# Training with SFTTrainer


In [None]:
# === TRAINING ===

# Training parameters
batch_size = 1
gradient_accumulation_steps = 4

# Checkpoints for this experiment are written below this directory.
output_dir = f"outputs/{EXPERIMENT}"

MAX_STEPS = 120


def _find_latest_checkpoint(directory):
    """Return the path of the highest-step ``checkpoint-<step>`` subdirectory.

    Only directories whose name is exactly ``checkpoint-`` followed by an
    integer are considered. Stray files and names such as ``checkpoint-tmp``
    are ignored instead of crashing the step parsing.

    Args:
        directory: Existing directory to scan.

    Returns:
        Path of the latest checkpoint directory, or None if there is none.
    """
    latest_step, latest_path = -1, None
    for entry in os.listdir(directory):
        prefix, _, step = entry.partition("-")
        candidate = os.path.join(directory, entry)
        if prefix != "checkpoint" or not step.isdecimal() or not os.path.isdir(candidate):
            continue
        if int(step) > latest_step:
            latest_step, latest_path = int(step), candidate
    return latest_path


# === CHECKPOINT RESUME LOGIC ===
resume_from_checkpoint = None

if os.path.exists(output_dir):
    resume_from_checkpoint = _find_latest_checkpoint(output_dir)
    if resume_from_checkpoint:
        print(f"✓ Resuming from checkpoint: {resume_from_checkpoint}")
    else:
        print("→ Starting fresh training - no checkpoints found in output dir")
else:
    print("→ Starting fresh training - output directory does not exist yet")

# Training arguments

# Decide the mixed-precision mode once so fp16 and bf16 can never disagree.
# Uses the Unsloth helper imported alongside FastLanguageModel.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir=output_dir,
    # Use the parameters declared at the top of this cell, so editing them
    # actually changes the run.
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=5,
    max_steps=MAX_STEPS,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=5,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    # Checkpoint every 20 steps and keep only the 3 most recent.
    save_steps=20,
    save_total_limit=3,
    report_to=["wandb"],
    run_name=RUN_NAME,
)

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = "text",
    max_seq_length     = max_seq_length,
    dataset_num_proc   = 2,
    packing            = False,
    args = training_args,
)

# Train; resume_from_checkpoint is None for a fresh run, otherwise the path of
# the checkpoint to continue from.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)


## Quick Test


In [None]:
from unsloth import FastLanguageModel

# Switch the fine-tuned model to Unsloth's inference mode before calling
# generate() below (the original note says this is required for Unsloth).
FastLanguageModel.for_inference(model)

def chat(prompt: str) -> str:
    """Generate the assistant's reply to a single user prompt.

    The prompt is wrapped in a system + user conversation, rendered with the
    tokenizer's chat template, and completed by sampling from the global
    ``model`` (temperature 0.7, top-p 0.9, at most 500 new tokens).

    Args:
        prompt: The user's message.

    Returns:
        Only the newly generated text. The rendered system/user prompt that
        ``generate`` echoes at the start of its output is stripped.
    """
    # 1) Structured conversation: system + user
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": prompt},
    ]

    # 2) Apply model's chat template (uses correct tokens)
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Create attention mask (all 1s since no padding)
    attention_mask = torch.ones_like(input_ids)

    # 3) Generation
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=500,   # enough space for complete answers
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # 4) Decode only the continuation: the output sequence starts with the
    #    prompt tokens, which are not part of the reply.
    prompt_length = input_ids.shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Smoke test: ask one question and print whatever chat() returns.
test_prompt = "Explain the difference between supervised fine-tuning and preference alignment in simple terms."
response = chat(test_prompt)
print(response)


# Push Adapters to HF


In [None]:
# === PUSH ADAPTERS + TOKENIZER TO HF ===

# Upload the model first, then the tokenizer, to the same repo and revision.
# NOTE: there is no interactive confirmation; running this cell pushes immediately.
hub_kwargs = dict(token=HF_TOKEN, revision=REVISION)
for uploadable in (model, tokenizer):
    uploadable.push_to_hub(HF_REPO, **hub_kwargs)

print("Adapters and tokenizer uploaded to:", HF_REPO)


## Save Adapters Locally


In [None]:
# === SAVE ADAPTERS LOCALLY ===

# NOTE: this rebinds `output_dir`, the same name the training cell uses for its checkpoints.
output_dir = f"models/adapters_{EXPERIMENT}"
os.makedirs(output_dir, exist_ok=True)

# Write the model first, then the tokenizer, into the same folder.
for saveable in (model, tokenizer):
    saveable.save_pretrained(output_dir)

print("Saved locally in:", output_dir)


# Export model in GGUF


## Pre-download the full-precision base model (first time only)


In [None]:
# === PRE-DOWNLOAD BASE MODEL (16-bit) FOR GGUF ===
# Run this ONCE - later runs find the files already in the Hugging Face cache.

from huggingface_hub import snapshot_download

# Strip the "-bnb-4bit" suffix to get the repo id without quantization.
base_model_id = MODEL_NAME.replace("-bnb-4bit", "")

print(f"Downloading {base_model_id} to cache...")
# `resume_download` and `local_dir_use_symlinks` are intentionally not passed:
# huggingface_hub documents both as deprecated, and `local_dir_use_symlinks`
# only ever applied together with `local_dir`, which is not used here.
snapshot_download(repo_id=base_model_id)
print("✅ Base model cached! Next GGUF exports will be faster.")


## Convert to GGUF and push to HF


In [None]:
# === EXPORT GGUF ===

# This cell exports the in-memory `model` and `tokenizer` directly.
# To free RAM first, delete the model and trainer and reload them with
# FastLanguageModel.from_pretrained (pointing model_name at the pushed adapter
# repo) before running it.
# NOTE: there is no interactive confirmation; running this cell pushes to the Hub.

# Quantization method shared by the local save and the Hub upload.
GGUF_QUANT_METHOD = "q4_k_m"

gguf_dir = f"models/gguf_{EXPERIMENT}"
os.makedirs(gguf_dir, exist_ok=True)

# Step 1: write the GGUF file locally.
print("Saving GGUF locally to:", gguf_dir)
model.save_pretrained_gguf(gguf_dir, tokenizer, quantization_method=GGUF_QUANT_METHOD)
print(f"✅ GGUF saved locally in: {gguf_dir}")

# Step 2: upload a GGUF export to the experiment's repo and revision.
print("Exporting GGUF to:", HF_REPO)
model.push_to_hub_gguf(
    HF_REPO,
    tokenizer,
    quantization_method=GGUF_QUANT_METHOD,
    token=HF_TOKEN,
    revision=REVISION,
)
print("✅ GGUF exported. Go to HF -> Files and versions to see the .gguf file")
