## Setup: imports and environment checks

In [None]:
import sys, gc
from pathlib import Path
import json

# Make the repository root importable (same pattern as notebooks/01_data_factory.ipynb).
# When the kernel starts inside a directory named "notebooks", the root is one level up;
# otherwise the current working directory is used as the root.
project_root = Path.cwd()
if project_root.name == "notebooks":
    project_root = project_root.parent
sys.path.insert(0, str(project_root))

import torch

def print_vram():
    """Print a one-line summary of GPU memory usage.

    Reports three figures, each divided by 1024**3 and labelled "GB":
    memory currently allocated, memory currently reserved (both as returned by
    ``torch.cuda.memory_allocated`` / ``torch.cuda.memory_reserved`` for the
    current device), and the total memory of CUDA device 0.

    Prints nothing when CUDA is unavailable.

    Returns:
        None
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1024**3
    alloc = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    # The device-properties attribute is `total_memory`; the previous `total_mem`
    # spelling raised AttributeError whenever a GPU was actually present.
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    print(f"  VRAM: {alloc:.1f}GB allocated / {reserved:.1f}GB reserved / {total:.1f}GB total")

def clear_vram():
    """Run Python garbage collection, then empty the CUDA cache if a GPU is available.

    Returns:
        None
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Environment report: project location, torch build, and CUDA visibility.
print(f"✓ Project root: {project_root}")
print(f"✓ torch: {torch.__version__}")
print(f"✓ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
    print_vram()

# QLoRA needs bitsandbytes (usually Linux/Colab).
# The catch is deliberately broad so that any import-time failure is reported
# here instead of aborting the cell.
try:
    import bitsandbytes as bnb  # noqa: F401
except Exception as exc:
    print("✗ bitsandbytes import failed. If you are on Windows, run this notebook in Colab or WSL.")
    print("  Error:", repr(exc))
else:
    print("✓ bitsandbytes import: OK")


## Hugging Face Login

In [None]:
from huggingface_hub import login

# Token sources are tried in order: Colab secret -> HF_TOKEN env var -> interactive prompt.
# Any failure in the Colab path (including not running on Colab at all) falls through
# to the next source.
try:
    # Option 1: Colab secrets (recommended)
    from google.colab import userdata
    hf_token = userdata.get("HF_TOKEN")
    login(token=hf_token)
    print("✓ Logged in to Hugging Face via Colab secret (HF_TOKEN)")
except Exception:
    import os
    hf_token = os.environ.get("HF_TOKEN", "")
    if not hf_token:
        # Option 3: interactive login (will prompt for token)
        print("⚠ No HF_TOKEN found. Running interactive login...")
        print("  Get your token at: https://huggingface.co/settings/tokens")
        login()
        print("✓ Logged in to Hugging Face interactively")
    else:
        # Option 2: environment variable
        login(token=hf_token)
        print("✓ Logged in to Hugging Face via HF_TOKEN env variable")

## Load config + paths

In [None]:
from src.utils.config_loader import load_config
from src.finetuning.data import resolve_finetune_paths

# Load the project config and derive the fine-tuning input/output locations from it.
config_path = project_root.joinpath("config", "config.yaml")
config = load_config(config_path)
paths = resolve_finetune_paths(config)

print("✓ Base model:", config["finetuning"]["base_model"])
print("✓ Train file:", paths.train_file)
print("✓ Eval file:", paths.eval_file)
print("✓ Output dir:", paths.output_dir)

# Existence check; the eval file may be unset, in which case None is reported.
print("✓ Files exist:")
print("  - train:", paths.train_file.exists())
if paths.eval_file:
    print("  - eval:", paths.eval_file.exists())
else:
    print("  - eval:", None)


## Setup: BitsAndBytesConfig + LoraConfig (QLoRA)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTConfig, SFTTrainer

from src.finetuning.data import load_sft_datasets

# Pull the fine-tuning sections of the config once; the optional sub-sections
# default to empty dicts so every lookup below can supply its own fallback.
ft_cfg = config["finetuning"]
tr_cfg = ft_cfg["training"]
quant_cfg = ft_cfg.get("quantization", {})
lora_cfg = ft_cfg.get("lora", {})

base_model = ft_cfg["base_model"]

# Resolve the 4-bit compute dtype from its config spelling. Previously every value
# other than "float16"/"fp16" (e.g. "float32", or a typo) silently became bfloat16.
# Known spellings now map explicitly; unknown ones still fall back to bfloat16 so
# existing configs keep working, but the fallback is reported.
_COMPUTE_DTYPES = {
    "float16": torch.float16,
    "fp16": torch.float16,
    "half": torch.float16,
    "bfloat16": torch.bfloat16,
    "bf16": torch.bfloat16,
    "float32": torch.float32,
    "fp32": torch.float32,
}
compute_dtype_str = quant_cfg.get("compute_dtype", "float16")
_dtype_key = str(compute_dtype_str).strip().lower()
if _dtype_key in _COMPUTE_DTYPES:
    compute_dtype = _COMPUTE_DTYPES[_dtype_key]
else:
    print(f"⚠ Unrecognized compute_dtype {compute_dtype_str!r}; falling back to bfloat16")
    compute_dtype = torch.bfloat16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type=quant_cfg.get("quant_type", "nf4"),
    bnb_4bit_use_double_quant=bool(quant_cfg.get("double_quant", True)),
    bnb_4bit_compute_dtype=compute_dtype,
)

lora_config = LoraConfig(
    r=int(lora_cfg.get("r", 16)),
    lora_alpha=int(lora_cfg.get("alpha", 32)),
    lora_dropout=float(lora_cfg.get("dropout", 0.05)),
    target_modules=list(lora_cfg.get("target_modules", ["q_proj", "k_proj", "v_proj", "o_proj"])),
    bias=str(lora_cfg.get("bias", "none")),
    task_type=str(lora_cfg.get("task_type", "CAUSAL_LM")),
)

print("✓ BitsAndBytesConfig:", bnb_config)
print("✓ LoraConfig target_modules:", lora_config.target_modules)

# Tokenizer
# NOTE(review): trust_remote_code defaults to True when the config omits it —
# confirm that running repository-provided code is intended for this base model.
hf_provider_cfg = config.get("providers", {}).get("huggingface", {})
trust_remote_code = bool(hf_provider_cfg.get("trust_remote_code", True))
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=trust_remote_code,
    use_fast=True,
)
# Reuse EOS for padding when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Datasets for SFT; the trainer cell later reads their `text` field.
max_seq_length = int(tr_cfg.get("max_seq_length", 2048))
ds = load_sft_datasets(
    tokenizer=tokenizer,
    train_file=paths.train_file,
    eval_file=paths.eval_file,
    max_seq_length=max_seq_length,
)

# Release any leftover GPU memory before the base model is loaded.
clear_vram()

# Base model quantized via bnb_config, then wrapped with the LoRA adapters.
model_load_kwargs = dict(
    quantization_config=bnb_config,
    device_map=config.get("providers", {}).get("huggingface", {}).get("device_map", "auto"),
    trust_remote_code=trust_remote_code,
    torch_dtype=compute_dtype,
    low_cpu_mem_usage=True,
)
model = AutoModelForCausalLM.from_pretrained(base_model, **model_load_kwargs)
# NOTE(review): cache disabled for training — presumably because it conflicts with
# the gradient checkpointing enabled in the trainer cell; confirm.
model.config.use_cache = False

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Count all parameter elements and the subset that will receive gradients.
total = sum(param.numel() for param in model.parameters())
trainable = sum(param.numel() for param in model.parameters() if param.requires_grad)

print("✓ Model prepared for QLoRA")
print(f"  Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")
print_vram()

## Training: SFTTrainer loop (≥ 100 steps)

In [None]:
# For speed while testing, you can uncomment the next two lines.
# ds["train"] = ds["train"].select(range(min(256, len(ds["train"]))))
# if "eval" in ds: ds["eval"] = ds["eval"].select(range(min(64, len(ds["eval"]))))

out_dir = Path(paths.output_dir)
out_dir.mkdir(parents=True, exist_ok=True)

# The run is never shorter than `min_steps`, even if the config requests fewer steps.
min_steps = int(tr_cfg.get("min_steps", 100))
max_steps = max(min_steps, int(tr_cfg.get("max_steps", min_steps)))

sft_args = SFTConfig(
    output_dir=str(out_dir),
    # Per the transformers TrainingArguments docs, a positive max_steps overrides
    # num_train_epochs, so the epoch count below only matters if max_steps <= 0.
    max_steps=max_steps,
    num_train_epochs=float(tr_cfg.get("num_epochs", 1)),
    per_device_train_batch_size=int(tr_cfg.get("batch_size", 1)),
    gradient_accumulation_steps=int(tr_cfg.get("gradient_accumulation_steps", 1)),
    learning_rate=float(tr_cfg.get("learning_rate", 2e-4)),
    warmup_ratio=float(tr_cfg.get("warmup_ratio", 0.03)),
    logging_steps=int(tr_cfg.get("logging_steps", 10)),
    save_steps=int(tr_cfg.get("save_steps", 50)),
    max_seq_length=max_seq_length,
    # SFT-specific options belong on SFTConfig; passing dataset_text_field to
    # SFTTrainer directly is deprecated and rejected by newer TRL releases.
    dataset_text_field="text",
    packing=False,
    # Mixed precision follows the quantization compute dtype; both flags are False
    # for any other dtype.
    fp16=compute_dtype == torch.float16,
    bf16=compute_dtype == torch.bfloat16,
    gradient_checkpointing=True,
    optim=str(tr_cfg.get("optim", "paged_adamw_8bit")),  # 8-bit optimizer saves ~1-2GB VRAM
    report_to=[],
)

# NOTE(review): newer TRL releases rename `tokenizer=` to `processing_class=` —
# confirm against the TRL version this project pins.
trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=ds["train"],
    eval_dataset=ds.get("eval"),
    tokenizer=tokenizer,
)

print(f"✓ SFTTrainer ready — batch_size={sft_args.per_device_train_batch_size}, "
      f"grad_accum={sft_args.gradient_accumulation_steps}, "
      f"max_seq_length={sft_args.max_seq_length}, "
      f"optim={sft_args.optim}")
print_vram()

# Run the training loop and report the result plus memory usage afterwards.
train_result = trainer.train()
print("✓ Train result:", train_result)
print_vram()

# Save adapters + tokenizer side by side in the output directory.
adapter_dir = out_dir
trainer.model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))

print("✓ Saved adapters to:", adapter_dir)
print("✓ Adapter files:")
for saved_entry in sorted(Path(adapter_dir).glob("*")):
    print(" -", saved_entry.name)


## Training: loss curve

In [None]:
import matplotlib.pyplot as plt

# Keep only the log entries that carry both a training loss and a step number.
log_history = getattr(trainer.state, "log_history", [])
loss_records = [record for record in log_history if "loss" in record and "step" in record]
steps = [record["step"] for record in loss_records]
losses = [record["loss"] for record in loss_records]

if losses:
    plt.figure(figsize=(8, 4))
    plt.plot(steps, losses)
    plt.title("Training Loss")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.grid(True)
    plt.show()
else:
    print("No loss entries found in trainer.state.log_history (try lowering logging_steps).")


## Inference pipeline: query_intern(question)

In [None]:
#
# Free the training model from VRAM first, then load the saved adapters
# through the standalone query_intern() inference pipeline.

# --- Free training model to reclaim VRAM ---
# Drop the notebook-level references with pop() rather than `del`, so re-running
# this cell after the names are already gone does not raise NameError.
for _stale_name in ("trainer", "model"):
    globals().pop(_stale_name, None)
# clear_vram() (defined in the setup cell) runs gc.collect() and empties the CUDA cache.
clear_vram()
print("✓ Training model freed from VRAM")
print_vram()

# --- Load inference pipeline ---
from src.finetuning.intern_inference import query_intern

# ──────────────────────────────────────────────────────────────
# Demo 1: Simple call — query_intern(question)
# This is the primary API. It loads base model + saved LoRA
# adapters automatically from config defaults.
# ──────────────────────────────────────────────────────────────
print("=" * 60)
print("Demo 1: query_intern(question)  [no context]")
print("=" * 60)
# Single source of truth for the question, so the printed text always matches
# what was actually sent to the model.
demo1_question = "What is Uber's Adjusted EBITDA for 2024?"
answer = query_intern(demo1_question)
print(f"\nQuestion: {demo1_question}")
print(f"Answer:   {answer}")

# ──────────────────────────────────────────────────────────────
# Demo 2: With context — query_intern(question, chunk_text=...)
# Pass a chunk as grounding context (used during RAG / eval).
# ──────────────────────────────────────────────────────────────
print("\n" + "=" * 60)
print("Demo 2: query_intern(question, chunk_text=...)  [with context]")
print("=" * 60)

# Take the first non-blank JSONL record of the eval file, if there is one.
# An empty (or blank-only) file leaves `example` as None and uses the fallback
# question below, instead of raising StopIteration / JSONDecodeError.
example = None
if paths.eval_file and paths.eval_file.exists():
    with open(paths.eval_file, "r", encoding="utf-8") as f:
        first_record = next((line for line in f if line.strip()), None)
    if first_record is not None:
        example = json.loads(first_record)

if example:
    q = example["question"]
    ctx = example.get("chunk_text")
    gt = example.get("answer")

    pred = query_intern(q, chunk_text=ctx, config_path=config_path)

    print(f"\nQuestion:     {q}")
    print(f"Ground truth: {gt}")
    print(f"Intern:       {pred}")
else:
    pred = query_intern("What is Adjusted EBITDA?", config_path=config_path)
    print(f"\nIntern: {pred}")

# ──────────────────────────────────────────────────────────────
# Demo 3: "Information not available" — question outside context
# ──────────────────────────────────────────────────────────────
print("\n" + "=" * 60)
print("Demo 3: Out-of-scope question (should say info not available)")
print("=" * 60)
# The question and context are defined once so the echoed text cannot drift
# from what is passed to query_intern().
oos_question = "What was Apple's revenue in 2024?"
oos_context = "Uber's total revenue for 2024 was $43.9 billion."
oos = query_intern(
    oos_question,
    chunk_text=oos_context,
    config_path=config_path,
)
print(f"\nQuestion: {oos_question}")
print(f"Context:  {oos_context}")
print(f"Intern:   {oos}")

print("\n✓ Inference pipeline verified")
print_vram()


In [None]:
## (Optional) Format preview: inspect a single SFT training text

from transformers import AutoTokenizer
from src.finetuning.data import format_sft_record

# Rebuild the tokenizer the same way the setup cell does (same trust_remote_code
# lookup), so this optional cell also works on its own after the config cell.
base_model = config["finetuning"]["base_model"]
trust_remote_code = bool(config.get("providers", {}).get("huggingface", {}).get("trust_remote_code", True))
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True, trust_remote_code=trust_remote_code)

# Read the first non-blank JSONL record; fail with a clear message if there is none.
with open(paths.train_file, "r", encoding="utf-8") as f:
    first_record = next((line for line in f if line.strip()), None)
if first_record is None:
    raise ValueError(f"Training file has no records: {paths.train_file}")
rec = json.loads(first_record)

# Same default and int cast as the setup cell, so a config without max_seq_length
# does not raise KeyError here.
preview_max_seq_length = int(config["finetuning"]["training"].get("max_seq_length", 2048))
formatted = format_sft_record(rec, tokenizer, max_seq_length=preview_max_seq_length)
# Show only the first 1200 characters of the formatted training text.
print(formatted["text"][:1200])
