# QLoRA Train + Eval


## Summary
- Purpose: train a QLoRA adapter and evaluate it with the same harness used for baseline runs.
- Scope: includes data checks, experiment configuration, training, and evaluation planning.
- Outputs: adapter checkpoints plus evaluation JSON files under `results/adapters/` and `results/qlora/`.


## 0) Bootstrap


In [None]:

%%bash
# Abort the cell on the first failing command.
set -e
# Give pip a longer network timeout than its default.
export PIP_DEFAULT_TIMEOUT=120

# Remove preinstalled packages that may conflict with the pins below.
# Best-effort: a package that is not installed must not abort the cell.
pip uninstall -y \
  torch torchvision torchaudio bitsandbytes triton \
  transformers accelerate peft trl datasets \
  numpy pandas fsspec requests google-auth \
  || true

# Base dependencies (pinned).
pip install -q --no-cache-dir --force-reinstall \
  numpy==1.26.4 pandas==2.2.1 fsspec==2024.5.0 requests==2.31.0 google-auth==2.43.0

# PyTorch stack built against CUDA 12.1, from the PyTorch wheel index.
pip install -q --no-cache-dir --force-reinstall \
  torch==2.3.1+cu121 torchvision==0.18.1+cu121 torchaudio==2.3.1+cu121 \
  --index-url https://download.pytorch.org/whl/cu121

# Quantization (bitsandbytes + triton) and the Hugging Face training stack.
pip install -q --no-cache-dir --force-reinstall \
  bitsandbytes==0.43.3 triton==2.3.1 \
  transformers==4.44.2 accelerate==0.33.0 peft==0.17.0 trl==0.9.6 datasets==2.20.0

echo "Setup complete. Restart runtime once, then run the rest of the notebook."


## 1) Sync Repo


In [None]:
import os, sys, shutil
from pathlib import Path

# If opened directly in Colab, clone the repo first
if Path("data/classicmodels_test_200.json").exists() is False and Path("/content").exists():
    repo_dir = Path("/content/NLtoSQL")
    if repo_dir.exists():
        shutil.rmtree(repo_dir)
    !git clone https://github.com/MacKenzieOBrian/NLtoSQL.git "{repo_dir}"
    os.chdir(repo_dir)

sys.path.insert(0, os.getcwd())
print("cwd:", os.getcwd())

# Ensure DB/eval extras are present (Cloud SQL connector, SQLAlchemy, PyMySQL).
!pip -q install -r requirements.txt
print("requirements.txt installed")


## 2) Auth


In [None]:
# GCP auth (Colab) - safe to skip locally if using ADC
try:
    from google.colab import auth
except ModuleNotFoundError:
    # Not a Colab runtime; fall through to the ADC hint below.
    auth = None
if auth:
    auth.authenticate_user()
else:
    print("Not running in Colab; ensure ADC/service account auth is configured.")

# Hugging Face auth
# Accept the current variable name, the legacy name, and the underscore-less
# spelling this notebook historically used.
hf_token = (
    os.getenv("HF_TOKEN")
    or os.getenv("HUGGING_FACE_HUB_TOKEN")
    or os.getenv("HUGGINGFACE_HUB_TOKEN")
)
if hf_token:
    # huggingface_hub resolves `token=True` from HF_TOKEN; the underscore-less
    # spelling is not one it reads, so re-export under the recognised name.
    os.environ["HF_TOKEN"] = hf_token
    # Kept for backward compatibility with anything that reads the old name.
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
    print("Using HF token from env")
else:
    try:
        from huggingface_hub import notebook_login
        notebook_login()
    except Exception as e:
        # Deliberate best-effort: a missing package or non-interactive session
        # should not stop the notebook; gated models will fail later with a
        # clear authorization error instead.
        print("HF auth not configured:", e)


## 3) Load Data


In [None]:
import json
from pathlib import Path

# Dataset locations, relative to the repository root.
test_path = Path("data/classicmodels_test_200.json")
train_path = Path("data/train/classicmodels_train_200.jsonl")

# The benchmark is a single JSON document (later cells treat it as a list of
# items carrying an "nlq" key).
test_set = json.loads(test_path.read_text(encoding="utf-8"))
print("Test items:", len(test_set))

# Fail early with an actionable message when the training file is absent.
if not train_path.exists():
    raise FileNotFoundError(
        f"Missing training set at {train_path}. Create it before running QLoRA. "
        "Expected JSONL lines with keys: nlq, sql."
    )

# JSONL: one JSON object per line; blank or whitespace-only lines are ignored.
stripped_lines = (
    raw.strip() for raw in train_path.read_text(encoding="utf-8").splitlines()
)
train_records = [json.loads(entry) for entry in stripped_lines if entry]

print("Train items:", len(train_records))


## 3A) Leakage Guard


In [None]:
# Exact-match leakage check on whitespace-trimmed questions: training must not
# contain any NLQ that also appears in the benchmark.
test_nlqs = {item["nlq"].strip() for item in test_set}
train_nlqs = [r.get("nlq", "").strip() for r in train_records]
overlap = sorted(set(train_nlqs) & test_nlqs)

print("NLQ overlap count:", len(overlap))
if overlap:
    # Show at most ten offenders, then stop the notebook before training.
    print("Example overlaps:")
    print("\n".join(f"- {duplicate}" for duplicate in overlap[:10]))
    raise ValueError("Training set overlaps test set; remove overlapping items before training.")


## 4) DB + Schema


In [None]:
from getpass import getpass
from nl2sql.db import create_engine_with_connector
from nl2sql.schema import build_schema_summary

# Connection settings: environment variables win; otherwise prompt
# interactively (the password prompt does not echo).
INSTANCE_CONNECTION_NAME = (
    os.getenv("INSTANCE_CONNECTION_NAME")
    or input("Enter INSTANCE_CONNECTION_NAME: ").strip()
)
DB_USER = os.getenv("DB_USER") or input("Enter DB_USER: ").strip()
DB_PASS = os.getenv("DB_PASS") or getpass("Enter DB_PASS: ")
DB_NAME = os.getenv("DB_NAME", "classicmodels")

# Both the engine and the connector are kept as globals for later cells.
connection_settings = {
    "instance_connection_name": INSTANCE_CONNECTION_NAME,
    "user": DB_USER,
    "password": DB_PASS,
    "db_name": DB_NAME,
}
engine, connector = create_engine_with_connector(**connection_settings)

# Text summary of the schema, reused in training prompts and evaluation.
SCHEMA_SUMMARY = build_schema_summary(
    engine,
    db_name=DB_NAME,
    max_cols_per_table=50,
)
print("Schema summary length:", len(SCHEMA_SUMMARY))


## 5) Experiment Config


In [None]:
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# ============================
# Experiment Registry (edit here)
# ============================
# Each preset bundles a model with its training knobs; switch only
# ACTIVE_EXPERIMENT for a clean model swap.
# Provenance: the defaults follow QLoRA/LoRA practice + HF PEFT examples and
# are held constant across llama/qwen for a fair comparison.
# batch/accum/seq are constrained by Colab GPU memory limits.


def _make_preset(label, model_id, adapter_output_dir):
    """Return a fresh preset dict with the shared LoRA/training knobs filled in."""
    return {
        "label": label,
        "model_id": model_id,
        "adapter_output_dir": adapter_output_dir,
        "lora_r": 32,
        "lora_alpha": 64,
        "lora_dropout": 0.05,
        "target_modules": ["q_proj", "v_proj"],
        "train_batch_size": 1,
        "grad_accum_steps": 8,
        "learning_rate": 1e-4,
        "num_train_epochs": 3,
        "warmup_ratio": 0.05,
        "max_seq_length": 1024,
        "save_steps": 200,
        "save_total_limit": 2,
    }


EXPERIMENT_PRESETS = {
    "llama3_8b": _make_preset(
        "Llama-3-8B QLoRA",
        "meta-llama/Meta-Llama-3-8B-Instruct",
        "results/adapters/qlora_llama3_8b_classicmodels",
    ),
    "qwen2_5_7b": _make_preset(
        "Qwen2.5-7B QLoRA",
        "Qwen/Qwen2.5-7B-Instruct",
        "results/adapters/qlora_qwen2_5_7b_classicmodels",
    ),
}

# Change this one line per training run:
ACTIVE_EXPERIMENT = "qwen2_5_7b"  # options: llama3_8b, qwen2_5_7b

EXPERIMENT_CONFIG = EXPERIMENT_PRESETS[ACTIVE_EXPERIMENT]
MODEL_ID = EXPERIMENT_CONFIG["model_id"]
output_dir = EXPERIMENT_CONFIG["adapter_output_dir"]

# Keep these knobs consistent across models unless you are intentionally
# running an ablation.
print("Experiment:", EXPERIMENT_CONFIG["label"])
print("MODEL_ID:", MODEL_ID)
print("Adapter output:", output_dir)

# Short display name -> preset key, in the order the knobs are printed.
_KNOB_DISPLAY = {
    "batch": "train_batch_size",
    "grad_accum": "grad_accum_steps",
    "lr": "learning_rate",
    "epochs": "num_train_epochs",
    "max_seq": "max_seq_length",
    "lora_r": "lora_r",
    "lora_alpha": "lora_alpha",
    "lora_dropout": "lora_dropout",
    "target_modules": "target_modules",
}
print(
    "Train knobs:",
    {shown: EXPERIMENT_CONFIG[key] for shown, key in _KNOB_DISPLAY.items()},
)

# Training needs a CUDA device; stop with a pointer to the Colab setting if absent.
if not torch.cuda.is_available():
    raise RuntimeError(
        "CUDA is not available. In Colab, switch to GPU runtime: Runtime -> Change runtime type -> GPU."
    )

# Choose the compute precision from the compute capability of device 0:
# major version 8 or newer selects bf16, anything older falls back to fp16.
cc_major, cc_minor = torch.cuda.get_device_capability(0)
use_bf16 = cc_major >= 8
if use_bf16:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

print("GPU:", torch.cuda.get_device_name(0))
print("Compute capability:", (cc_major, cc_minor))
print("Using bf16:", use_bf16, "| compute_dtype:", compute_dtype)

# Tokenizer for the selected base model; reuse EOS as the padding token when
# the tokenizer does not define one.
tok = AutoTokenizer.from_pretrained(MODEL_ID, token=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# 4-bit NF4 + double quantization comes from QLoRA, to fit 7B/8B models in VRAM.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Map every module (the "" key) to device index 0.
device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=True,
    device_map=device_map,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
)

# Deterministic defaults for later evaluation
for _gen_attr, _gen_value in (
    ("do_sample", False),
    ("temperature", 1.0),
    ("top_p", 1.0),
    ("top_k", 50),
):
    setattr(base_model.generation_config, _gen_attr, _gen_value)

base_model = prepare_model_for_kbit_training(base_model)

# LoRA r/alpha/dropout + q_proj/v_proj are standard PEFT choices for causal-LM
# tuning; the concrete values come from the active preset.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=EXPERIMENT_CONFIG["lora_r"],
    lora_alpha=EXPERIMENT_CONFIG["lora_alpha"],
    lora_dropout=EXPERIMENT_CONFIG["lora_dropout"],
    target_modules=EXPERIMENT_CONFIG["target_modules"],
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Make sure the adapter output directory exists before training writes to it.
Path(output_dir).mkdir(parents=True, exist_ok=True)


## 6) Build SFT Data


In [None]:
from datasets import Dataset
from nl2sql.prompting import SYSTEM_INSTRUCTIONS

def format_example(nlq: str, sql: str) -> str:
    """Render one (question, SQL) pair as a chat-formatted SFT training string.

    The conversation is: system instructions, a user turn with the schema
    summary, a user turn with the question, and an assistant turn with the
    gold SQL terminated by exactly one semicolon.

    Args:
        nlq: Natural-language question.
        sql: Gold SQL; surrounding whitespace and any trailing semicolons are
            normalised away before the single terminator is appended.

    Returns:
        The conversation rendered by the tokenizer's chat template (untokenized).
    """
    # Strip whitespace before and after removing semicolons: a plain
    # rstrip(";") leaves "SELECT 1; " or "SELECT 1;\n" untouched and the
    # appended terminator would then produce "SELECT 1; ;".
    target_sql = sql.strip().rstrip(";").rstrip() + ";"
    messages = [
        {"role": "system", "content": SYSTEM_INSTRUCTIONS},
        {"role": "user", "content": "Schema:\n" + SCHEMA_SUMMARY},
        {"role": "user", "content": f"NLQ: {nlq}"},
        {"role": "assistant", "content": target_sql},
    ]
    return tok.apply_chat_template(messages, tokenize=False)


# One formatted string per training record, exposed under the "text" column
# that the trainer is configured to read.
train_texts = [format_example(r["nlq"], r["sql"]) for r in train_records]
train_ds = Dataset.from_dict({"text": train_texts})
print(train_ds)


## 7) Train Adapters


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from datetime import datetime, timezone
import json
from pathlib import Path

# Preset keys copied verbatim into the run card, in output order.
_RUN_CARD_KNOBS = (
    "train_batch_size",
    "grad_accum_steps",
    "learning_rate",
    "num_train_epochs",
    "warmup_ratio",
    "max_seq_length",
    "lora_r",
    "lora_alpha",
    "lora_dropout",
    "target_modules",
)

# Audit record of what is about to be trained and with which settings.
run_card = {
    "timestamp_utc": datetime.now(timezone.utc).isoformat(),
    "experiment_key": ACTIVE_EXPERIMENT,
    "experiment_label": EXPERIMENT_CONFIG["label"],
    "model_id": MODEL_ID,
    "output_dir": output_dir,
    **{knob: EXPERIMENT_CONFIG[knob] for knob in _RUN_CARD_KNOBS},
    "precision": "bf16" if use_bf16 else "fp16",
}
print("Training run card\n", json.dumps(run_card, indent=2))

# Exactly one of bf16/fp16 is switched on, following the GPU check cell.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=EXPERIMENT_CONFIG["num_train_epochs"],
    per_device_train_batch_size=EXPERIMENT_CONFIG["train_batch_size"],
    gradient_accumulation_steps=EXPERIMENT_CONFIG["grad_accum_steps"],
    learning_rate=EXPERIMENT_CONFIG["learning_rate"],
    warmup_ratio=EXPERIMENT_CONFIG["warmup_ratio"],
    optim="paged_adamw_8bit",
    bf16=use_bf16,
    fp16=(not use_bf16),
    logging_steps=10,
    save_steps=EXPERIMENT_CONFIG["save_steps"],
    save_total_limit=EXPERIMENT_CONFIG["save_total_limit"],
    report_to=[],
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tok,
    train_dataset=train_ds,
    dataset_text_field="text",
    max_seq_length=EXPERIMENT_CONFIG["max_seq_length"],
)

trainer.train()

# Persist the final adapter weights together with the tokenizer files.
trainer.model.save_pretrained(output_dir)
tok.save_pretrained(output_dir)

# Save run-card next to adapters for audit trail.
adapter_root = Path(output_dir)
adapter_root.mkdir(parents=True, exist_ok=True)
run_card_path = adapter_root / "run_card.json"
run_card_path.write_text(json.dumps(run_card, indent=2), encoding="utf-8")
print("Saved adapters to:", output_dir)


## 8) Eval Helpers


### Why This QLoRA Grid Design
This section makes the QLoRA evaluation policy explicit so that the results are defensible.

Design decisions to call out while presenting:
1. Same evaluation harness (`eval_run`) as baseline is reused to isolate tuning effects from scoring-code changes.
2. Function parameters include model and adapter context (`model_id`, `model_alias`, `adapter_dir`, `eval_model`) for reproducible provenance.
3. Guardrails check required globals before launching long runs, so missing setup fails fast.
4. Prompt/schema/exemplar knobs mirror baseline by design; method comparisons stay aligned.
5. Connector caching (`lru_cache`) is kept for TS replicas to reduce repeated connection overhead.
6. Prompt override + connector cleanup uses `try/finally` to prevent notebook-state drift after failures.
7. Canonical and model-family file copies are taken from the primary (first) seed only, which keeps downstream comparison paths stable.
8. Per-run and grouped CSV outputs are both written: one for auditability, one for summary analysis.


In [None]:
# Orchestration helpers for QLoRA evaluation sweeps.
# Core scoring/execution logic is imported from nl2sql.
# Why these imports are here:
# - re: alias sanitization + exemplar strategy regex checks
# - subprocess: git commit provenance for run artifacts
# - shutil/pathlib: deterministic artifact copy paths
# - lru_cache: TS engine reuse for speed and stability
# - pandas: per-run and grouped reporting tables
import re
import subprocess
import shutil
from datetime import datetime, timezone
from functools import lru_cache
from pathlib import Path

import pandas as pd
from nl2sql.eval import eval_run
from nl2sql.db import create_engine_with_connector
import nl2sql.prompting as prompting_mod

# Snapshot of the package's system prompt at cell-execution time.
DEFAULT_SYSTEM_INSTRUCTIONS = prompting_mod.SYSTEM_INSTRUCTIONS

# Stripped-down prompt for the "schema_only_minimal" ablation.
_SCHEMA_ONLY_MINIMAL_INSTRUCTIONS = """You are an expert data analyst writing MySQL queries.
Given the database schema and a natural language question, write a single SQL SELECT query.

Rules:
- Output ONLY SQL (no explanation, no markdown).
- Output exactly ONE statement, starting with SELECT.
- Use only tables/columns listed in the schema.
"""

# Everything in the default prompt before the "- Routing hints:" marker, or
# the whole prompt when the marker is absent.
_NO_ROUTING_HINTS_INSTRUCTIONS = DEFAULT_SYSTEM_INSTRUCTIONS.partition("- Routing hints:")[0].rstrip()

# Prompt variants used for controlled prompt-ablation runs.
PROMPT_VARIANTS = {
    "default": DEFAULT_SYSTEM_INSTRUCTIONS,
    "schema_only_minimal": _SCHEMA_ONLY_MINIMAL_INSTRUCTIONS,
    "no_routing_hints": _NO_ROUTING_HINTS_INSTRUCTIONS,
}


# Convert HF model IDs into filesystem-safe aliases.
def _model_alias_from_id(model_id: str) -> str:
    """Return a lowercase, underscore-separated alias for a model ID.

    Only the segment after the last "/" is used. Every run of characters
    outside [a-z0-9] collapses to one underscore, and leading/trailing
    underscores are removed. An empty or missing ID, or one that reduces to
    nothing, yields "model".
    """
    source = model_id if model_id else "model"
    _, _, last_segment = source.rpartition("/")
    slug = re.sub(r"[^a-z0-9]+", "_", last_segment.lower()).strip("_")
    if not slug:
        return "model"
    return slug


# Schema truncation variants for schema-ablation experiments.
def schema_variant_text(schema_text: str, variant: str) -> str:
    """Return the schema text for the requested ablation variant.

    "full" returns the text unchanged; "first_80_lines" and "first_40_lines"
    keep that many leading lines, re-joined with "\\n".

    Raises:
        ValueError: if *variant* is not one of the three known names.
    """
    all_lines = schema_text.splitlines()
    if variant == "full":
        return schema_text
    for known_variant, line_budget in (("first_80_lines", 80), ("first_40_lines", 40)):
        if variant == known_variant:
            return "\n".join(all_lines[:line_budget])
    raise ValueError(f"Unknown SCHEMA_VARIANT: {variant}")


# Exemplar-pool strategies for few-shot ablations.
def exemplar_pool_for_strategy(items: list[dict], strategy: str) -> list[dict]:
    """Select the few-shot exemplar pool for an ablation strategy.

    Strategies:
        "all":        every item (returned as a new list).
        "brief_sql":  items with the shortest SQL; keeps 40% of the items but
                      never fewer than 50.
        "join_heavy": items whose SQL contains the JOIN keyword.
        "agg_heavy":  items whose SQL calls SUM/AVG/COUNT/MIN/MAX.

    A filtered pool with fewer than 10 items is replaced by all items.

    Args:
        items: Candidate exemplars; each is a dict whose "sql" value is used
            for filtering (a missing key is treated as empty SQL).
        strategy: One of the strategy names above.

    Returns:
        A list of exemplar dicts (the dicts themselves are not copied).

    Raises:
        ValueError: if *strategy* is unknown.
    """
    if strategy == "all":
        return list(items)

    def _sql(x):
        return str(x.get("sql", "")).strip()

    def _is_join(sql: str) -> bool:
        # Word-boundary match so a JOIN surrounded by newlines or tabs
        # (multi-line SQL) is detected too, not only " join " with spaces.
        return bool(re.search(r"\bjoin\b", sql, re.IGNORECASE))

    def _is_agg(sql: str) -> bool:
        return bool(re.search(r"\b(sum|avg|count|min|max)\s*\(", sql.lower()))

    if strategy == "brief_sql":
        ranked = sorted(items, key=lambda x: len(_sql(x)))
        keep = max(50, int(0.4 * len(ranked)))
        pool = ranked[:keep]
    elif strategy == "join_heavy":
        pool = [x for x in items if _is_join(_sql(x))]
    elif strategy == "agg_heavy":
        pool = [x for x in items if _is_agg(_sql(x))]
    else:
        raise ValueError(f"Unknown EXEMPLAR_STRATEGY: {strategy}")

    # Too small a pool would starve few-shot sampling; fall back to everything.
    return pool if len(pool) >= 10 else list(items)


# Ensure output root exists before writing run artifacts.
# Executed when the cell runs (not inside run_qlora_grid); safe to re-run
# because existing directories are accepted (exist_ok=True).
Path("results/qlora").mkdir(parents=True, exist_ok=True)


# Main sweep runner: executes k/seed grid and writes JSON+CSV artifacts.
def run_qlora_grid(
    *,
    model_id: str,
    model_alias: str,
    adapter_dir: str,
    eval_model,
    k_values: list[int],
    seeds: list[int],
    run_tag: str,
    prompt_variant: str = "default",
    schema_variant: str = "full",
    exemplar_strategy: str = "all",
    limit: int | None = None,
    copy_canonical: bool = False,
    copy_model_family: bool = True,
    enable_ts_for_k: set[int] | None = None,
    ts_n: int = 10,
    ts_prefix: str = "classicmodels_ts",
    ts_max_rows: int = 500,
    max_new_tokens: int = 128,
):
    """
    Design rationale (QLoRA grid):
    1) Reuse the same `eval_run` scoring path as baseline.
       Alternative considered: custom QLoRA evaluator.
       Chosen approach: identical scoring code keeps method comparison fair.

    2) Keep model + adapter identity in metadata (`model_id`, `model_alias`, `adapter_dir`).
       This allows exact provenance for each JSON artifact.

    3) Validate required globals early (dataset, schema, engine, tokenizer, DB creds).
       This avoids wasting long GPU runs due to missing setup state.

    4) Keep prompt/schema/exemplar ablation knobs aligned with baseline.
       This isolates the effect of tuning from prompt-policy drift.

    5) Use deterministic state cleanup (`try/finally`) for prompt overrides and connectors.
       Notebook reruns remain safe after interrupts or runtime errors.

    6) Write both per-run and aggregated outputs for two audiences:
       debugging/audit trails and final reporting tables.
    """
    # Basic guardrails for reproducible runs.
    if not seeds:
        raise ValueError("Provide at least one seed")
    if prompt_variant not in PROMPT_VARIANTS:
        raise ValueError(f"Unknown PROMPT_VARIANT: {prompt_variant}")

    # Depends on setup cells that define data, model, tokenizer, and DB config.
    required = ["test_set", "SCHEMA_SUMMARY", "engine", "tok", "INSTANCE_CONNECTION_NAME", "DB_USER", "DB_PASS"]
    missing = [k for k in required if k not in globals()]
    if missing:
        raise RuntimeError(f"Missing globals from previous cells: {missing}")

    try:
        commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode().strip()
    except Exception:
        commit = "unknown"

    run_metadata_base = {
        "commit": commit,
        "model_id": model_id,
        "model_alias": model_alias,
        "method": "qlora",
        "adapter_dir": str(adapter_dir),
    }

    # Run directory is timestamped for traceability and collision safety.
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%SZ")
    run_dir = Path("results/qlora/runs") / f"{run_tag}_{ts}"
    run_dir.mkdir(parents=True, exist_ok=True)

    schema_used = schema_variant_text(SCHEMA_SUMMARY, schema_variant)
    exemplar_pool = exemplar_pool_for_strategy(test_set, exemplar_strategy)

    ts_enabled_k = set(enable_ts_for_k or set())
    ts_suite_db_names = (
        [f"{ts_prefix}_{i:02d}" for i in range(1, ts_n + 1)]
        if ts_enabled_k and ts_n > 0
        else None
    )

    # Lazy connector cache for optional TS evaluation databases.
    ts_connectors = {}

    @lru_cache(maxsize=32)
    def _make_engine_cached(db_name: str):
        eng, conn = create_engine_with_connector(
            instance_connection_name=INSTANCE_CONNECTION_NAME,
            user=DB_USER,
            password=DB_PASS,
            db_name=db_name,
        )
        ts_connectors[db_name] = conn
        return eng

    rows = []
    primary_seed = seeds[0]  # first seed is used for canonical/model-family copies

    # Prompt override is temporary and restored in finally.
    old_prompt = prompting_mod.SYSTEM_INSTRUCTIONS
    prompting_mod.SYSTEM_INSTRUCTIONS = PROMPT_VARIANTS[prompt_variant]

    try:
        for k in k_values:
            # Use all seeds for every k for clean repeated-run stats.
            for seed in seeds:
                save_path = run_dir / f"results_k{k}_seed{seed}.json"

                run_meta = dict(run_metadata_base)
                run_meta.update({
                    "run_tag": run_tag,
                    "k": k,
                    "seed": seed,
                    "prompt_variant": prompt_variant,
                    "schema_variant": schema_variant,
                    "exemplar_strategy": exemplar_strategy,
                    "exemplar_pool_size": len(exemplar_pool),
                    "ts_enabled": bool(k in ts_enabled_k),
                    "ts_for_k_values": sorted(ts_enabled_k),
                    "ts_n": ts_n if ts_suite_db_names else 0,
                })

                items = eval_run(
                    test_set=test_set,
                    exemplar_pool=exemplar_pool,
                    k=k,
                    limit=limit,
                    seed=seed,
                    engine=engine,
                    model=eval_model,
                    tokenizer=tok,
                    schema_summary=schema_used,
                    save_path=str(save_path),
                    run_metadata=run_meta,
                    ts_suite_db_names=ts_suite_db_names if k in ts_enabled_k else None,
                    ts_make_engine_fn=_make_engine_cached if k in ts_enabled_k else None,
                    ts_max_rows=ts_max_rows,
                    avoid_exemplar_leakage=True,
                    max_new_tokens=max_new_tokens,
                )

                # Aggregate per-run rates for quick table summaries.
                n = len(items)
                va = sum(int(x.va) for x in items) / max(n, 1)
                em = sum(int(x.em) for x in items) / max(n, 1)
                ex = sum(int(x.ex) for x in items) / max(n, 1)
                ts_values = [int(x.ts) for x in items if getattr(x, "ts", None) is not None]
                ts_rate = (sum(ts_values) / len(ts_values)) if ts_values else None

                rows.append({
                    "run_tag": run_tag,
                    "prompt_variant": prompt_variant,
                    "schema_variant": schema_variant,
                    "exemplar_strategy": exemplar_strategy,
                    "exemplar_pool_size": len(exemplar_pool),
                    "k": k,
                    "seed": seed,
                    "n": n,
                    "va_rate": va,
                    "em_rate": em,
                    "ex_rate": ex,
                    "ts_rate": ts_rate,
                    "ts_n": len(ts_values),
                    "json_path": str(save_path),
                })

                if seed == primary_seed and copy_canonical and k in {0, 3}:
                    target = (
                        Path("results/qlora/results_zero_shot_200.json")
                        if k == 0 else Path("results/qlora/results_few_shot_k3_200.json")
                    )
                    target.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(save_path, target)
                    print(f"Updated canonical file: {target}")

                if seed == primary_seed and copy_model_family:
                    model_target = Path("results/qlora/model_family") / f"{model_alias}_qlora_k{k}.json"
                    model_target.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(save_path, model_target)
                    print(f"Updated model-family file: {model_target}")
    finally:
        for conn in ts_connectors.values():
            try:
                conn.close()
            except Exception:
                pass
        prompting_mod.SYSTEM_INSTRUCTIONS = old_prompt

    # Save per-run table and per-k mean/std summary.
    df = pd.DataFrame(rows).sort_values(["k", "seed"]).reset_index(drop=True)
    df.to_csv(run_dir / "grid_summary.csv", index=False)

    agg = (
        df.groupby(["prompt_variant", "schema_variant", "exemplar_strategy", "k"], as_index=False)
        .agg(
            runs=("seed", "count"),
            va_mean=("va_rate", "mean"),
            va_std=("va_rate", "std"),
            em_mean=("em_rate", "mean"),
            em_std=("em_rate", "std"),
            ex_mean=("ex_rate", "mean"),
            ex_std=("ex_rate", "std"),
            ts_mean=("ts_rate", "mean"),
            ts_std=("ts_rate", "std"),
        )
    )
    agg.to_csv(run_dir / "grid_summary_by_k.csv", index=False)

    print("Saved grid run to:", run_dir)
    return df, agg, run_dir


# Optional Colab helper: copy completed run artifacts to Drive.


## 9) Load Adapter


In [None]:
import gc
import json
import re
from pathlib import Path

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Pin the eval target explicitly.
# This cell is eval-only; it does not retrain adapters.
# The adapter is referenced by a local path so loading is deterministic in Colab.
# Provenance: explicit model+adapter pinning avoids accidental cross-model adapter loads.
# NOTE(review): these values are independent of ACTIVE_EXPERIMENT, and the
# adapter path below differs from both "adapter_output_dir" values in
# EXPERIMENT_PRESETS - confirm it points at the adapter you intend to evaluate.
EVAL_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
EVAL_ADAPTER_DIR = "/content/NLtoSQL/results/adapters/qlora_classicmodels"


def resolve_adapter_dir(path_str: str) -> Path:
    """Locate the directory that actually contains `adapter_config.json`.

    Resolution order:
      1. `path_str` itself ("~" expanded, relative paths resolved against the
         current working directory) when it holds `adapter_config.json`.
      2. Otherwise the `checkpoint-*` sub-directory with the highest trailing
         number among those that hold `adapter_config.json`.

    Args:
        path_str: Adapter directory, or a training output directory that
            contains checkpoints.

    Returns:
        The resolved adapter directory.

    Raises:
        FileNotFoundError: no usable adapter directory was found.
    """
    p = Path(path_str).expanduser()
    if not p.is_absolute():
        p = (Path.cwd() / p).resolve()

    if (p / "adapter_config.json").exists():
        return p

    # Rank checkpoints by the last number in their name. Directories without
    # any digits (e.g. "checkpoint-best") cannot be ranked and are skipped
    # instead of crashing the sort key with an IndexError.
    numbered = []
    for candidate in p.glob("checkpoint-*"):
        if not (candidate / "adapter_config.json").exists():
            continue
        digit_runs = re.findall(r"\d+", candidate.name)
        if digit_runs:
            numbered.append((int(digit_runs[-1]), candidate))

    if numbered:
        return max(numbered, key=lambda pair: pair[0])[1]

    raise FileNotFoundError(f"No adapter_config.json found under: {p}")


# Imported here so this eval-only cell can rebuild the tokenizer and the
# quantization config when the training cells were skipped.
from transformers import AutoTokenizer, BitsAndBytesConfig

# From here on MODEL_ID refers to the pinned eval model.
MODEL_ID = EVAL_MODEL_ID
MODEL_ALIAS = _model_alias_from_id(MODEL_ID)
adapter_path = resolve_adapter_dir(EVAL_ADAPTER_DIR)
ADAPTER_DIR = str(adapter_path)

print("EVAL MODEL_ID:", MODEL_ID)
print("Resolved ADAPTER_DIR:", ADAPTER_DIR)

cfg = json.loads((adapter_path / "adapter_config.json").read_text(encoding="utf-8"))
adapter_base = cfg.get("base_model_name_or_path")
print("adapter base_model_name_or_path:", adapter_base)
# Surface cross-model loads instead of only printing the recorded base model.
# A warning rather than an error: the recorded value may be a local path.
if adapter_base and adapter_base != MODEL_ID:
    print("WARNING: adapter base model differs from EVAL_MODEL_ID:", adapter_base, "!=", MODEL_ID)

# Drop earlier model objects before loading the eval copy. `trainer` is
# included because it holds its own reference to the training model, which
# would otherwise keep that model's GPU memory alive.
for name in ["trainer", "eval_model", "eval_base", "model", "base_model"]:
    globals().pop(name, None)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# The evaluation helpers use the global `tok`; make sure it belongs to the
# pinned eval model (it may be missing, or left over from training a
# different model in this session).
if "tok" not in globals() or getattr(tok, "name_or_path", None) != MODEL_ID:
    tok = AutoTokenizer.from_pretrained(MODEL_ID, token=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    print("Loaded tokenizer for:", MODEL_ID)

# Reuse the training cell's precision/quantization settings when present;
# otherwise rebuild the same 4-bit NF4 configuration with an fp16 fallback.
eval_dtype = compute_dtype if "compute_dtype" in globals() else torch.float16
if "bnb_config" in globals():
    eval_bnb_config = bnb_config
else:
    eval_bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=eval_dtype,
    )

# Memory budgets passed to the loader via `max_memory`.
EVAL_GPU_MEM_GIB = 10
EVAL_CPU_MEM_GIB = 64

Path("/content/offload").mkdir(parents=True, exist_ok=True)

eval_base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=eval_bnb_config,
    device_map="auto",
    torch_dtype=eval_dtype,
    low_cpu_mem_usage=True,
    max_memory={0: f"{EVAL_GPU_MEM_GIB}GiB", "cpu": f"{EVAL_CPU_MEM_GIB}GiB"},
    offload_folder="/content/offload",
    token=True,
)

eval_model = PeftModel.from_pretrained(
    eval_base,
    ADAPTER_DIR,              # local resolved path
    is_trainable=False,
    device_map="auto",
    local_files_only=True,    # force local, no HF lookup
)

print("Loaded base + adapter from disk")

if torch.cuda.is_available():
    free_b, total_b = torch.cuda.mem_get_info()
    print("CUDA free/total GiB:", round(free_b / (1024**3), 2), "/", round(total_b / (1024**3), 2))

# Inference-only settings: eval mode, no gradient checkpointing, KV cache on,
# and greedy decoding with neutral sampling parameters.
eval_model.eval()
if hasattr(eval_model, "gradient_checkpointing_disable"):
    eval_model.gradient_checkpointing_disable()
if hasattr(eval_model.config, "use_cache"):
    eval_model.config.use_cache = True
if hasattr(eval_model, "generation_config"):
    eval_model.generation_config.do_sample = False
    eval_model.generation_config.temperature = 1.0
    eval_model.generation_config.top_p = 1.0
    eval_model.generation_config.top_k = 50


## 10) Eval Plan
Use the next cell to configure the evaluation sweep and preview the data flow:
1. Resolve `RUN_PLAN` into concrete `k`/seed settings.
2. Verify adapter, prompt, schema, and exemplar-pool choices.
3. Confirm TS settings before launching the grid run.


In [None]:
# ============================
# Evaluation plan selector
# ============================
# Edit this block for experiment selection.
# RUN_PLAN resolves to one concrete k/seed grid; use "custom" for one exact
# run when recovering from disconnects.
RUN_PLAN = "quick"  # quick | k5_core | seed_backfill | ts_k3 | custom
RUN_TAG_BASE = f"{MODEL_ALIAS}_qlora"

# Keep these ablation toggles fixed when comparing models.
# Provenance: k/seeds come from the project eval protocol (repeatability over
# one-off scores). Use quick only for smoke checks; use custom for targeted
# backfills after disconnects.
PROMPT_VARIANT = "default"
SCHEMA_VARIANT = "full"
EXEMPLAR_STRATEGY = "all"

COPY_MODEL_FAMILY = True
COPY_CANONICAL = False

TS_N = 10
TS_PREFIX = "classicmodels_ts"
TS_MAX_ROWS = 500
MAX_NEW_TOKENS = 128

CUSTOM_PLAN = {
    "k_values": [0, 3],
    "seeds": [7],
    "run_tag": f"{RUN_TAG_BASE}_custom",
    "enable_ts": False,
}

# Built-in plans: name -> (k values, seeds, run-tag suffix, TS enabled).
_BUILTIN_PLANS = {
    "quick": ([0, 3], [7], "main", False),
    "k5_core": ([5], [7], "k5_core", False),
    "seed_backfill": ([0, 3, 5], [17, 27], "seed_backfill", False),
    "ts_k3": ([3], [7], "ts_k3", True),
}

if RUN_PLAN in _BUILTIN_PLANS:
    K_VALUES, SEEDS, _tag_suffix, ENABLE_TS = _BUILTIN_PLANS[RUN_PLAN]
    RUN_TAG = f"{RUN_TAG_BASE}_{_tag_suffix}"
elif RUN_PLAN == "custom":
    # CUSTOM_PLAN is only read when it is actually selected.
    K_VALUES = list(CUSTOM_PLAN["k_values"])
    SEEDS = list(CUSTOM_PLAN["seeds"])
    RUN_TAG = str(CUSTOM_PLAN["run_tag"])
    ENABLE_TS = bool(CUSTOM_PLAN["enable_ts"])
else:
    raise ValueError(f"Unknown RUN_PLAN: {RUN_PLAN}")

TS_FOR_K_VALUES = [3]

evaluation_run_card = {
    "run_plan": RUN_PLAN,
    "run_tag": RUN_TAG,
    "model_id": MODEL_ID,
    "model_alias": MODEL_ALIAS,
    "adapter_dir": ADAPTER_DIR,
    "k_values": K_VALUES,
    "seeds": SEEDS,
    "prompt_variant": PROMPT_VARIANT,
    "schema_variant": SCHEMA_VARIANT,
    "exemplar_strategy": EXEMPLAR_STRATEGY,
    "enable_ts": ENABLE_TS,
    "max_new_tokens": MAX_NEW_TOKENS,
}
print("Evaluation run card:")
print(evaluation_run_card)

# Dry-run the schema/exemplar selection so a bad variant name surfaces here
# rather than inside the grid run.
preview_schema = schema_variant_text(SCHEMA_SUMMARY, SCHEMA_VARIANT)
preview_pool = exemplar_pool_for_strategy(test_set, EXEMPLAR_STRATEGY)
narrative_preview = {
    "adapter_exists": Path(ADAPTER_DIR).exists(),
    "schema_lines": len(preview_schema.splitlines()),
    "exemplar_pool_size": len(preview_pool),
    "grid_size": len(K_VALUES) * len(SEEDS),
}
print("Narrative preview:")
print(narrative_preview)


In [None]:
# Demo: End-to-end NLQ -> faulty SQL -> cleaned SQL (single cell)
from IPython.display import display, HTML
import pandas as pd
import html

from nl2sql.core.prompting import make_few_shot_messages
from nl2sql.agent.constraint_policy import build_constraints
from nl2sql.core.llm import debug_extract_first_select
from nl2sql.core.postprocess import debug_guarded_postprocess


# Small helper to print section headers in the notebook output.
def show_title(text):
    """Display *text* as an HTML-escaped <h3> heading in the cell output."""
    heading_markup = f"<h3 style='margin:12px 0 6px 0'>{html.escape(text)}</h3>"
    display(HTML(heading_markup))


# Small helper to render SQL/text in a boxed monospace block.
def show_pre(text, label=None):
    """Display *text* inside a bordered monospace box, with an optional bold label.

    Both the text (converted with str()) and the label are HTML-escaped.
    """
    fragments = [
        "<div style='border:1px solid #ddd;border-radius:8px;padding:10px 12px;margin:8px 0'>"
    ]
    if label:
        fragments.append(
            f"<div style='font-weight:600;margin-bottom:6px'>{html.escape(label)}</div>"
        )
    fragments.append(
        f"<pre style='white-space:pre-wrap;margin:0;font-family:ui-monospace, SFMono-Regular, Menlo, Consolas, monospace'>{html.escape(str(text))}</pre>"
    )
    fragments.append("</div>")
    display(HTML("".join(fragments)))


# Convert postprocess steps into a small readable table.
def steps_df(pp):
    """Build a DataFrame with one row per entry of pp["steps"].

    Columns: "changed" ("yes"/"no" from the step's truthy "changed" value),
    "stage", and "note" (empty string when the step has no note).
    """
    table_rows = []
    for step in pp["steps"]:
        table_rows.append(
            {
                "changed": "yes" if step["changed"] else "no",
                "stage": step["stage"],
                "note": step.get("note", ""),
            }
        )
    return pd.DataFrame(table_rows)


# Two demo questions: the first leaves the output columns implicit,
# the second names them explicitly.
DEMO_NLQ_IMPLICIT = "List all customer names in France"
DEMO_NLQ_EXPLICIT = "List contact last name, customer name, and customer number for customers in France"

# Prefer the notebook's SCHEMA_SUMMARY when it is a non-blank string;
# otherwise fall back to a one-table stub so the demo still runs.
if "SCHEMA_SUMMARY" in globals() and isinstance(SCHEMA_SUMMARY, str) and SCHEMA_SUMMARY.strip():
    schema_text = SCHEMA_SUMMARY
else:
    schema_text = "Table customers (customerNumber INT, customerName TEXT, contactLastName TEXT, country TEXT, creditLimit REAL)"

# Take the well-formed items among the first two entries of test_set
# as exemplars, when test_set is loaded as a list.
exemplars = []
if "test_set" in globals() and isinstance(test_set, list):
    exemplars = list(filter(
        lambda item: isinstance(item, dict) and "nlq" in item and "sql" in item,
        test_set[:2],
    ))

# Prompt messages and constraints for the implicit-field question.
messages = make_few_shot_messages(schema=schema_text, exemplars=exemplars, nlq=DEMO_NLQ_IMPLICIT)
constraints_implicit = build_constraints(DEMO_NLQ_IMPLICIT, schema_text)

# Step 1: one-row summary of the prompt context, then a preview of the
# first six messages (newlines flattened, content cut at 140 characters).
show_title("Step 1 - NLQ and prompt context")
display(pd.DataFrame([dict(
    nlq=DEMO_NLQ_IMPLICIT,
    schema_lines=len(schema_text.splitlines()),
    exemplars_used=len(exemplars),
    message_count=len(messages),
    explicit_fields=constraints_implicit.get("explicit_fields"),
)]))

display(pd.DataFrame([
    dict(
        role=msg.get("role"),
        content_preview=str(msg.get("content", "")).replace("\n", " ")[:140],
    )
    for msg in messages[:6]
]))

# Step 2: a deliberately noisy model output (prose before and after the SQL,
# plus extra columns, ORDER BY and LIMIT that the question did not ask for).
FAULTY_TEXT = """Model draft + noise:
select from the options above

SQL:
SELECT c.customerNumber, c.customerName, c.contactLastName, c.creditLimit
FROM customers c
WHERE c.country = 'France'
ORDER BY c.customerName DESC
LIMIT 5;

Extra explanation after SQL.
"""

show_title("Step 2 - Simulated faulty SQL draft")
show_pre(FAULTY_TEXT, "Faulty model output (simulated)")

# Step 3: run the extraction debugger; if it selects nothing, carry the raw
# text forward so the later steps still have an input.
show_title("Step 3 - Extraction debug")
extract_debug = debug_extract_first_select(FAULTY_TEXT)
selected_sql = extract_debug.get("selected_sql")
if not selected_sql:
    selected_sql = FAULTY_TEXT

display(pd.DataFrame([
    dict(
        candidate=idx,
        accepted=cand.get("accepted"),
        reject_reason=cand.get("reject_reason"),
        from_target=cand.get("from_target"),
        candidate_sql=cand.get("candidate_sql"),
    )
    for idx, cand in enumerate(extract_debug.get("candidates", []), start=1)
]))
show_pre(selected_sql, "Selected SQL candidate")

# Step 4A: postprocess against the implicit question. Explicit fields are
# only passed when the constraints flag an explicit projection.
show_title("Step 4A - Cleaning trace (implicit fields)")
if constraints_implicit.get("explicit_projection"):
    explicit_fields_a = constraints_implicit.get("explicit_fields")
else:
    explicit_fields_a = None
pp_a = debug_guarded_postprocess(
    selected_sql,
    DEMO_NLQ_IMPLICIT,
    explicit_fields=explicit_fields_a,
    required_fields=constraints_implicit.get("required_output_fields"),
)
display(steps_df(pp_a))
show_pre(pp_a["final_sql"], "Final cleaned SQL (implicit)")

# Step 4B: postprocess the same SQL against the explicit question, with the
# three requested fields supplied by hand.
show_title("Step 4B - Cleaning trace (explicit fields)")
pp_b = debug_guarded_postprocess(
    selected_sql,
    DEMO_NLQ_EXPLICIT,
    explicit_fields=["contactLastName", "customerName", "customerNumber"],
)
display(steps_df(pp_b))
show_pre(pp_b["final_sql"], "Final cleaned SQL (explicit)")


## 11) Run Plan
Run this only after the preview above looks correct. The execution cell below writes per-run JSON files plus grid-summary CSV files.


In [None]:
# Gather the sweep settings in one mapping, then launch the selected grid.
sweep_kwargs = dict(
    model_id=MODEL_ID,
    model_alias=MODEL_ALIAS,
    adapter_dir=ADAPTER_DIR,
    eval_model=eval_model,
    k_values=K_VALUES,
    seeds=SEEDS,
    run_tag=RUN_TAG,
    prompt_variant=PROMPT_VARIANT,
    schema_variant=SCHEMA_VARIANT,
    exemplar_strategy=EXEMPLAR_STRATEGY,
    limit=None,
    copy_canonical=COPY_CANONICAL,
    copy_model_family=COPY_MODEL_FAMILY,
    # TS is requested only for the listed k values, and only when ENABLE_TS is set.
    enable_ts_for_k=set(TS_FOR_K_VALUES) if ENABLE_TS else None,
    ts_n=TS_N,
    ts_prefix=TS_PREFIX,
    ts_max_rows=TS_MAX_ROWS,
    max_new_tokens=MAX_NEW_TOKENS,
)
qlora_grid, qlora_by_k, qlora_run_dir = run_qlora_grid(**sweep_kwargs)

# Report where the run was saved and show both result tables.
print("Saved run dir:", qlora_run_dir)
for table in (qlora_grid, qlora_by_k):
    display(table)


In [None]:
import json
from pathlib import Path

# Default comparison inputs: the fixed-name result files under results/.
baseline_zero = Path("results/baseline/results_zero_shot_200.json")
baseline_few = Path("results/baseline/results_few_shot_k3_200.json")

qlora_zero = Path("results/qlora/results_zero_shot_200.json")
qlora_few = Path("results/qlora/results_few_shot_k3_200.json")

# When a sweep has been run in this session, prefer its k=0 / k=3 seed-7
# outputs over the default QLoRA files whenever those outputs exist on disk.
if "qlora_run_dir" in globals():
    qdir = Path(qlora_run_dir)
    q0_run = qdir / "results_k0_seed7.json"
    q3_run = qdir / "results_k3_seed7.json"
    if q0_run.exists():
        qlora_zero = q0_run
    if q3_run.exists():
        qlora_few = q3_run


def _load_results(path):
    """Read one results JSON file (UTF-8) and return the parsed object."""
    return json.loads(path.read_text(encoding="utf-8"))


def _print_rates(label, result):
    """Print the VA / EM / EX rates of one result, rounded to 3 decimals.

    "va_rate" and "ex_rate" are required keys; "em_rate" defaults to 0.0
    when the file does not contain it.
    """
    print(
        label,
        "VA", round(result["va_rate"], 3),
        "EM", round(result.get("em_rate", 0.0), 3),
        "EX", round(result["ex_rate"], 3),
    )


# Collect every absent input so the failure message can name them.
missing_files = [
    path
    for path in (baseline_zero, baseline_few, qlora_zero, qlora_few)
    if not path.exists()
]

if not missing_files:
    # Load all four files first so a malformed file fails before any output.
    b0 = _load_results(baseline_zero)
    b3 = _load_results(baseline_few)
    q0 = _load_results(qlora_zero)
    q3 = _load_results(qlora_few)

    for label, result in (
        ("Baseline zero-shot:", b0),
        ("QLoRA   zero-shot:", q0),
        ("Baseline few-shot :", b3),
        ("QLoRA   few-shot  :", q3),
    ):
        _print_rates(label, result)
else:
    print("Missing files for comparison. Run baseline notebook and this QLoRA quick cell first.")
    for path in missing_files:
        print("  missing:", path)
