# ReAct Eval


## Summary
- Purpose: evaluate the tool-driven ReAct pipeline with traceable reasoning steps.
- Scope: supports both interactive single-question demos and full benchmark execution.
- Outputs: agent run artifacts and failure profiles under `results/agent/`.


## 0) Install


In [None]:

%%bash
# Pins the Python dependency stack used by this notebook: base libraries first,
# then Torch built for CUDA 12.1, then bitsandbytes/triton and the HF packages.
# `set -e` aborts the cell at the first failing command.
set -e
export PIP_DEFAULT_TIMEOUT=120

# Clean conflicting preinstalls
# (`|| true` keeps a failed uninstall from aborting the cell under `set -e`).
pip uninstall -y torch torchvision torchaudio bitsandbytes triton transformers accelerate peft trl datasets numpy pandas fsspec requests google-auth || true

# Base deps
pip install -q --no-cache-dir --force-reinstall   numpy==1.26.4 pandas==2.2.1 fsspec==2024.5.0 requests==2.31.0 google-auth==2.43.0

# Torch + CUDA 12.1 (wheels are pulled from the PyTorch cu121 index)
pip install -q --no-cache-dir --force-reinstall   torch==2.3.1+cu121 torchvision==0.18.1+cu121 torchaudio==2.3.1+cu121   --index-url https://download.pytorch.org/whl/cu121

# bitsandbytes + triton + HF stack
pip install -q --no-cache-dir --force-reinstall   bitsandbytes==0.43.3 triton==2.3.1   transformers==4.44.2 accelerate==0.33.0 peft==0.17.0 trl==0.9.6 datasets==2.20.0

echo "Setup complete. Restart runtime once, then run the rest of the notebook top-to-bottom."


## 1) Sync Repo


In [None]:
# 0) Clone repo (Colab) + install deps
# Outside Colab this cell only prints a notice and leaves the workspace untouched.
import os
try:
    # Colab detection: the import only succeeds inside a Colab runtime.
    import google.colab  # noqa: F401
    IN_COLAB = True
except Exception:
    IN_COLAB = False

if IN_COLAB:
    # Clone once; an existing checkout at /content/NLtoSQL is reused as-is.
    if not os.path.exists('/content/NLtoSQL'):
        !git clone https://github.com/MacKenzieOBrian/NLtoSQL.git /content/NLtoSQL
    %cd /content/NLtoSQL
    !pip -q install -r requirements.txt
    # Import smoke test: reports the torch version and whether CUDA is visible.
    import torch, transformers, accelerate, peft
    print('torch', torch.__version__, 'cuda', torch.cuda.is_available())
else:
    print('Not in Colab; using existing workspace')


## 2) Auth


In [None]:
# Run this only if you prefer gcloud-based ADC (no JSON key)
# Colab detection is repeated here so the cell also works when run on its own.
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except Exception:
    IN_COLAB = False

if IN_COLAB:
    %pip install -q --upgrade google-auth google-auth-oauthlib
    # Interactive login that sets up Application Default Credentials.
    !gcloud auth application-default login
else:
    print("Not in Colab; skip gcloud auth.")


## 3) DB Engine


In [None]:
# 1) Environment + DB
# Defines `engine` and `connector`, plus the DB_* / INSTANCE_CONNECTION_NAME
# globals that later cells (e.g. the TS engine factory) read.
import os
from getpass import getpass

from sqlalchemy import text

from nl2sql.db import create_engine_with_connector, safe_connection

# Expected env vars (set these in a Colab cell):
# INSTANCE_CONNECTION_NAME, DB_USER, DB_PASS, DB_NAME
INSTANCE_CONNECTION_NAME = os.getenv("INSTANCE_CONNECTION_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
# DB_NAME is the only setting with a default; unset or empty means "classicmodels".
DB_NAME = os.getenv("DB_NAME") or "classicmodels"

# Prompt for anything the environment did not provide.
# The password goes through getpass so it is not echoed.
if not INSTANCE_CONNECTION_NAME:
    INSTANCE_CONNECTION_NAME = input("Enter INSTANCE_CONNECTION_NAME: ").strip()
if not DB_USER:
    DB_USER = input("Enter DB_USER: ").strip()
if not DB_PASS:
    DB_PASS = getpass("Enter DB_PASS: ")

# Canonical engine builder (shared with scripts + other notebooks).
# Uses Cloud SQL Connector under the hood and ADC for credentials.
engine, connector = create_engine_with_connector(
    instance_connection_name=INSTANCE_CONNECTION_NAME,
    user=DB_USER,
    password=DB_PASS,
    db_name=DB_NAME,
)

# Connectivity smoke test: any failure surfaces here instead of mid-evaluation.
with safe_connection(engine) as conn:
    conn.execute(text("SELECT 1"))
print("DB connection OK")


## 4) TS Engines


In [None]:
# 1b) Engine factory for TS (multiple DB names)

import sqlalchemy
from sqlalchemy.engine import Engine


def make_engine(db_name: str) -> Engine:
    """Build a SQLAlchemy engine whose connections target ``db_name``.

    Used for TS (test-suite accuracy), which runs the same (gold, pred) SQL
    against several replica databases (classicmodels_ts_XX). Each replica gets
    its own engine so it is evaluated independently of the others.

    Connections are opened through the notebook-level ``connector`` with the
    notebook-level instance name and credentials; only the database name
    differs between engines.
    """

    def _open_connection():
        # Globals are looked up when a connection is opened, not when the
        # engine is built.
        credentials = {"user": DB_USER, "password": DB_PASS, "db": db_name}
        return connector.connect(INSTANCE_CONNECTION_NAME, "pymysql", **credentials)

    engine_options = {"creator": _open_connection, "future": True}
    return sqlalchemy.create_engine("mysql+pymysql://", **engine_options)


## 5) Schema + Dataset


In [None]:
# 2) Load schema summary + test set + QueryRunner
# Defines SCHEMA_SUMMARY, full_set, test_set and runner for later cells.
import json
from pathlib import Path
from nl2sql.schema import build_schema_summary
from nl2sql.query_runner import QueryRunner

SCHEMA_SUMMARY = build_schema_summary(engine, db_name=DB_NAME)
# Loose smoke check: it only confirms that "offices" and "city" each occur
# somewhere in the summary, not that `city` is listed under `offices`.
print("Schema contains offices.city:", "offices" in SCHEMA_SUMMARY.lower() and "city" in SCHEMA_SUMMARY.lower())

# Relative path: the working directory must be the repository root.
test_path = Path("data/classicmodels_test_200.json")
full_set = json.loads(test_path.read_text(encoding="utf-8"))

# Default target is full benchmark evaluation (200 items).
# test_set is the same list object as full_set (no copy is made).
test_set = full_set
print("Loaded test items:", len(test_set))

# Runner is used for local VA checks in demo sanity cells.
runner = QueryRunner(engine)


## 6) Load Model


In [None]:

# 3) Load model (base or QLoRA adapters)
# Defines `tok` and `model` (adapter-wrapped when an adapter dir exists locally).
import os
from getpass import getpass
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# An unset or empty ADAPTER_PATH env var falls back to the default adapter dir;
# a base-model run therefore needs this line edited, not just the env var cleared.
ADAPTER_PATH = os.getenv("ADAPTER_PATH") or "results/adapters/qlora_classicmodels"  # set to None to use base model

# Experiment knobs (change one axis at a time for comparable claims):
# - MODEL_ID: switch model family (Llama/Qwen/etc).
# - ADAPTER_PATH: set to local QLoRA adapter dir for tuned runs; set to None for base-model runs.
# Keep all later loop/TS settings unchanged when isolating model effects.

# Token lookup order: HF_TOKEN, then HUGGINGFACE_TOKEN, then an interactive prompt.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    HF_TOKEN = getpass("Enter HF_TOKEN (https://huggingface.co/settings/tokens): ").strip()

# Pick the compute dtype from the GPU's compute capability:
# major version >= 8 selects bfloat16, anything else (including no GPU) float16.
cc_major, cc_minor = torch.cuda.get_device_capability(0) if torch.cuda.is_available() else (0, 0)
use_bf16 = cc_major >= 8
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
print("Using bf16:", use_bf16)
print("Adapter path:", ADAPTER_PATH)

# Tokenizer
tok = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# Reuse EOS as the pad token when the tokenizer defines none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Quantized base model
# 4-bit NF4 weights with double quantization; matmuls run in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# With CUDA available the whole model is mapped to device 0; otherwise no
# device map is passed.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map={"": 0} if torch.cuda.is_available() else None,
    token=HF_TOKEN,
)
# Default generation settings: sampling off, temperature/top_p reset to 1.0.
base_model.generation_config.do_sample = False
base_model.generation_config.temperature = 1.0
base_model.generation_config.top_p = 1.0

# Load adapters if present locally; otherwise use base model
adapter_dir = Path(ADAPTER_PATH) if ADAPTER_PATH else None
if adapter_dir and adapter_dir.exists():
    model = PeftModel.from_pretrained(base_model, adapter_dir, token=HF_TOKEN)
    print("Loaded adapters from", adapter_dir)
else:
    # This branch also runs when ADAPTER_PATH was deliberately set to a falsy
    # value, so the "missing" wording covers intentional base-model runs too.
    print("Adapter path missing; using base model only. Set ADAPTER_PATH to your local adapter folder or upload it to Colab.")
    model = base_model


## 7) Pipeline Config


In [None]:
# --- ReAct module imports, context binding, and config ---
from nl2sql.agent_tools import AgentContext, set_agent_context
from nl2sql.react_pipeline import ReactAblationConfig, run_react_pipeline, evaluate_react_ablation
from nl2sql.prompts import REACT_SYSTEM_PROMPT

# Bind shared runtime context once.
# NOTE(review): the context uses max_new_tokens=128 while REACT_MAX_NEW_TOKENS
# below is 256; which limit applies is decided inside nl2sql — confirm.
set_agent_context(
    AgentContext(
        engine=engine,
        db_name=DB_NAME,
        model=model,
        tok=tok,
        runner=runner,
        max_new_tokens=128,
    )
)

# ReAct config knobs (edit for controlled ablations).
REACT_CONFIG_NAME = "react_core_notebook"
REACT_USE_SCHEMA_LINK = True
REACT_USE_CONSTRAINT_POLICY = True
REACT_USE_REPAIR_POLICY = True
REACT_USE_INTENT_GATE = False
REACT_MAX_REPAIRS = 1
REACT_LINK_MAX_TABLES = 6
REACT_MAX_STEPS = 8
REACT_MAX_NEW_TOKENS = 256
# NOTE(review): with REACT_DO_SAMPLE False, the temperature/top_p values below
# presumably have no effect on decoding — confirm in react_pipeline.
REACT_DO_SAMPLE = False
REACT_TEMPERATURE = 0.2
REACT_TOP_P = 0.9

# One-to-one mapping from the REACT_* knobs above onto the config fields.
REACT_CONFIG = ReactAblationConfig(
    name=REACT_CONFIG_NAME,
    use_schema_link=REACT_USE_SCHEMA_LINK,
    use_constraint_policy=REACT_USE_CONSTRAINT_POLICY,
    use_repair_policy=REACT_USE_REPAIR_POLICY,
    use_intent_gate=REACT_USE_INTENT_GATE,
    max_repairs=REACT_MAX_REPAIRS,
    link_max_tables=REACT_LINK_MAX_TABLES,
    max_steps=REACT_MAX_STEPS,
    max_new_tokens=REACT_MAX_NEW_TOKENS,
    do_sample=REACT_DO_SAMPLE,
    temperature=REACT_TEMPERATURE,
    top_p=REACT_TOP_P,
)


def summarize_trace_brief(trace: list[dict]) -> dict:
    """Condense a ReAct trace into a small dict for readable demo output.

    Returns a dict with:
      - "steps": number of trace entries;
      - "actions": every truthy "action" value, in trace order;
      - "blocked_steps": number of entries with a truthy "blocked" value;
      - "stop_reason": the "reason" of the first entry whose action is "stop",
        or None when no such entry exists (or that entry has no reason).
    """
    action_names = []
    blocked_total = 0
    stop_reason = None
    stop_seen = False

    for entry in trace:
        name = entry.get("action")
        if name:
            action_names.append(name)
        if entry.get("blocked"):
            blocked_total += 1
        # Only the first "stop" entry contributes a reason; later ones are ignored.
        if name == "stop" and not stop_seen:
            stop_seen = True
            stop_reason = entry.get("reason")

    return {
        "steps": len(trace),
        "actions": action_names,
        "blocked_steps": blocked_total,
        "stop_reason": stop_reason,
    }

# Record in the cell output which module supplies run_react_pipeline and the
# exact config object in use.
print("Using pipeline module:", run_react_pipeline.__module__)
print("ReAct config:", REACT_CONFIG)


## 9) Interactive Demo


In [None]:
# 12A) Interactive walkthrough (single NLQ trace)
# Runs one question through the ReAct pipeline and prints the final SQL plus a
# brief trace summary.
DEMO_INTERACTIVE = True
DEMO_DEFAULT_NLQ = "Which customers are in France?"

nlq = ""
if DEMO_INTERACTIVE:
    try:
        nlq = input("Type a ClassicModels question (blank uses default): ").strip()
    except Exception:
        # Any failure of input() is treated like a blank answer, so the
        # default question is used.
        nlq = ""
if not nlq:
    nlq = DEMO_DEFAULT_NLQ

pred_sql, trace = run_react_pipeline(nlq=nlq, config=REACT_CONFIG)
summary = summarize_trace_brief(trace)

print()
print("FINAL SQL:")
# A falsy prediction is shown as a placeholder rather than an empty line.
print(pred_sql or "(no prediction)")
print()
print("TRACE SUMMARY:")
print(summary)
if trace:
    print()
    print("LAST TRACE ENTRY:")
    print(trace[-1])


## 10) Sanity Check


In [None]:
# 12B) Quick sanity check on a small slice
# Runs the first five test items through the pipeline and prints prediction,
# gold SQL and a VA check for each.
from nl2sql.eval import execution_accuracy

DEBUG_EX = False  # set True for a quick EX check (slower)
DEBUG_TRACE = True

for sample in test_set[:5]:
    nlq = sample["nlq"]
    gold = sample["sql"]
    pred, trace = run_react_pipeline(nlq=nlq, config=REACT_CONFIG)

    print("NLQ:", nlq)
    print("PRED:", pred)
    print("GOLD:", gold)

    if pred:
        # VA: the predicted SQL is executed via the runner without capturing rows.
        meta = runner.run(pred, capture_df=False)
        print("VA:", int(meta.success), "ERR:", meta.error)
        if DEBUG_EX:
            ex_ok, pred_err, gold_err = execution_accuracy(engine=engine, pred_sql=pred, gold_sql=gold)
            print("EX:", int(ex_ok), "PRED_ERR:", pred_err, "GOLD_ERR:", gold_err)
    else:
        print("VA:", 0, "ERR:", "no_prediction")

    if DEBUG_TRACE and trace:
        print("TRACE:", summarize_trace_brief(trace))
        print("TRACE LAST:", trace[-1])
    else:
        # Reached when DEBUG_TRACE is off or the trace is empty.
        print("TRACE LEN:", len(trace))
    print("-" * 80)


## 11) Run Controls
The first cell in this section sets full-vs-quick mode and TS settings; the two cells after it offer optional single-item previews (a brief trace preview and a full debug report).
Use them to explain how `run_react_pipeline` behavior maps into final batch evaluation.


In [None]:
# === Run controls ===
# Default is full benchmark reporting.
QUICK_LIMIT = None   # set to 20 for quick checks, None for full 200
TS_N = 10            # set to 3 for faster debug, 10 for full TS
MAX_ROWS_TS = 500
TS_PREFIX = "classicmodels_ts"

# Rebuild REACT_CONFIG from the current REACT_* knobs, using the same
# knob-to-field mapping as when the config was first created.
REACT_CONFIG = ReactAblationConfig(
    name=REACT_CONFIG_NAME,
    use_schema_link=REACT_USE_SCHEMA_LINK,
    use_constraint_policy=REACT_USE_CONSTRAINT_POLICY,
    use_repair_policy=REACT_USE_REPAIR_POLICY,
    use_intent_gate=REACT_USE_INTENT_GATE,
    max_repairs=REACT_MAX_REPAIRS,
    link_max_tables=REACT_LINK_MAX_TABLES,
    max_steps=REACT_MAX_STEPS,
    max_new_tokens=REACT_MAX_NEW_TOKENS,
    do_sample=REACT_DO_SAMPLE,
    temperature=REACT_TEMPERATURE,
    top_p=REACT_TOP_P,
)

print("Active config:", REACT_CONFIG)
# The run-mode label is derived from QUICK_LIMIT only.
print("Run mode:", "full_200" if QUICK_LIMIT is None else f"quick_{QUICK_LIMIT}")
print("TS replicas:", TS_N)


In [None]:
# Demo 1: one-item ReAct trace preview
# Runs a single test item through the pipeline and prints its first and last
# trace steps.
TRACE_PREVIEW_INDEX = 0

if not test_set:
    print("test_set is empty; cannot run trace preview.")
else:
    # Clamp the requested index into [0, len(test_set) - 1].
    idx = max(0, min(TRACE_PREVIEW_INDEX, len(test_set) - 1))
    preview_item = test_set[idx]
    preview_pred, preview_trace = run_react_pipeline(nlq=preview_item["nlq"], config=REACT_CONFIG)

    print("Trace preview NLQ:", preview_item["nlq"])
    print("Pred SQL:", preview_pred or "(no prediction)")
    print("Trace steps:", len(preview_trace))
    if preview_trace:
        print("First step:", preview_trace[0])
        print("Last step:", preview_trace[-1])


In [None]:
# Demo 2: full single-item ReAct debug report (action-by-action + VA/EM/EX)
# NOTE(review): this imports from `nl2sql.agent.react_pipeline`, while the
# pipeline config cell imports `run_react_pipeline` from `nl2sql.react_pipeline`.
# Confirm both module paths exist; otherwise this cell fails with ImportError.
from nl2sql.agent.react_pipeline import debug_react_single_item

REACT_DEBUG_INDEX = 0

if not test_set:
    print("test_set is empty; cannot run single-item ReAct debug.")
else:
    # Clamp the requested index into [0, len(test_set) - 1].
    idx = max(0, min(REACT_DEBUG_INDEX, len(test_set) - 1))
    react_item = test_set[idx]
    react_debug = debug_react_single_item(
        item=react_item,
        engine=engine,
        config=REACT_CONFIG,
        allow_extra_columns_ex=False,
    )

    # Print a fixed subset of the debug report's keys as a compact summary.
    print("ReAct single-item debug summary:")
    print({
        "nlq": react_debug["nlq"],
        "pred_sql": react_debug["pred_sql"],
        "va": react_debug["va"],
        "em": react_debug["em"],
        "ex": react_debug["ex"],
        "trace_len": react_debug["trace_len"],
        "action_counts": react_debug["action_counts"],
        "error": react_debug["error"],
    })

    print("\nTrace walkthrough:")
    for step in react_debug["trace"]:
        action = step.get("action")
        obs = step.get("observation")
        blocked = step.get("blocked")
        print(f"step={step.get('step')} action={action} blocked={blocked}")
        # Dict observations are shown by key names only; anything else is
        # printed in full.
        if isinstance(obs, dict):
            print(" observation keys:", sorted(obs.keys()))
        else:
            print(" observation:", obs)


## 12) Eval Setup


In [None]:
# --- Full-eval helpers and setup ---
# Defines commit, RUN_TAG, RUN_TS, RUN_DIR and SUITE_DBS for the evaluation cell.
import json
import shutil
import subprocess
from datetime import datetime, timezone
from functools import lru_cache
from pathlib import Path
from sqlalchemy.engine import Engine

# Run metadata for reproducibility logs.
# Any failure to query git falls back to the literal "unknown".
try:
    commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode().strip()
except Exception:
    commit = "unknown"

# Each run gets its own directory, named from the config name and a UTC timestamp.
RUN_TAG = f"react_{REACT_CONFIG.name}"
RUN_TS = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%SZ")
RUN_DIR = Path("results/agent/runs") / f"{RUN_TAG}_{RUN_TS}"
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Replica names are "<TS_PREFIX>_01" .. "<TS_PREFIX>_<TS_N>" (two-digit, 1-based);
# a falsy or non-positive TS_N yields an empty list.
SUITE_DBS = [f"{TS_PREFIX}_{i:02d}" for i in range(1, TS_N + 1)] if TS_N and TS_N > 0 else []

@lru_cache(maxsize=32)
def make_engine_cached(db_name: str) -> Engine:
    """Return the engine for ``db_name``, building it with ``make_engine`` on first use.

    ``lru_cache`` keeps up to 32 results keyed by database name, so repeated
    requests for the same name return the same engine object.
    """
    return make_engine(db_name)

def make_engine_fn(db_name: str) -> Engine:
    """Plain-function wrapper that delegates to ``make_engine_cached``.

    This is the callable handed to ``evaluate_react_ablation`` as
    ``ts_make_engine_fn``.
    """
    return make_engine_cached(db_name)


## 13) Run Full Eval
This is the long-running evidence cell: it executes `evaluate_react_ablation`, saves JSON, and updates canonical artifacts for downstream comparison.


In [None]:
# --- Execute full evaluation loop ---
# Runs evaluate_react_ablation over test_set, prints headline rates, and (for
# full runs only) refreshes the canonical results file.
out_path = RUN_DIR / "results_react_eval.json"

# Metadata passed to the evaluator alongside the run settings.
run_metadata = {
    "commit": commit,
    "notebook": "03_agentic_eval.ipynb",
    "model_id": MODEL_ID,
    "adapter_path": ADAPTER_PATH,
    "config_name": REACT_CONFIG.name,
    "quick_limit": QUICK_LIMIT,
    "ts_n": TS_N,
}

# TS arguments are passed as None when SUITE_DBS is empty.
report = evaluate_react_ablation(
    test_set=test_set,
    engine=engine,
    config=REACT_CONFIG,
    limit=QUICK_LIMIT,
    ts_suite_db_names=SUITE_DBS if SUITE_DBS else None,
    ts_make_engine_fn=make_engine_fn if SUITE_DBS else None,
    ts_max_rows=MAX_ROWS_TS,
    progress_every=20,
    run_metadata=run_metadata,
    save_path=out_path,
)

# `results` (per-item records) is reused by the failure-profile cell.
results = report.get("items", [])
# Missing rates print as 0.0; a ts_rate of None prints as "NA".
print(
    "ReAct",
    "VA=", round(report.get("va_rate", 0.0), 3),
    "EM=", round(report.get("em_rate", 0.0), 3),
    "EX=", round(report.get("ex_rate", 0.0), 3),
    "TS=", "NA" if report.get("ts_rate") is None else round(report.get("ts_rate", 0.0), 3),
)
print("Saved report:", out_path)

# Canonical compatibility copy for downstream scripts (full run only).
if QUICK_LIMIT is None:
    canonical_path = Path("results/agent/results_react_200.json")
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(out_path, canonical_path)
    print("Updated canonical file:", canonical_path)
else:
    print("Quick run detected; canonical file not updated.")

# Optional: persist outputs to Drive (Colab-safe).
# NOTE(review): no cell visible in this notebook mounts Google Drive — confirm
# the path below is a mounted Drive folder before relying on the backup.
PERSIST_TO_DRIVE = True
DRIVE_PERSIST_ROOT = "/content/drive/MyDrive/nl2sql_persistent_runs"

def persist_agent_run_to_drive(
    out_path: Path,
    run_tag: str = "react_eval",
    persist_root: str = DRIVE_PERSIST_ROOT,
):
    """Copy run artifacts into a fresh timestamped folder under ``persist_root``.

    Args:
        out_path: Report file to back up; skipped silently if it does not exist.
        run_tag: Prefix of the backup folder name.
        persist_root: Directory that holds all backups; created if absent.

    Returns:
        Path of the backup folder, named ``<run_tag>_<UTC timestamp>``.

    Besides ``out_path``, ``results/agent/ex_failure_profile.json`` is copied
    when present, and a ``backup_manifest.txt`` of ``key=value`` lines records
    the run settings read from the notebook globals.

    NOTE(review): ``mkdir(parents=True)`` creates ``persist_root`` wherever the
    path resolves, so if Drive is not mounted the "backup" would land on local
    disk — confirm a mount happens before this is called.
    NOTE(review): in this notebook the failure profile is written by the later
    "Failure Profile" cell, so on a top-to-bottom run the copy here picks up
    whatever profile is already on disk (possibly from an earlier run).
    """
    backup_root = Path(persist_root)
    backup_root.mkdir(parents=True, exist_ok=True)

    utc_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%SZ")
    dst = backup_root / f"{run_tag}_{utc_stamp}"
    dst.mkdir(parents=True, exist_ok=True)

    # Both artifacts are optional: copy each one only if it exists.
    for artifact in (out_path, Path("results/agent/ex_failure_profile.json")):
        if artifact.exists():
            shutil.copy2(artifact, dst / artifact.name)

    # Insertion order of this dict is the line order of the manifest.
    settings = {
        "out_path": out_path,
        "run_tag": run_tag,
        "quick_limit": QUICK_LIMIT,
        "ts_n": TS_N,
        "config_name": REACT_CONFIG.name,
        "use_schema_link": REACT_CONFIG.use_schema_link,
        "use_constraint_policy": REACT_CONFIG.use_constraint_policy,
        "use_repair_policy": REACT_CONFIG.use_repair_policy,
    }
    manifest_text = "\n".join(f"{key}={value}" for key, value in settings.items())
    (dst / "backup_manifest.txt").write_text(manifest_text, encoding="utf-8")
    return dst

# Best-effort backup: a failure is reported but never aborts the notebook.
if PERSIST_TO_DRIVE:
    try:
        backup_dir = persist_agent_run_to_drive(out_path=out_path, run_tag=RUN_TAG)
        print("Persistent backup saved to:", backup_dir)
    except Exception as e:
        print("Drive backup skipped/failed:", e)


## 14) Failure Profile


In [None]:
import json
from pathlib import Path
from collections import Counter

# 15B) EX failure profiling (quick categories)
def categorize_ex_failure(item: dict) -> str:
    """Assign one evaluated item to a coarse outcome category.

    Args:
        item: Per-item result record. Reads the keys "pred_sql", "va", "ex"
            and "error"; missing keys and explicit ``None`` values are both
            treated as empty / 0.

    Returns:
        One of "repair_budget_exhausted", "no_prediction", "guardrail_reject",
        "validate_sql_failed", "invalid_sql", "correct", "constraint_mismatch",
        "intent_mismatch" or "semantic_mismatch". Checks run in that order, so
        the first matching category wins.
    """
    pred = item.get("pred_sql")
    # `or 0` (rather than a .get default) also covers keys that are present
    # with a None value, which int() would otherwise reject with TypeError.
    va = int(item.get("va") or 0)
    ex = int(item.get("ex") or 0)
    # Error text is matched case-insensitively by substring.
    err = str(item.get("error") or "").lower()

    # No SQL was produced at all.
    if not pred:
        if "repair_budget_exhausted" in err:
            return "repair_budget_exhausted"
        return "no_prediction"

    # SQL was produced but is not valid.
    if va == 0:
        if "guardrail_reject" in err:
            return "guardrail_reject"
        if "validate_sql" in err:
            return "validate_sql_failed"
        return "invalid_sql"

    if ex == 1:
        return "correct"

    # Valid SQL with a wrong result: refine by error text where possible.
    if "validate_constraints" in err:
        return "constraint_mismatch"
    if "intent_mismatch" in err:
        return "intent_mismatch"
    return "semantic_mismatch"

# Tally categories over the per-item results from the evaluation cell.
# The tally includes the "correct" category, so it covers all items, not only
# failures.
counts = Counter(categorize_ex_failure(r) for r in results)
print("EX failure categories:")
for k, v in counts.most_common():
    print(f"  {k}: {v}")

# Persist the tally together with the run settings it was produced under.
# The file path is fixed, so each run overwrites the previous profile.
profile_path = Path("results/agent/ex_failure_profile.json")
profile_path.parent.mkdir(parents=True, exist_ok=True)
profile = {
    "counts": dict(counts),
    "n_items": len(results),
    "quick_limit": QUICK_LIMIT,
    "ts_n": TS_N,
    "config_name": REACT_CONFIG.name,
}
profile_path.write_text(json.dumps(profile, indent=2), encoding="utf-8")
print("Saved failure profile:", profile_path)
