# Agentic Evaluation (ReAct-style)

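This notebook adds a minimal ReAct-style loop for NL→SQL. It reuses the same benchmark (`data/classicmodels_test_200.json`) and the same metrics — VA (the predicted SQL executes without error), EX (execution accuracy against the gold query), and EM (exact string match after light normalization); TS is planned — to measure gains over the prompt-only and QLoRA runs.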

Plan (step-by-step):
1) Environment + DB connection
2) Load schema summary + test set
3) Load model (base or QLoRA adapters)
4) Define ReAct prompt + loop (Thought → Action → Observation → Refinement)
5) Run evaluation (VA/EX/EM) and save to `results/agent/…`


In [None]:
# 0) Optional: install deps (Colab)
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except Exception:
    IN_COLAB = False

if IN_COLAB:
    !pip -q install -r requirements.txt
    import torch, transformers, accelerate, peft
    print('torch', torch.__version__, 'cuda', torch.cuda.is_available())
else:
    print('Not in Colab; ensure requirements are installed.')


In [None]:
# 1) Environment + DB
import os
from getpass import getpass
from pathlib import Path

from google.cloud.sql.connector import Connector
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

# Connection settings are read from the environment; any value that is unset
# or empty is requested interactively (the password through getpass).
INSTANCE_CONNECTION_NAME = (
    os.getenv("INSTANCE_CONNECTION_NAME")
    or input("Enter INSTANCE_CONNECTION_NAME: ").strip()
)
DB_USER = os.getenv("DB_USER") or input("Enter DB_USER: ").strip()
DB_PASS = os.getenv("DB_PASS") or getpass("Enter DB_PASS: ")
# The database name falls back to a fixed default instead of prompting.
DB_NAME = os.getenv("DB_NAME") or "classicmodels"

# Single connector instance shared by every call to getconn().
connector = Connector()


def getconn():
    """Open a new pymysql connection to the configured instance via the connector."""
    credentials = {"user": DB_USER, "password": DB_PASS, "db": DB_NAME}
    return connector.connect(INSTANCE_CONNECTION_NAME, "pymysql", **credentials)

# SQLAlchemy engine whose DBAPI connections are all produced by getconn(),
# so the URL only needs to name the dialect and driver.
engine: Engine = create_engine(
    "mysql+pymysql://",
    creator=getconn,
    future=True,
)

# Connectivity smoke test. With future=True (2.0-style API) Connection.execute()
# does not accept a plain string, so the statement is wrapped in text().
from sqlalchemy import text

with engine.connect() as conn:
    conn.execute(text("SELECT 1"))
print("DB connection OK")


In [None]:
# 2) Load schema summary + test set
import json
from nl2sql.schema import build_schema_summary

# Schema summary for the connected database; passed to the prompt builder later.
SCHEMA_SUMMARY = build_schema_summary(engine, db_name=DB_NAME)

# Benchmark items are read from a UTF-8 JSON file.
test_path = Path("data/classicmodels_test_200.json")
with test_path.open(encoding="utf-8") as fh:
    test_set = json.load(fh)
print("Test items:", len(test_set))


In [None]:
# 3) Load model (base or QLoRA adapters)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
ADAPTER_PATH = "results/adapters/qlora_classicmodels"  # set to None to use base model

# Query device 0 once; without CUDA the capability defaults to (0, 0).
if torch.cuda.is_available():
    cc_major, cc_minor = torch.cuda.get_device_capability(0)
    _device_label = torch.cuda.get_device_name(0)
else:
    cc_major, cc_minor = 0, 0
    _device_label = "CPU"

# bf16 is selected when the major compute capability is 8 or higher; fp16 otherwise.
use_bf16 = cc_major >= 8
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
print("GPU:", _device_label)
print("Using bf16:", use_bf16)

# Tokenizer for MODEL_ID; the EOS token doubles as padding when no pad token is defined.
tok = AutoTokenizer.from_pretrained(MODEL_ID, token=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# 4-bit NF4 quantization with double quantization; compute runs in compute_dtype.
_quant_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": compute_dtype,
}
bnb_config = BitsAndBytesConfig(**_quant_options)

# Place the whole model on GPU 0 when CUDA is available; otherwise pass no device map.
_device_map = {"": 0} if torch.cuda.is_available() else None
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map=_device_map,
    token=True,
)

# Sampling is turned off; temperature and top_p are set to 1.0.
_gen_cfg = base_model.generation_config
_gen_cfg.do_sample = False
_gen_cfg.temperature = 1.0
_gen_cfg.top_p = 1.0

# Without an adapter path the base model is used as-is; otherwise the adapters
# at ADAPTER_PATH are loaded on top of it.
if not ADAPTER_PATH:
    model = base_model
    print("Using base model only")
else:
    model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    print("Loaded adapters from", ADAPTER_PATH)

# Switch to evaluation mode before any generation.
model.eval()


In [None]:
# 4) ReAct helpers: prompt builder and loop
import re
from nl2sql.query_runner import QueryRunner

# Query runner bound to the engine; its run() method executes the SQL produced in this notebook.
runner = QueryRunner(engine)

def build_react_prompt(nlq: str, schema_text: str, history: list, observation: str) -> str:
    """Assemble the agent prompt for one step of the loop.

    Args:
        nlq: Natural-language question to answer.
        schema_text: Schema description inserted under "Schema:".
        history: Earlier steps, each a dict with keys "ta" (model output) and
            "obs" (resulting observation); rendered under "Previous trace:".
        observation: Most recent observation, placed after the trace.

    Returns:
        The prompt text, beginning and ending with a newline.
    """
    trace_lines = []
    for entry in history:
        trace_lines.append(f"Thought/Action: {entry['ta']}")
        trace_lines.append(f"Observation: {entry['obs']}")

    # The empty first and last entries give the leading and trailing newline.
    prompt_lines = [
        "",
        "You are an expert SQL agent. Follow the steps:",
        "1) Think briefly about the question.",
        "2) Propose a single SQL query.",
        "3) If previous Observation reports an ERROR, fix it.",
        "",
        "Schema:",
        f"{schema_text}",
        "",
        f"Question: {nlq}",
        "",
        "Previous trace:",
        "\n".join(trace_lines),
        f"Observation: {observation}",
        "",
        "Now reply with SQL only.",
        "",
    ]
    return "\n".join(prompt_lines)

# Optional Markdown code fence (``` / ```sql / ```mysql) around the answer.
_SQL_FENCE_RE = re.compile(r"```(?:mysql|sql)?\s*([\s\S]*?)```", re.IGNORECASE)
# A statement starts at a standalone SELECT keyword or at a CTE header
# ("WITH name AS ("); the word boundary keeps words like "selected" from matching.
_SQL_START_RE = re.compile(
    r"\b(?:SELECT\b|WITH\s+(?:RECURSIVE\s+)?`?\w+`?\s*(?:\([^)]*\)\s*)?AS\s*\()",
    re.IGNORECASE,
)
# Statement body: everything up to the first ';' that is not inside a quoted
# literal or a backtick-quoted identifier.
_SQL_BODY_RE = re.compile(
    r"(?:'(?:''|\\.|[^'\\])*'"   # single-quoted literal
    r'|"(?:""|\\.|[^"\\])*"'     # double-quoted literal
    r"|`[^`]*`"                  # backtick-quoted identifier
    r"|[^;'\"`])*",              # any other non-terminator character
    re.DOTALL,
)


def extract_sql(text: str) -> str:
    """Pull the first SQL statement out of free-form model output.

    Fenced code blocks are searched first, then the raw text. The statement
    runs from the first SELECT (or CTE header) to the first semicolon outside
    quotes; if no semicolon follows, it runs to the end of the searched text.

    Args:
        text: Raw model output, possibly containing prose and code fences.

    Returns:
        The statement, ending in ";" only if the source had a terminating
        semicolon. When no statement start is found, the stripped input is
        returned unchanged.
    """
    candidates = [fence.group(1) for fence in _SQL_FENCE_RE.finditer(text)]
    candidates.append(text)
    for candidate in candidates:
        start = _SQL_START_RE.search(candidate)
        if start is None:
            continue
        body = _SQL_BODY_RE.match(candidate, start.start())
        statement = body.group(0)
        terminated = candidate[body.end():body.end() + 1] == ";"
        return statement + ";" if terminated else statement.rstrip()
    return text.strip()

def react_sql(nlq: str, schema_text: str, max_steps: int = 3):
    """Generate SQL for a question, retrying with the error as feedback.

    Each step builds a prompt from the question, schema and trace so far,
    generates a reply, extracts a SQL statement and executes a probe of it.
    Any exception raised by ``runner.run`` counts as a failed attempt and its
    message becomes the next observation. The loop stops at the first probe
    that runs without raising, or after ``max_steps`` attempts.

    Args:
        nlq: Natural-language question.
        schema_text: Schema description passed to the prompt builder.
        max_steps: Maximum number of generate/execute attempts.

    Returns:
        A ``(sql, history)`` tuple: the last extracted SQL (the successful one
        if any attempt succeeded, ``""`` if no attempt was made) and the list
        of ``{"ta": generated_text, "obs": observation}`` steps, including the
        final successful step.
    """
    history = []
    observation = "Start."
    sql = ""  # keeps the return value defined when max_steps <= 0
    for _ in range(max_steps):
        prompt = build_react_prompt(nlq, schema_text, history, observation)
        inputs = tok(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=256)
        # generate() returns prompt + continuation for a causal LM. Decode only
        # the new tokens so the extractor never picks up SQL-looking text from
        # the prompt or earlier trace, and the stored trace stays small.
        prompt_len = inputs["input_ids"].shape[1]
        gen = tok.decode(out[0][prompt_len:], skip_special_tokens=True)
        sql = extract_sql(gen)

        # Probe with a row limit. The trailing ';' is dropped first, since
        # "...; LIMIT 1" is not valid SQL; the limit is added only when the
        # statement has no LIMIT keyword of its own.
        probe = sql.strip().rstrip(";").rstrip()
        if not re.search(r"\blimit\b", probe, re.IGNORECASE):
            probe += " LIMIT 1"
        try:
            runner.run(probe)
        except Exception as e:
            observation = f"ERROR: {e}"
            history.append({"ta": gen, "obs": observation})
            continue
        observation = "SUCCESS"
        history.append({"ta": gen, "obs": observation})
        break
    return sql, history


In [None]:
# 5) Evaluation loop (VA/EX/EM). TS is planned.
from nl2sql.eval import execution_accuracy

# Score every benchmark item:
#   VA - the predicted SQL executes without raising,
#   EX - execution_accuracy(pred, gold), computed only when VA holds,
#   EM - case-insensitive string equality after trimming whitespace and a trailing ';'.
results = []
for i, sample in enumerate(test_set, start=1):
    nlq = sample["nlq"]
    gold_sql = sample["sql"]
    pred_sql, trace = react_sql(nlq, SCHEMA_SUMMARY, max_steps=3)
    em = int(pred_sql.strip().rstrip(";").lower() == gold_sql.strip().rstrip(";").lower())
    va = 0
    ex = 0
    error = None
    try:
        runner.run(pred_sql)
    except Exception as exc:
        # The prediction does not run; keep the message for later inspection.
        error = f"execution failed: {exc}"
    else:
        va = 1
        # Scored separately so a failure in the comparison cannot reset VA
        # for a query that did execute.
        try:
            ex = int(execution_accuracy(engine, pred_sql, gold_sql))
        except Exception as exc:
            error = f"EX comparison failed: {exc}"
    results.append({
        "nlq": nlq,
        "gold_sql": gold_sql,
        "pred_sql": pred_sql,
        "va": va,
        "em": em,
        "ex": ex,
        "trace": trace,
        "error": error,
    })
    if i % 20 == 0:
        print(f"Processed {i}/{len(test_set)}")

# Mean of each 0/1 metric over all items; an empty result list gives 0.0
# instead of raising ZeroDivisionError.
_n_items = len(results)
va_rate = sum(r["va"] for r in results) / _n_items if _n_items else 0.0
ex_rate = sum(r["ex"] for r in results) / _n_items if _n_items else 0.0
em_rate = sum(r["em"] for r in results) / _n_items if _n_items else 0.0
print("VA:", va_rate, "EX:", ex_rate, "EM:", em_rate)

# Write the aggregate rates and per-item records as pretty-printed UTF-8 JSON,
# creating the output directory first if needed.
save_path = Path("results/agent/results_react_200.json")
save_path.parent.mkdir(parents=True, exist_ok=True)
_report = {
    "va_rate": va_rate,
    "ex_rate": ex_rate,
    "em_rate": em_rate,
    "items": results,
}
save_path.write_text(
    json.dumps(_report, ensure_ascii=False, indent=2),
    encoding="utf-8",
)
print("Saved to", save_path)
