# SETUP AND IMPORTS

In [1]:
import json
import csv
import time
import sqlite3
import os
from pathlib import Path

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
import sqlglot
from sqlglot import parse_one

# Check environment
print("=" * 60)
print("ENVIRONMENT CHECK")
print("=" * 60)
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {__import__('transformers').__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    DEVICE = "cuda"
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    DEVICE = "cpu"
    print("WARNING: No GPU detected! Training will be very slow.")

print(f"Device selected: {DEVICE}")
print("=" * 60)

ENVIRONMENT CHECK
PyTorch version: 2.8.0+cu126
Transformers version: 4.57.1
CUDA available: True
CUDA device: NVIDIA A100-SXM4-80GB
GPU Memory: 79.32 GB
Device selected: cuda


# FILE PATHS & CONFIGURATION

In [2]:

TRAIN_JSONL = Path("Data_for_demo.jsonl")  # The file we created earlier

# For now, we'll create a validation split from the training data
# We don't need separate test data since this is just for demo fine-tuning

# Output directories
OUTPUT_DIR = Path("finetuned_demo_model")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("\nFILE PATHS:")
print("-" * 60)
print(f"Training data:      {TRAIN_JSONL}")
print(f"Output directory:   {OUTPUT_DIR}")
print("-" * 60)

# Verify file exists
print("\nFILE VERIFICATION:")
print("-" * 60)

if TRAIN_JSONL.exists():
    print(f"Training data: ‚úÖ EXISTS")

    # Count examples
    with open(TRAIN_JSONL, 'r') as f:
        num_examples = sum(1 for line in f)
    print(f"Total examples: {num_examples}")
else:
    print(f"Training data: ‚ùå MISSING")
    print("\n‚ö†Ô∏è WARNING: Please upload train_combined_demo.jsonl before proceeding!")

print("-" * 60)


FILE PATHS:
------------------------------------------------------------
Training data:      Data_for_demo.jsonl
Output directory:   finetuned_demo_model
------------------------------------------------------------

FILE VERIFICATION:
------------------------------------------------------------
Training data: ‚úÖ EXISTS
Total examples: 406
------------------------------------------------------------


# MODEL CONFIGURATION

In [3]:
print("=" * 60)
print("CONFIGURATION PARAMETERS")
print("=" * 60)

# ============================================================
# Model Selection
# ============================================================

# Using BASE Flan-T5 for full parameter fine-tuning
BASE_MODEL_NAME = "juierror/flan-t5-text2sql-with-schema-v2"  # ~250M parameters

print("\nModel Selection:")
print("-" * 60)
print(f"Base model: {BASE_MODEL_NAME}")
print(f"Fine-tuning: Full parameter (all weights trainable)")

# ============================================================
# Training Hyperparameters
# ============================================================

NUM_EPOCHS = 10  # More epochs since we have focused data
BATCH_SIZE = 8   # Adjust based on GPU (use 4 if you get OOM)
LEARNING_RATE = 3e-5  # Slightly higher for full fine-tuning
WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 25
SAVE_STEPS = 200
EVAL_STEPS = 200

print("\nTraining Hyperparameters:")
print("-" * 60)
print(f"Epochs:              {NUM_EPOCHS}")
print(f"Batch size:          {BATCH_SIZE}")
print(f"Learning rate:       {LEARNING_RATE}")
print(f"Warmup steps:        {WARMUP_STEPS}")
print(f"Weight decay:        {WEIGHT_DECAY}")
print(f"Logging steps:       {LOGGING_STEPS}")
print(f"Save steps:          {SAVE_STEPS}")
print(f"Eval steps:          {EVAL_STEPS}")

# ============================================================
# Generation Parameters
# ============================================================

MAX_INPUT_LENGTH = 512   # Max tokens for input (question + schema)
MAX_TARGET_LENGTH = 256  # Max tokens for output (SQL)

GEN_MAX_LENGTH = 256
GEN_NUM_BEAMS = 4
GEN_TEMPERATURE = 0.0  # Greedy decoding

print("\nGeneration Parameters:")
print("-" * 60)
print(f"Max input length:    {MAX_INPUT_LENGTH}")
print(f"Max target length:   {MAX_TARGET_LENGTH}")
print(f"Generation max len:  {GEN_MAX_LENGTH}")
print(f"Num beams:           {GEN_NUM_BEAMS}")
print(f"Temperature:         {GEN_TEMPERATURE}")

# ============================================================
# Prompt Template
# ============================================================

PROMPT_TEMPLATE = """Question: {question}

Schema:
{schema}

SQL:"""

print("\nPrompt Template:")
print("-" * 60)
print(PROMPT_TEMPLATE.format(
    question="<question here>",
    schema="<schema here>"
))

# ============================================================
# Other Settings
# ============================================================

SEED = 42
FP16 = True if DEVICE == "cuda" else False

print("\nOther Settings:")
print("-" * 60)
print(f"Random seed:         {SEED}")
print(f"FP16 (mixed prec):   {FP16}")

# Set random seed
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("\n‚úÖ Configuration complete!")
print("=" * 60)

CONFIGURATION PARAMETERS

Model Selection:
------------------------------------------------------------
Base model: juierror/flan-t5-text2sql-with-schema-v2
Fine-tuning: Full parameter (all weights trainable)

Training Hyperparameters:
------------------------------------------------------------
Epochs:              10
Batch size:          8
Learning rate:       3e-05
Warmup steps:        100
Weight decay:        0.01
Logging steps:       25
Save steps:          200
Eval steps:          200

Generation Parameters:
------------------------------------------------------------
Max input length:    512
Max target length:   256
Generation max len:  256
Num beams:           4
Temperature:         0.0

Prompt Template:
------------------------------------------------------------
Question: <question here>

Schema:
<schema here>

SQL:

Other Settings:
------------------------------------------------------------
Random seed:         42
FP16 (mixed prec):   True

‚úÖ Configuration complete!


# LOADING MODEL & TOKENIZER

In [4]:
# ============================================================
# STEP 4: LOADING MODEL & TOKENIZER
# ============================================================

print("=" * 60)
print("LOADING MODEL & TOKENIZER")
print("=" * 60)

# ============================================================
# Load Tokenizer
# ============================================================

print("\nLoading tokenizer...")
print("-" * 60)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

print(f"‚úÖ Tokenizer loaded: {BASE_MODEL_NAME}")
print(f"   Vocab size: {len(tokenizer)}")
print(f"   Model max length: {tokenizer.model_max_length}")

# Test tokenization
test_text = "SELECT * FROM Department WHERE DepartmentID = 1;"
test_tokens = tokenizer(test_text, return_tensors="pt")

print(f"\nTokenization test:")
print(f"   Input: {test_text}")
print(f"   Token IDs shape: {test_tokens['input_ids'].shape}")
print(f"   Number of tokens: {test_tokens['input_ids'].shape[1]}")

# ============================================================
# Load Model
# ============================================================

print("\n" + "-" * 60)
print("Loading model (this may take a minute)...")
print("-" * 60)

model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)

# Move to device
model = model.to(DEVICE)

print(f"‚úÖ Model loaded: {BASE_MODEL_NAME}")
print(f"   Device: {DEVICE}")

# ============================================================
# Model Statistics
# ============================================================

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Statistics:")
print("-" * 60)
print(f"   Total parameters:      {total_params:,}")
print(f"   Trainable parameters:  {trainable_params:,}")
print(f"   Trainable %:           {100 * trainable_params / total_params:.2f}%")
print(f"   Model size (approx):   {total_params * 4 / 1024 / 1024:.2f} MB")

# ============================================================
# Test Generation (before our fine-tuning)
# ============================================================

print("\n" + "-" * 60)
print("Testing generation (before fine-tuning on our 3 datasets)...")
print("-" * 60)

test_prompt = PROMPT_TEMPLATE.format(
    question="How many employees work in each department?",
    schema="""Database: hr_1
Tables:
- employees(employee_id*, first_name, last_name, department_id)
- departments(department_id*, department_name)
FK employees.department_id -> departments.department_id"""
)

print(f"Test prompt:\n{test_prompt}\n")

# Tokenize and generate
inputs = tokenizer(test_prompt, return_tensors="pt", max_length=MAX_INPUT_LENGTH, truncation=True).to(DEVICE)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=GEN_MAX_LENGTH,
        num_beams=GEN_NUM_BEAMS,
        temperature=GEN_TEMPERATURE if GEN_TEMPERATURE > 0 else 1.0,
        do_sample=False
    )

generated_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Generated SQL (baseline, before our fine-tuning):")
print(f"   {generated_sql}")

print("\n‚úÖ Model is ready for fine-tuning on our 3 datasets!")
print("=" * 60)

LOADING MODEL & TOKENIZER

Loading tokenizer...
------------------------------------------------------------


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


‚úÖ Tokenizer loaded: juierror/flan-t5-text2sql-with-schema-v2
   Vocab size: 32101
   Model max length: 512

Tokenization test:
   Input: SELECT * FROM Department WHERE DepartmentID = 1;
   Token IDs shape: torch.Size([1, 15])
   Number of tokens: 15

------------------------------------------------------------
Loading model (this may take a minute)...
------------------------------------------------------------
‚úÖ Model loaded: juierror/flan-t5-text2sql-with-schema-v2
   Device: cuda

Model Statistics:
------------------------------------------------------------
   Total parameters:      247,536,384
   Trainable parameters:  247,536,384
   Trainable %:           100.00%
   Model size (approx):   944.28 MB

------------------------------------------------------------
Testing generation (before fine-tuning on our 3 datasets)...
------------------------------------------------------------
Test prompt:
Question: How many employees work in each department?

Schema:
Database: hr_1
Tables:

# DATA LOADING AND PREPROCESSING

In [5]:
# ============================================================
# STEP 5: DATA LOADING AND PREPROCESSING
# ============================================================

print("=" * 60)
print("DATA LOADING AND PREPROCESSING")
print("=" * 60)

# ============================================================
# Load JSONL File
# ============================================================

def load_jsonl(path):
    """Load JSONL file and return list of dictionaries."""
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows

print("\nLoading dataset...")
print("-" * 60)

# Load your combined dataset
train_data_full = load_jsonl("Data_for_demo.jsonl")

print(f"Total examples loaded: {len(train_data_full)}")

# Show breakdown by database
from collections import Counter
db_counts = Counter([ex['dataset'] for ex in train_data_full])

print(f"\nBreakdown by database:")
for db_id, count in sorted(db_counts.items()):
    print(f"  {db_id}: {count} examples")

# Show sample
print(f"\nSample training example:")
sample = train_data_full[0]
print(f"  Dataset: {sample['dataset']}")
print(f"  Question: {sample['question']}")
print(f"  Gold SQL: {sample['gold_query'][:80]}...")

# ============================================================
# Create Train/Validation Split
# ============================================================

print("\n" + "-" * 60)
print("Creating train/validation split (90/10)...")
print("-" * 60)

from sklearn.model_selection import train_test_split

# Split 90% train, 10% validation
train_data, val_data = train_test_split(
    train_data_full,
    test_size=0.1,
    random_state=SEED,
    shuffle=True
)

print(f"Training examples:   {len(train_data)}")
print(f"Validation examples: {len(val_data)}")

# ============================================================
# Preprocessing Function
# ============================================================

def preprocess_function(examples):
    """
    Preprocess examples for fine-tuning.

    Args:
        examples: Dictionary with lists of questions, schemas, and SQL queries

    Returns:
        Dictionary with tokenized inputs and labels
    """
    # Build input prompts
    inputs = []
    for question, schema in zip(examples['question'], examples['schema_serialized']):
        prompt = PROMPT_TEMPLATE.format(question=question, schema=schema)
        inputs.append(prompt)

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding=False  # We'll pad dynamically in the data collator
    )

    # Tokenize targets (SQL queries)
    labels = tokenizer(
        text_target=examples['gold_query'],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding=False
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# ============================================================
# Convert to HuggingFace Dataset Format
# ============================================================

print("\n" + "-" * 60)
print("Converting to HuggingFace Dataset format...")
print("-" * 60)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_dict({
    'question': [ex['question'] for ex in train_data],
    'schema_serialized': [ex['schema_serialized'] for ex in train_data],
    'gold_query': [ex['gold_query'] for ex in train_data]
})

val_dataset = Dataset.from_dict({
    'question': [ex['question'] for ex in val_data],
    'schema_serialized': [ex['schema_serialized'] for ex in val_data],
    'gold_query': [ex['gold_query'] for ex in val_data]
})

print(f"‚úÖ Datasets converted")
print(f"   Train dataset: {len(train_dataset)} examples")
print(f"   Val dataset:   {len(val_dataset)} examples")

# ============================================================
# Tokenize Datasets
# ============================================================

print("\n" + "-" * 60)
print("Tokenizing datasets (this may take 1-2 minutes)...")
print("-" * 60)

# Tokenize with batching for speed
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing training data"
)

tokenized_val = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing validation data"
)

print(f"‚úÖ Tokenization complete")
print(f"   Tokenized train: {len(tokenized_train)} examples")
print(f"   Tokenized val:   {len(tokenized_val)} examples")

# Show tokenized example
print(f"\nTokenized example (first training sample):")
print(f"   Input IDs length:  {len(tokenized_train[0]['input_ids'])}")
print(f"   Label IDs length:  {len(tokenized_train[0]['labels'])}")

# ============================================================
# Data Collator
# ============================================================

print("\n" + "-" * 60)
print("Setting up data collator...")
print("-" * 60)

# Data collator handles dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    max_length=MAX_INPUT_LENGTH
)

print(f"‚úÖ Data collator ready")
print(f"   Will pad batches dynamically during training")
print("\n‚úÖ Data preprocessing complete!")
print("=" * 60)

DATA LOADING AND PREPROCESSING

Loading dataset...
------------------------------------------------------------
Total examples loaded: 406

Breakdown by database:
  college_2: 170 examples
  hr_1: 124 examples
  store_1: 112 examples

Sample training example:
  Dataset: store_1
  Question: A list of the top 5 countries by number of invoices. List country name and number of invoices.
  Gold SQL: SELECT billing_country ,  COUNT(*) FROM invoices GROUP BY billing_country ORDER ...

------------------------------------------------------------
Creating train/validation split (90/10)...
------------------------------------------------------------
Training examples:   365
Validation examples: 41

------------------------------------------------------------
Converting to HuggingFace Dataset format...
------------------------------------------------------------
‚úÖ Datasets converted
   Train dataset: 365 examples
   Val dataset:   41 examples

---------------------------------------------------

Tokenizing training data:   0%|          | 0/365 [00:00<?, ? examples/s]

Tokenizing validation data:   0%|          | 0/41 [00:00<?, ? examples/s]

‚úÖ Tokenization complete
   Tokenized train: 365 examples
   Tokenized val:   41 examples

Tokenized example (first training sample):
   Input IDs length:  426
   Label IDs length:  20

------------------------------------------------------------
Setting up data collator...
------------------------------------------------------------
‚úÖ Data collator ready
   Will pad batches dynamically during training

‚úÖ Data preprocessing complete!


# TRAINING SETUP & FINE-TUNING

In [6]:
print("=" * 60)
print("FINE-TUNING SETUP")
print("=" * 60)

# ============================================================
# Training Arguments
# ============================================================

print("\nSetting up training arguments...")
print("-" * 60)

training_args = Seq2SeqTrainingArguments(
    output_dir=str(OUTPUT_DIR),

    # Training hyperparameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,

    # Evaluation and logging
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    logging_steps=LOGGING_STEPS,

    # Generation settings for evaluation
    predict_with_generate=True,
    generation_max_length=GEN_MAX_LENGTH,
    generation_num_beams=GEN_NUM_BEAMS,

    # Performance optimizations for Colab GPU
    fp16=FP16,
    dataloader_num_workers=2,

    # Model saving
    save_total_limit=3,  # Keep only 3 best checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Other settings
    report_to="none",  # Disable wandb/tensorboard
    seed=SEED,
    remove_unused_columns=False,
)

print(f"‚úÖ Training arguments configured")
print(f"   Output directory: {OUTPUT_DIR}")
print(f"   Total epochs: {NUM_EPOCHS}")
print(f"   Steps per epoch: ~{len(tokenized_train) // BATCH_SIZE}")
print(f"   Total training steps: ~{(len(tokenized_train) // BATCH_SIZE) * NUM_EPOCHS}")

# ============================================================
# Initialize Trainer
# ============================================================

print("\n" + "-" * 60)
print("Initializing Seq2SeqTrainer...")
print("-" * 60)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print(f"‚úÖ Trainer initialized")
print(f"   Training samples: {len(tokenized_train)}")
print(f"   Validation samples: {len(tokenized_val)}")

# ============================================================
# Start Fine-Tuning
# ============================================================

print("\n" + "=" * 60)
print("STARTING FINE-TUNING")
print("=" * 60)
print("\nEstimated time:")
print("   - On T4 GPU (Colab Free): ~25-35 minutes")
print("   - On A100 GPU (Colab Pro): ~8-12 minutes")
print("\nTraining progress will be displayed below...")
print("You'll see:")
print("   - Loss decreasing over time")
print("   - Evaluation metrics every 200 steps")
print("   - Checkpoints saved every 200 steps")
print("=" * 60 + "\n")

# Start training!
train_result = trainer.train()

print("\n" + "=" * 60)
print("üéâ TRAINING COMPLETE!")
print("=" * 60)

# Print training metrics
print(f"\nTraining Metrics:")
print(f"   Total runtime: {train_result.metrics['train_runtime']:.2f} seconds ({train_result.metrics['train_runtime']/60:.1f} minutes)")
print(f"   Samples per second: {train_result.metrics['train_samples_per_second']:.2f}")
print(f"   Steps per second: {train_result.metrics['train_steps_per_second']:.3f}")
print(f"   Final train loss: {train_result.metrics['train_loss']:.4f}")

# ============================================================
# Evaluate on Validation Set
# ============================================================

print("\n" + "-" * 60)
print("Evaluating on validation set...")
print("-" * 60)

eval_result = trainer.evaluate()

print(f"‚úÖ Evaluation complete")
print(f"\nValidation Metrics:")
print(f"   Eval loss: {eval_result['eval_loss']:.4f}")
print(f"   Eval runtime: {eval_result['eval_runtime']:.2f} seconds")
print(f"   Samples per second: {eval_result['eval_samples_per_second']:.2f}")

print("\n" + "=" * 60)
print("‚úÖ Fine-tuning complete! Model is ready for demo.")
print("=" * 60)

FINE-TUNING SETUP

Setting up training arguments...
------------------------------------------------------------
‚úÖ Training arguments configured
   Output directory: finetuned_demo_model
   Total epochs: 10
   Steps per epoch: ~45
   Total training steps: ~450

------------------------------------------------------------
Initializing Seq2SeqTrainer...
------------------------------------------------------------
‚úÖ Trainer initialized
   Training samples: 365
   Validation samples: 41

STARTING FINE-TUNING

Estimated time:
   - On T4 GPU (Colab Free): ~25-35 minutes
   - On A100 GPU (Colab Pro): ~8-12 minutes

Training progress will be displayed below...
You'll see:
   - Loss decreasing over time
   - Evaluation metrics every 200 steps
   - Checkpoints saved every 200 steps



  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
200,0.0,
400,0.0,





üéâ TRAINING COMPLETE!

Training Metrics:
   Total runtime: 104.40 seconds (1.7 minutes)
   Samples per second: 34.96
   Steps per second: 4.406
   Final train loss: 0.0000

------------------------------------------------------------
Evaluating on validation set...
------------------------------------------------------------




‚úÖ Evaluation complete

Validation Metrics:
   Eval loss: nan
   Eval runtime: 0.49 seconds
   Samples per second: 82.93

‚úÖ Fine-tuning complete! Model is ready for demo.


In [7]:
# ============================================================
# STEP 7: SAVE FINE-TUNED MODEL
# ============================================================

print("=" * 60)
print("SAVING FINE-TUNED MODEL")
print("=" * 60)

# ============================================================
# Save Model and Tokenizer
# ============================================================

print("\nSaving model and tokenizer...")
print("-" * 60)

FINAL_MODEL_DIR = OUTPUT_DIR / "final_model"
FINAL_MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Save model
trainer.save_model(str(FINAL_MODEL_DIR))

# Save tokenizer
tokenizer.save_pretrained(str(FINAL_MODEL_DIR))

print(f"‚úÖ Model saved to: {FINAL_MODEL_DIR}")
print(f"‚úÖ Tokenizer saved to: {FINAL_MODEL_DIR}")

# ============================================================
# Save Training Configuration
# ============================================================

print("\n" + "-" * 60)
print("Saving training configuration...")
print("-" * 60)

training_config = {
    "base_model": BASE_MODEL_NAME,
    "datasets": ["college_2", "hr_1", "store_1"],
    "num_examples": len(train_data_full),
    "train_examples": len(train_data),
    "val_examples": len(val_data),
    "num_epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "warmup_steps": WARMUP_STEPS,
    "weight_decay": WEIGHT_DECAY,
    "max_input_length": MAX_INPUT_LENGTH,
    "max_target_length": MAX_TARGET_LENGTH,
    "final_train_loss": train_result.metrics.get('train_loss', 'N/A'),
    "final_eval_loss": eval_result.get('eval_loss', 'N/A'),
    "training_runtime_seconds": train_result.metrics.get('train_runtime', 'N/A'),
}

config_path = OUTPUT_DIR / "training_config.json"
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(training_config, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Training config saved to: {config_path}")
print("\n‚úÖ All files saved successfully!")
print("=" * 60)

SAVING FINE-TUNED MODEL

Saving model and tokenizer...
------------------------------------------------------------
‚úÖ Model saved to: finetuned_demo_model/final_model
‚úÖ Tokenizer saved to: finetuned_demo_model/final_model

------------------------------------------------------------
Saving training configuration...
------------------------------------------------------------
‚úÖ Training config saved to: finetuned_demo_model/training_config.json

‚úÖ All files saved successfully!


# TESTING MODEL PERFORMANCE

In [8]:
# ============================================================
# STEP 8: TESTING ON DEMO DATASETS (Modified for direct upload)
# ============================================================

print("=" * 60)
print("TESTING FINE-TUNED MODEL")
print("=" * 60)

# ============================================================
# SQL Utilities
# ============================================================

def canonical_sql(sql_text):
    """Normalize SQL to canonical form using sqlglot."""
    if not sql_text:
        return None
    try:
        ast = parse_one(sql_text, read="sqlite")
        return ast.sql(dialect="sqlite", pretty=False)
    except Exception:
        return None


def try_execute(conn, sql_text):
    """Execute SQL query and return result set."""
    try:
        cur = conn.execute(sql_text)
        rows = cur.fetchall()

        # Normalize floats
        normalized = []
        for row in rows:
            norm_row = []
            for val in row:
                if isinstance(val, float):
                    norm_row.append(round(val, 6))
                else:
                    norm_row.append(val)
            normalized.append(tuple(norm_row))

        return set(normalized), None

    except Exception as e:
        return None, str(e)


def extract_sql(text):
    """Extract SQL from model output."""
    text = text.strip()

    # Remove markdown code blocks if present
    if "```" in text:
        parts = text.split("```")
        for part in parts:
            if "select" in part.lower() or "SELECT" in part:
                text = part.strip()
                if text.lower().startswith("sql"):
                    text = text[3:].strip()
                break

    # Remove common prefixes
    for prefix in ["sql:", "answer:", "query:"]:
        if text.lower().startswith(prefix):
            text = text[len(prefix):].strip()

    # Ensure semicolon
    if ";" in text:
        text = text.split(";", 1)[0] + ";"

    return text.strip()

print("\n‚úÖ SQL utilities defined")

# ============================================================
# Load All Test Data (Full Dataset)
# ============================================================

print("\n" + "-" * 60)
print("Loading test data...")
print("-" * 60)

# Use the full dataset for testing
test_data = train_data_full  # All 406 examples

print(f"Total test examples: {len(test_data)}")

# Breakdown by database
db_counts = Counter([ex['dataset'] for ex in test_data])
print(f"\nTest examples by database:")
for db_id, count in sorted(db_counts.items()):
    print(f"  {db_id}: {count} examples")

# ============================================================
# Prepare Database Connections (Modified for direct upload)
# ============================================================

print("\n" + "-" * 60)
print("Setting up database connections...")
print("-" * 60)

# Dictionary to store database connections
db_connections = {}

# Check for directly uploaded database files (no folder structure)
db_files = {
    'college_2': 'college_2.sqlite',
    'hr_1': 'hr_1.sqlite',
    'store_1': 'store_1.sqlite'
}

# Try to connect to each database
for db_id, db_filename in db_files.items():
    # Check if file exists in current directory
    if os.path.exists(db_filename):
        conn = sqlite3.connect(db_filename)
        conn.execute("PRAGMA foreign_keys=ON")
        db_connections[db_id] = conn
        print(f"‚úÖ Connected to: {db_id} ({db_filename})")
    else:
        print(f"‚ö†Ô∏è  Database not found: {db_filename}")

if not db_connections:
    print("\n‚ö†Ô∏è  WARNING: No databases found!")
    print("   The evaluation will only calculate Exact Match (EM) metrics.")
    print("   Upload database files to calculate EX and Valid SQL metrics.")
else:
    print(f"\n‚úÖ Connected to {len(db_connections)}/3 databases")

# ============================================================
# Evaluation Loop
# ============================================================

print("\n" + "=" * 60)
print("RUNNING EVALUATION")
print("=" * 60)

results = []
n_examples = len(test_data)

# Overall metrics
em_count = 0
ex_count = 0
valid_count = 0
latencies = []

# Per-database metrics
db_metrics = {db: {'em': 0, 'ex': 0, 'valid': 0, 'total': 0} for db in ['college_2', 'hr_1', 'store_1']}

print(f"\nEvaluating on {n_examples} examples...")
print("-" * 60)

eval_model = trainer.model
eval_model.eval()

for i, example in enumerate(test_data, 1):
    question = example['question']
    gold_sql = example['gold_query']
    schema = example['schema_serialized']
    db_id = example['dataset']

    # Build prompt
    prompt = PROMPT_TEMPLATE.format(question=question, schema=schema)

    # Tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=MAX_INPUT_LENGTH,
        truncation=True
    ).to(DEVICE)

    # Generate SQL
    start_time = time.time()

    with torch.no_grad():
        outputs = eval_model.generate(
            **inputs,
            max_length=GEN_MAX_LENGTH,
            num_beams=GEN_NUM_BEAMS,
            temperature=GEN_TEMPERATURE if GEN_TEMPERATURE > 0 else 1.0,
            do_sample=False
        )

    gen_time_ms = (time.time() - start_time) * 1000.0
    latencies.append(gen_time_ms)

    # Decode
    pred_sql_raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred_sql_raw = extract_sql(pred_sql_raw)

    # Normalize
    pred_sql_norm = canonical_sql(pred_sql_raw)
    gold_sql_norm = canonical_sql(gold_sql)

    # ============================================================
    # Compute Metrics
    # ============================================================

    # Exact Match (EM)
    em = int(
        pred_sql_norm is not None and
        gold_sql_norm is not None and
        pred_sql_norm == gold_sql_norm
    )

    # Execution Accuracy (EX) and Valid SQL
    valid = 0
    ex_ok = 0
    error = None

    # Get database connection
    conn = db_connections.get(db_id)

    if conn and pred_sql_norm is not None:
        # Try to execute predicted SQL
        pred_rows, error = try_execute(conn, pred_sql_norm)

        if pred_rows is not None:
            valid = 1  # SQL is valid

            # Execute gold SQL
            gold_rows, gold_error = try_execute(conn, gold_sql_norm or gold_sql)

            if gold_rows is not None:
                # Compare result sets
                ex_ok = int(pred_rows == gold_rows)
            else:
                error = f"Gold SQL failed: {gold_error}"
    elif pred_sql_norm is None:
        error = "ParseError: Could not parse predicted SQL"
    else:
        error = f"Database connection not available for {db_id}"

    # Update counters
    em_count += em
    ex_count += ex_ok
    valid_count += valid

    # Update per-database metrics
    db_metrics[db_id]['total'] += 1
    db_metrics[db_id]['em'] += em
    db_metrics[db_id]['ex'] += ex_ok
    db_metrics[db_id]['valid'] += valid

    # Store result
    results.append({
        "id": example.get("id", f"{db_id}_{i}"),
        "dataset": db_id,
        "question": question,
        "gold_sql": gold_sql,
        "pred_sql_raw": pred_sql_raw,
        "pred_sql_norm": pred_sql_norm or "",
        "em": em,
        "ex": ex_ok,
        "valid_sql": valid,
        "latency_ms": round(gen_time_ms, 2),
        "error": error or ""
    })

    # Progress update
    if i % 50 == 0 or i == n_examples:
        print(f"[{i}/{n_examples}] Overall: EM={em_count/i:.3f} EX={ex_count/i:.3f} Valid={valid_count/i:.3f}")

# ============================================================
# Save Results
# ============================================================

print("\n" + "-" * 60)
print("Saving results...")
print("-" * 60)

RESULTS_CSV = OUTPUT_DIR / "test_results_demo_datasets.csv"

with open(RESULTS_CSV, "w", newline="", encoding="utf-8") as f:
    if results:
        writer = csv.DictWriter(f, fieldnames=list(results[0].keys()))
        writer.writeheader()
        writer.writerows(results)

print(f"‚úÖ Results saved to: {RESULTS_CSV}")

# ============================================================
# Summary Statistics
# ============================================================

em_rate = em_count / n_examples
ex_rate = ex_count / n_examples
valid_rate = valid_count / n_examples
median_latency = sorted(latencies)[len(latencies) // 2] if latencies else 0

print("\n" + "=" * 60)
print("EVALUATION SUMMARY")
print("=" * 60)
print(f"\nModel: Fine-tuned on 3 Demo Datasets")
print(f"Base Model: {BASE_MODEL_NAME}")
print(f"Datasets: college_2, hr_1, store_1")
print(f"Total Examples: {n_examples}")

print(f"\nüìä OVERALL METRICS:")
print("-" * 60)
print(f"  Exact Match (EM):        {em_rate:.1%} ({em_count}/{n_examples})")
print(f"  Execution Accuracy (EX): {ex_rate:.1%} ({ex_count}/{n_examples})")
print(f"  Valid-SQL rate:          {valid_rate:.1%} ({valid_count}/{n_examples})")
print(f"  Median generation time:  {median_latency:.1f} ms")

print(f"\nüìä PER-DATABASE BREAKDOWN:")
print("-" * 60)

for db_id in sorted(db_metrics.keys()):
    metrics = db_metrics[db_id]
    total = metrics['total']
    if total > 0:
        print(f"\n{db_id.upper()} ({total} examples):")
        print(f"  EM:    {metrics['em']/total:.1%} ({metrics['em']}/{total})")
        print(f"  EX:    {metrics['ex']/total:.1%} ({metrics['ex']}/{total})")
        print(f"  Valid: {metrics['valid']/total:.1%} ({metrics['valid']}/{total})")

print(f"\nüíæ Results saved to: {RESULTS_CSV}")
print("=" * 60)

# Close database connections
for conn in db_connections.values():
    conn.close()

print("\n‚úÖ Testing complete!")

TESTING FINE-TUNED MODEL

‚úÖ SQL utilities defined

------------------------------------------------------------
Loading test data...
------------------------------------------------------------
Total test examples: 406

Test examples by database:
  college_2: 170 examples
  hr_1: 124 examples
  store_1: 112 examples

------------------------------------------------------------
Setting up database connections...
------------------------------------------------------------
‚úÖ Connected to: college_2 (college_2.sqlite)
‚úÖ Connected to: hr_1 (hr_1.sqlite)
‚úÖ Connected to: store_1 (store_1.sqlite)

‚úÖ Connected to 3/3 databases

RUNNING EVALUATION

Evaluating on 406 examples...
------------------------------------------------------------


KeyboardInterrupt: 