Import

In [1]:
from accelerate.utils import set_seed
from tqdm import tqdm

import json
import numpy as np
import random
import torch
from utils.functions import set_seed_all, set_result_filename, run_end2end, check_nltk_resource
import os

from evaluators.llm_evaluator import LLMEvaluator, LLMLoraEvaluator
from generators.llm_generator import LLMGenerator, LLMLoraGenerator

Parameters

In [2]:
# Hugging Face model ids of the base LLMs that can act as generator/evaluator.
# The position in this list is what the model index in the parameters cell selects.
llm_names = [
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "stabilityai/stable-code-3b",
    "deepseek-ai/deepseek-coder-1.3b-base",
]


def make_lora_savename(model_id, suffix="_spider"):
    """Build the LoRA checkpoint save name for a base model id.

    The save name is the last path component of ``model_id`` followed by
    ``suffix``.  Taking the *last* component (instead of index 1 after a
    split) keeps ids without an organisation prefix ("gpt2") and
    multi-segment local paths ("ckpt/local/model") working.

    Args:
        model_id: model identifier such as "org/model-name".
        suffix: text appended to the bare model name.

    Returns:
        The save name, e.g. "stable-code-3b_spider".
    """
    # rstrip guards against a trailing slash producing an empty model name.
    return model_id.rstrip("/").rsplit("/", 1)[-1] + suffix


# LoRA model save names, index-aligned with llm_names.
lora_model_names = [make_lora_savename(name) for name in llm_names]

In [3]:
seed = 42

# --- evaluator ---------------------------------------------------------------
evaluator_model_indx = 2  # index into llm_names: the model to evaluate with
evaluator_name = llm_names[evaluator_model_indx]  # base model name
model_savename = lora_model_names[evaluator_model_indx]  # lora model save name
evaluation_config = "evaluation_configs/pro.json"

# --- generator ---------------------------------------------------------------
generator_model_indx = 2  # index into llm_names: the model to generate with
generator_name = llm_names[generator_model_indx]  # base model name
generator_lora_savename = lora_model_names[generator_model_indx]  # lora model save name
# there are two configs for generation: temp_sampling.json (5 candidates) and greedy.json (1 candidate)
generation_config = "generation_configs/temp_sampling.json"

# Kept for backward compatibility: this cell used to reuse one `model_indx`
# for both roles, leaving it bound to the generator's index afterwards.
model_indx = generator_model_indx

# --- derived paths -----------------------------------------------------------
print(f"evaluator_name: {evaluator_name}")
print(f"model_savename: {model_savename}")
current_directory = os.getcwd()
model_savedatapath = os.path.join(current_directory, f"checkpts/{model_savename}/model")
evaluator_peft_dir = model_savedatapath
# Needed by the (optional) LLMLoraGenerator in the generator cell.
# NOTE(review): assumes generator LoRA checkpoints use the same
# checkpts/<savename>/model layout as the evaluator - confirm.
generator_peft_dir = os.path.join(current_directory, f"checkpts/{generator_lora_savename}/model")

# --- dataset / run settings --------------------------------------------------
test_fname = "data/spider_dev_400.json"
dataset_name = "spider"
db_path = "spider/database"
method_name = "rerank"  # planning method

# Compute the run-specific file stem once so the result file and the log file
# are guaranteed to share it.
run_tag = set_result_filename(evaluator_name, generator_name, dataset_name, method_name)
# result_fname: where the results will be saved for evaluation
result_fname = f"results/{run_tag}_pro_e2e" + ".sql"
log_name = f"log/{run_tag}_pro_e2e" + ".json"

retriever_gen = None  # retriever generator
retriever_eval = None  # retriever evaluator

# yes_token_indx:
#     the index of the token in the vocabulary that corresponds to the "Yes" text.
#     CodeLlama-Instruct: "No" 1939 "Yes" 3869
#     TinyLlama: "Yes" 3869
# None is passed through to the evaluator as-is; presumably the evaluator then
# determines the index itself - confirm in evaluators.llm_evaluator.
yes_token_indx = None  # 3869

evaluator_name: stabilityai/stable-code-3b
model_savename: stable-code-3b_spider


In [4]:
# Seed the run via the project helper (utils.functions.set_seed_all) using the
# `seed` chosen in the parameters cell, before any model is loaded or sampled.
# NOTE(review): presumably this seeds random / numpy / torch - confirm in the helper.
set_seed_all(seed)

Load Evaluator LLM

In [5]:
# Base-model evaluator (no LoRA adapter). To evaluate with the fine-tuned
# adapter stored under evaluator_peft_dir, use this instead:
#   evaluator = LLMLoraEvaluator(evaluator_name, evaluator_peft_dir, db_path, device="cuda", yes_token_indx=yes_token_indx)
evaluator = LLMEvaluator(
    evaluator_name,
    db_path,
    device="cuda",
    yes_token_indx=yes_token_indx,
)

# Show which "Yes" token index the evaluator reports for this model.
yindx = evaluator.get_yes_token()
print(f"Yes token index: {yindx}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Yes token index: 4374


Load Generator LLM

In [6]:
# Base-model generator (no LoRA adapter). The LoRA variant below additionally
# requires `generator_peft_dir` to point at the adapter checkpoint directory:
#   generator = LLMLoraGenerator(generator_name, generator_peft_dir, device="cuda")
generator = LLMGenerator(
    generator_name,
    device="cuda",
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Planning method

In [None]:
# Only the "rerank" planning method is wired up; reject anything else up front,
# then bind the chosen implementation to the generic name `planner`.
if method_name != "rerank":
    raise ValueError(f"Unknown planning method: {method_name}")
from planning_methods.llm_planner import rerank as planner

End-to-end run and store results for evaluation

In [11]:
# Run the full generate -> evaluate -> plan pipeline over the test file.
# Arguments are positional and must stay in this order.
run_end2end(
    generator,
    evaluator,
    generation_config,
    evaluation_config,
    planner,
    retriever_gen,
    retriever_eval,
    test_fname,
    dataset_name,
    result_fname,
    log_name,
)

100%|██████████| 3/3 [27:25<00:00, 548.67s/it]


In [19]:
# Project helper from utils.functions, run before the evaluation script below.
# NOTE(review): presumably verifies/downloads the NLTK data the evaluation
# script needs - confirm in the helper.
check_nltk_resource()



In [20]:
import subprocess
import sys

# Inputs for the test-suite evaluation script.
db = db_path  # the directory that contains all the databases and test suites
table = "spider/tables.json"  # the tables.json schema file
pred = result_fname  # the path to the predicted queries
gold = "data/spider_dev_400_gold.sql"  # the path to the gold queries
# evaluation type: "exec" for test suite accuracy, "match" for the original
# exact set match accuracy; "all" is what this notebook uses.
etype = "all"
pscript = "test-suite-sql-eval/evaluation.py"  # the evaluation script

cmd = [
    # Use the kernel's own interpreter: a bare "python" is resolved through
    # PATH and may be a different environment without the required packages.
    sys.executable, "-u", pscript,
    "--gold", gold,
    "--pred", pred,
    "--db", db,
    "--table", table,
    "--etype", etype,
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)  # Check for errors
# Fail loudly (after both streams were shown) if the script exited non-zero,
# so a broken evaluation cannot be mistaken for a finished one.
result.check_returncode()


easy pred: SELECT DISTINCT s.country   FROM singer AS s   JOIN singer_in_concert AS sic   ON s.singer_id = sic.singer_id   JOIN concert AS c   ON sic.concert_id = c.concert_id   WHERE s.age > 20   GROUP BY s.country;
easy gold: SELECT DISTINCT country FROM singer WHERE age > 20;

easy pred: SELECT DISTINCT course_name FROM courses ;
easy gold: SELECT DISTINCT t1.course_name FROM courses AS t1 JOIN student_enrolment_courses AS t2 ON t1.course_id = t2.course_id;

easy pred: SELECT COUNT(DISTINCT winner_name) FROM matches ;
easy gold: SELECT COUNT(DISTINCT loser_name) FROM matches;

                     easy                 medium               hard                 extra                all                 
count                3                    0                    0                    0                    3                   
execution            0.333                0.000                0.000                0.000                0.333               

exact match          0.000        