# Purpose
This notebook applies post-hoc fixes to results data. Every modification to the data must be recorded in this notebook. Existing performance metrics are never modified: the only permitted changes are modifications to independent variables and additions of new performance metrics.

In [1]:
import pandas as pd
import os
from tqdm import tqdm
from NlSqlBenchmark.snails.util import sqlite_db_util

In [2]:
# Make the parent of the directory the kernel started in the working directory,
# so that the relative "./..." paths used by the cells below resolve.
# The starting directory is remembered on the first run, which makes this cell
# safe to re-run: it no longer climbs one more level on every execution.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
cwd = os.path.dirname(NOTEBOOK_START_DIR)
os.chdir(cwd)

### Add prompt token counts to already-completed CHESS subsetting runs

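This traverses the CHESS logs and counts the tokens in the prompts used to generate subsets. It saves the total to the `prompt_tokens` column of the results for each generated subset. Only rows whose `prompt_tokens` value is currently 0 are recomputed; rows that already have a count are left unchanged.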

##### SNAILS

In [None]:
from SchemaSubsetter.ChessSubsetter import ChessSubsetter

# Load the SNAILS CHESS subsetting results from the archive folder.
results_file = "./subsetting_results/archive/subsetting-chess-snails-Native-gpt4o.xlsx"
results_df = pd.read_excel(results_file)

# Back-fill prompt_tokens from the CHESS logs for every row still recorded as 0.
for row in tqdm(results_df.itertuples(), total=len(results_df)):
    if row.prompt_tokens != 0:
        # A count is already recorded for this row; keep it as it is.
        continue
    # Only the total is stored; the first returned value is not used here.
    token_counts, total_tokens = ChessSubsetter.get_token_counts_from_log(
        row.database, row.question_number
    )
    results_df.at[row.Index, "prompt_tokens"] = total_tokens

100%|██████████| 503/503 [1:11:45<00:00,  8.56s/it]


In [17]:
# Write the back-filled SNAILS results next to (not into) the archive folder.
# NOTE: `results_df` is reassigned by the BIRD cell further down, so run this
# right after the SNAILS back-fill cell to avoid saving the wrong frame.
snails_output_file = "./subsetting_results/subsetting-chess-snails-Native-gpt4o.xlsx"
results_df.to_excel(snails_output_file, index=False)

##### BIRD

In [4]:
from SchemaSubsetter.ChessSubsetter import ChessSubsetter

# Load the BIRD CHESS subsetting results from the archive folder.
results_file = "./subsetting_results/archive/subsetting-chess-bird-Native-gpt4o.xlsx"
results_df = pd.read_excel(results_file)

# Back-fill prompt_tokens from the CHESS logs for every row still recorded as 0.
for row in tqdm(results_df.itertuples(), total=len(results_df)):
    if row.prompt_tokens != 0:
        # A count is already recorded for this row; keep it as it is.
        continue
    # Only the total is stored; the first returned value is not used here.
    token_counts, total_tokens = ChessSubsetter.get_token_counts_from_log(
        row.database, row.question_number
    )
    results_df.at[row.Index, "prompt_tokens"] = total_tokens

NlSqlBenchmark.NlSqlBenchmarkFactory


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1534/1534 [35:21<00:00,  1.38s/it]


In [5]:
# Write the back-filled BIRD results next to (not into) the archive folder.
# NOTE: `results_df` is also assigned by the SNAILS cell above, so run this
# right after the BIRD back-fill cell to avoid saving the wrong frame.
bird_output_file = "./subsetting_results/subsetting-chess-bird-Native-gpt4o.xlsx"
results_df.to_excel(bird_output_file, index=False)

### Index the SNAILS NYSED database to improve performance

In [3]:

# Run the NYSED indexing helper, pointing it at the SNAILS SQLite db-info file.
# NOTE(review): presumably safe to re-run when the indexes already exist —
# confirm in sqlite_db_util before relying on that.
nysed_db_list_file = "./benchmarks/snails/snails_sqlite/sqlite_dbinfo.json"
sqlite_db_util.index_nysed_db(db_list_file=nysed_db_list_file)