In [None]:
import json
from pathlib import Path
import pandas as pd

# Helper for loading a JSONL file
def load_jsonl(filepath):
    """Load a JSONL file and return its records as a list.

    Args:
        filepath: Path (str or path-like) of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
        Lines that are empty or whitespace-only are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would make non-ASCII content parse differently per OS.
    with open(filepath, 'r', encoding='utf-8') as f:
        stripped_lines = (raw_line.strip() for raw_line in f)
        return [json.loads(text) for text in stripped_lines if text]

# Locations of the dataset file and of the gold SQL directory
base_path = Path('spider2-snow')
current_file = base_path.joinpath('spider2-snow.jsonl')
sql_dir = base_path.joinpath('evaluation_suite', 'gold', 'sql')

# Parse every record of the dataset file
current_data = load_jsonl(current_file)

print(f"Current version (spider2-snow.jsonl): {len(current_data)} entries")

In [None]:
# Convert the loaded records to a DataFrame for easier analysis
df_current = pd.DataFrame(current_data)

# Print a schema overview: per-column dtypes, the column names, and the row count
print("=== Current Version Schema ===")
print(df_current.dtypes)
print(f"\nColumns: {list(df_current.columns)}")
print(f"Total entries: {len(df_current)}")


In [None]:
# Collect the gold SQL files. Path.glob does not guarantee any ordering, so
# sort the paths up front; the sample printed below is then reproducible
# across runs and filesystems.
sql_files = sorted(sql_dir.glob('*.sql'))
sql_ids = {f.stem for f in sql_files}  # instance_id is the filename without the .sql suffix

print("=== SQL Files Analysis ===")
print(f"Total SQL files in gold/sql: {len(sql_files)}")
print(f"\nSample SQL file names: {[f.name for f in sql_files[:10]]}")


In [None]:
# Compare the entry ids against the ids of the available SQL files
current_ids = set(df_current["instance_id"])
entries_with_sql = current_ids & sql_ids       # entry and SQL file both present
entries_without_sql = current_ids - sql_ids    # entry has no SQL file
sql_without_entries = sql_ids - current_ids    # SQL file has no entry

print("=== Entry to SQL Mapping ===")
print(f"Entries with corresponding SQL file: {len(entries_with_sql)}")
print(f"Entries WITHOUT SQL file: {len(entries_without_sql)}")
# Also report orphan SQL files so that all three sets above are accounted for
print(f"SQL files WITHOUT entry: {len(sql_without_entries)}")

In [None]:
# Helpers for reading SQL and external-knowledge files, then a sample entry
def read_sql_file(instance_id):
    """Read the SQL file content for a given instance_id.

    Args:
        instance_id: Entry identifier; the file looked up is
            ``sql_dir / f"{instance_id}.sql"``.

    Returns:
        The file's text, or None when no regular file exists at that path.
    """
    sql_path = sql_dir / f"{instance_id}.sql"
    # is_file() (rather than exists()) also rejects a directory of that name,
    # which would otherwise fail when opened.
    if not sql_path.is_file():
        return None
    # Decode explicitly so the result does not depend on the platform's locale
    # encoding. NOTE(review): assumes the SQL files are UTF-8 - confirm.
    return sql_path.read_text(encoding='utf-8')

def read_external_knowledge(file_name):
    """Read the external knowledge file content for a given file_name.

    Args:
        file_name: Name of a document, resolved against
            ``base_path / 'resource' / 'documents'``.

    Returns:
        The document's text, or None when file_name is not a usable name
        (not a str/Path, or an empty string) or when no regular file exists
        at the resolved path.
    """
    # Missing values (e.g. None or a float NaN coming out of pandas) and the
    # empty string cannot name a document; treat them as "no file" instead of
    # failing during path construction or when opening the directory itself.
    if not isinstance(file_name, (str, Path)) or file_name == '':
        return None
    file_path = base_path / 'resource' / 'documents' / file_name
    if not file_path.is_file():
        return None
    # Decode explicitly so the result does not depend on the platform's locale
    # encoding. NOTE(review): assumes the documents are UTF-8 - confirm.
    return file_path.read_text(encoding='utf-8')

# Keep only the entries that have a SQL file. The explicit .copy() makes
# df_sql an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
df_sql = df_current[df_current["instance_id"].isin(entries_with_sql)].copy()

# Of those, select the entries that reference an external-knowledge document
df_external = df_sql[df_sql["external_knowledge"].notna()]

print(f"Entries with external knowledge: {len(df_external)}/{len(df_sql)}")

# Show the document of the first such entry. Skip this when there is none,
# because .iloc[0] raises IndexError on an empty frame.
if df_external.empty:
    print("\nNo entries with external knowledge to sample.")
else:
    print("\nSample entry with external knowledge:")
    print(read_external_knowledge(df_external.iloc[0]["external_knowledge"]))

In [None]:
# Attach each entry's SQL text as a new 'sql' column, then save the result.
# assign() returns a new DataFrame instead of writing into df_sql in place;
# df_sql may be a slice of df_current, and in-place column assignment on a
# slice raises pandas' SettingWithCopyWarning.
df_sql = df_sql.assign(sql=df_sql['instance_id'].apply(read_sql_file))
df_sql.to_json('df_sql.json', orient='records', indent=4)