In [1]:
# Inputs: the folder that is walked for .sqlite files, and the JSON file
# holding the questions/queries that are parsed later in the notebook.
database_folder = "spider_data/database"
query_file_path = "spider_data/dev.json"

# Outputs: the per-table schema dump and the final per-question CSV.
schema_output_path = "prepare_data/dev_schemas.csv"
final_result_path = "prepare_data/dev_input.csv"

In [2]:
import os
import sys

# Put the "M-Schema" folder under the current working directory on the
# import path; the schema_engine import below relies on it.
# NOTE(review): presumably the notebook must be started from the folder that
# contains the M-Schema checkout - confirm.
sys.path.append(os.getcwd() + "/M-Schema")

from schema_engine import SchemaEngine

### Load full database schemas and tables

In [3]:
import os
import sqlite3
import pandas as pd
from sqlalchemy import create_engine


def _database_mschema(path: str, db_id: str) -> str:
    """Return the M-Schema text for the SQLite file at path, releasing the engine afterwards."""
    engine = create_engine(f"sqlite:///{path}")
    try:
        return SchemaEngine(engine=engine, db_name=db_id).mschema.to_mschema()
    finally:
        # Each database gets its own engine; dispose it so pooled
        # connections / file handles do not accumulate across the walk.
        engine.dispose()


def gather_schemas(root_dir: str) -> pd.DataFrame:
    """
    Walk root_dir for .sqlite files and describe every user table found.

    Returns a DataFrame with one row per table and columns:
      - db_id        : filename without .sqlite
      - table        : table name
      - schema       : CREATE TABLE ... statement (None when sqlite_master has none)
      - example_rows : "Table: <name>" followed by up to 3 rows, one per line
      - mschema      : M-Schema text built for the whole database file
                       (identical for every table of the same db_id)
    """
    records = []

    for dirpath, _, filenames in os.walk(root_dir):
        for fn in filenames:
            if not fn.lower().endswith(".sqlite"):
                continue

            path = os.path.join(dirpath, fn)
            db_id = os.path.splitext(fn)[0]

            conn = sqlite3.connect(path)
            try:
                cursor = conn.cursor()
                # Fetch table names and their CREATE statements in one pass.
                cursor.execute(
                    "SELECT name, sql FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';"
                )
                tables = cursor.fetchall()
                if not tables:
                    # Nothing to record; skip the schema reflection as well.
                    continue

                # The M-Schema describes the whole database, so build it once
                # per file instead of once per table.
                mschema_str = _database_mschema(path, db_id)

                for table, create_sql in tables:
                    # Identifiers cannot be bound as parameters; quote the name
                    # so tables with spaces or reserved words still work.
                    quoted_table = '"' + table.replace('"', '""') + '"'
                    cursor.execute(f"SELECT * FROM {quoted_table} LIMIT 3;")
                    ex_rows = cursor.fetchall()
                    ex_rows_str = f"Table: {table}\n" + "\n".join(
                        str(ex_row) for ex_row in ex_rows
                    )
                    records.append(
                        {
                            "db_id": db_id,
                            "table": table,
                            "schema": create_sql or None,
                            "example_rows": ex_rows_str,
                            "mschema": mschema_str,
                        }
                    )
            finally:
                conn.close()

    return pd.DataFrame(records)

In [4]:
# Build one row per table for every .sqlite file found under database_folder,
# then display the resulting frame as the cell output.
schema_df = gather_schemas(database_folder)
schema_df

  self._metadata.reflect(
  return self._inspector.get_foreign_keys(table_name, self._tables_schemas[table_name])
  table = Table(table_name, self.metadata_obj, autoload_with=self._engine, schema=self._tables_schemas[table_name])
  self._metadata.reflect(
  return self._inspector.get_pk_constraint(table_name, self._tables_schemas[table_name] )['constrained_columns']
  table = Table(table_name, self.metadata_obj, autoload_with=self._engine, schema=self._tables_schemas[table_name])
  self._metadata.reflect(
  return self._inspector.get_foreign_keys(table_name, self._tables_schemas[table_name])
  table = Table(table_name, self.metadata_obj, autoload_with=self._engine, schema=self._tables_schemas[table_name])
  self._metadata.reflect(
  table = Table(table_name, self.metadata_obj, autoload_with=self._engine, schema=self._tables_schemas[table_name])
  return self._inspector.get_foreign_keys(table_name, self._tables_schemas[table_name])
  self._metadata.reflect(
  return self._inspector.get_

Unnamed: 0,db_id,table,schema,example_rows,mschema
0,browser_web,Web_client_accelerator,"CREATE TABLE ""Web_client_accelerator"" (\n""id"" ...","Table: Web_client_accelerator\n(1, 'CACHEbox',...",【DB_ID】 browser_web\n【Schema】\n# Table: main.W...
1,browser_web,browser,"CREATE TABLE ""browser"" (\n""id"" int,\n""name"" te...","Table: browser\n(1, 'Internet Explorer', 28.96...",【DB_ID】 browser_web\n【Schema】\n# Table: main.W...
2,browser_web,accelerator_compatible_browser,"CREATE TABLE ""accelerator_compatible_browser"" ...","Table: accelerator_compatible_browser\n(1, 1, ...",【DB_ID】 browser_web\n【Schema】\n# Table: main.W...
3,musical,musical,"CREATE TABLE ""musical"" (\n""Musical_ID"" int,\n""...","Table: musical\n(1, 'The Phantom of the Opera'...",【DB_ID】 musical\n【Schema】\n# Table: main.actor...
4,musical,actor,"CREATE TABLE ""actor"" (\n""Actor_ID"" int,\n""Name...","Table: actor\n(1, 'Ray Meagher', 1, 'Alf Stewa...",【DB_ID】 musical\n【Schema】\n# Table: main.actor...
...,...,...,...,...,...
868,body_builder,people,"CREATE TABLE ""people"" (\n""People_ID"" int,\n""Na...","Table: people\n(1, 'Jack Campbell', 182.0, 80....",【DB_ID】 body_builder\n【Schema】\n# Table: main....
869,school_player,school,"CREATE TABLE ""school"" (\n""School_ID"" int,\n""Sc...","Table: school\n(1, ""St Aloysius' College"", 'Mi...",【DB_ID】 school_player\n【Schema】\n# Table: main...
870,school_player,school_details,"CREATE TABLE ""school_details"" (\n""School_ID"" i...","Table: school_details\n(1, 'Tigers', 'Blue and...",【DB_ID】 school_player\n【Schema】\n# Table: main...
871,school_player,school_performance,"CREATE TABLE ""school_performance"" (\n""School_I...","Table: school_performance\n(1, '1987-88', 'Yan...",【DB_ID】 school_player\n【Schema】\n# Table: main...


In [5]:
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (matters on a fresh checkout).
schema_output_dir = os.path.dirname(schema_output_path)
if schema_output_dir:
    os.makedirs(schema_output_dir, exist_ok=True)
schema_df.to_csv(schema_output_path)

In [6]:
# Index the per-table records as {db_id: {lower-cased table name: value}}
# so later lookups can ignore the casing used in a query.
schemas_map, example_rows_map, mschemas_map = {}, {}, {}
for db_name, table_name, create_sql, sample_rows, mschema_text in zip(
    schema_df["db_id"],
    schema_df["table"],
    schema_df["schema"],
    schema_df["example_rows"],
    schema_df["mschema"],
):
    table_key = table_name.lower()
    schemas_map.setdefault(db_name, {})[table_key] = create_sql
    example_rows_map.setdefault(db_name, {})[table_key] = sample_rows
    mschemas_map.setdefault(db_name, {})[table_key] = mschema_text

### Parse queries

In [7]:
import re
from typing import List, Set

import sqlglot
from sqlglot import exp


def extract_tables(sql: str) -> List[str]:
    """
    Parse the given SQL and return a sorted list of all real table names
    referenced (joins, subqueries, CTE bodies, etc.), excluding CTE aliases.

    CTE names are read from the parsed tree rather than guessed with a regex,
    so column aliases inside a CTE body ("SELECT x AS total") are not mistaken
    for CTE names, and CTEs are recognised wherever the parser finds them.
    CTE names are compared case-insensitively.

    Raises:
        ValueError: if sqlglot cannot parse the statement.
    """
    try:
        tree = sqlglot.parse_one(sql)
    except sqlglot.errors.ParseError as e:
        # Keep the original parse error attached for debugging.
        raise ValueError(f"Failed to parse SQL: {e}") from e

    # Names introduced by WITH clauses anywhere in the statement.
    cte_names: Set[str] = {
        cte.alias.lower() for cte in tree.find_all(exp.CTE) if cte.alias
    }

    # Every Table node that is not a reference to one of those CTEs.
    # Nodes without a plain name are skipped so no empty entry is returned.
    tables: Set[str] = set()
    for tbl in tree.find_all(exp.Table):
        name = tbl.name
        if name and name.lower() not in cte_names:
            tables.add(name)

    return sorted(tables)


def lookup_schemas(row):
    """
    Join the CREATE statements of the tables listed in row.tables,
    looked up under row.db_id with lower-cased table names.

    Tables without an entry contribute a "<no schema for ...>" placeholder.
    """
    per_table = schemas_map.get(row.db_id, {})
    pieces = []
    for table_name in row.tables:
        placeholder = f"<no schema for {table_name}>"
        pieces.append(per_table.get(table_name.lower(), placeholder))
    return "\n".join(pieces)


def lookup_example_rows(row):
    """
    Join the stored example-row text of the tables listed in row.tables,
    looked up under row.db_id with lower-cased table names.

    Tables without an entry contribute a "<no example rows for ...>" placeholder.
    """
    samples_by_table = example_rows_map.get(row.db_id, {})

    def _rows_for(table_name):
        return samples_by_table.get(
            table_name.lower(), f"<no example rows for {table_name}>"
        )

    return "\n".join(map(_rows_for, row.tables))


def lookup_mschemas(row):
    """
    Return the M-Schema text for the tables listed in row.tables,
    looked up under row.db_id with lower-cased table names.

    The stored M-Schema text describes a whole database, so every table of
    the same db_id maps to the same string. Identical texts are emitted only
    once (first occurrence wins) instead of being repeated per table.
    Tables without an entry contribute a "<no mschema for ...>" placeholder.
    """
    db_map = mschemas_map.get(row.db_id, {})
    texts = [
        db_map.get(tbl.lower(), f"<no mschema for {tbl}>") for tbl in row.tables
    ]
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return "\n".join(dict.fromkeys(texts))

In [8]:
import pandas as pd

# Load the questions, attach the tables each query touches, then attach the
# schema / example-row / M-Schema text for those tables.
queries_raw = pd.read_json(query_file_path)

queries_with_tables = queries_raw.assign(
    question_number=queries_raw.index,
    tables=queries_raw["query"].apply(extract_tables),
)

output_columns = [
    "question_number",
    "question",
    "db_id",
    "tables",
    "schemas",
    "example_rows",
    "mschemas",
]

df = queries_with_tables.assign(
    schemas=lambda frame: frame.apply(lookup_schemas, axis=1),
    example_rows=lambda frame: frame.apply(lookup_example_rows, axis=1),
    mschemas=lambda frame: frame.apply(lookup_mschemas, axis=1),
).loc[:, output_columns]
df

Unnamed: 0,question_number,question,db_id,tables,schemas,example_rows,mschemas
0,0,How many singers do we have?,concert_singer,[singer],"CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""Na...","Table: singer\n(1, 'Joe Sharp', 'Netherlands',...",【DB_ID】 concert_singer\n【Schema】\n# Table: mai...
1,1,What is the total number of singers?,concert_singer,[singer],"CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""Na...","Table: singer\n(1, 'Joe Sharp', 'Netherlands',...",【DB_ID】 concert_singer\n【Schema】\n# Table: mai...
2,2,"Show name, country, age for all singers ordere...",concert_singer,[singer],"CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""Na...","Table: singer\n(1, 'Joe Sharp', 'Netherlands',...",【DB_ID】 concert_singer\n【Schema】\n# Table: mai...
3,3,"What are the names, countries, and ages for ev...",concert_singer,[singer],"CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""Na...","Table: singer\n(1, 'Joe Sharp', 'Netherlands',...",【DB_ID】 concert_singer\n【Schema】\n# Table: mai...
4,4,"What is the average, minimum, and maximum age ...",concert_singer,[singer],"CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""Na...","Table: singer\n(1, 'Joe Sharp', 'Netherlands',...",【DB_ID】 concert_singer\n【Schema】\n# Table: mai...
...,...,...,...,...,...,...,...
1029,1029,What are the citizenships that are shared by s...,singer,[singer],"CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""Na...","Table: singer\n(1, 'Liliane Bettencourt', 1944...",【DB_ID】 singer\n【Schema】\n# Table: main.singer...
1030,1030,How many available features are there in total?,real_estate_properties,[Other_Available_Features],CREATE TABLE `Other_Available_Features` (\n`fe...,"Table: Other_Available_Features\n(2, 'Amenity'...",【DB_ID】 real_estate_properties\n【Schema】\n# Ta...
1031,1031,What is the feature type name of feature AirCon?,real_estate_properties,"[Other_Available_Features, Ref_Feature_Types]",CREATE TABLE `Other_Available_Features` (\n`fe...,"Table: Other_Available_Features\n(2, 'Amenity'...",【DB_ID】 real_estate_properties\n【Schema】\n# Ta...
1032,1032,Show the property type descriptions of propert...,real_estate_properties,"[Properties, Ref_Property_Types]",CREATE TABLE `Properties` (\n`property_id` INT...,"Table: Properties\n(1, 'House', '1991-06-21 23...",【DB_ID】 real_estate_properties\n【Schema】\n# Ta...


In [9]:
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (matters on a fresh checkout).
final_result_dir = os.path.dirname(final_result_path)
if final_result_dir:
    os.makedirs(final_result_dir, exist_ok=True)
df.to_csv(final_result_path, index=False)