In [1]:
import os
import sqlite3
import pandas as pd


def gather_schemas(root_dir: str) -> pd.DataFrame:
    """
    Collect the CREATE statement of every user table in every SQLite
    database found under ``root_dir``.

    The directory tree is walked recursively; any file whose name ends in
    ``.sqlite`` (case-insensitive) is opened and its ``sqlite_master``
    catalogue is read. SQLite's internal tables (names starting with the
    literal prefix ``sqlite_``) are skipped.

    Parameters
    ----------
    root_dir : str
        Directory to search. A missing directory yields an empty result,
        because ``os.walk`` ignores listing errors by default.

    Returns
    -------
    pd.DataFrame
        One row per table, with columns:
          - db_id : filename without the extension. Files with the same
                    name in different directories share a db_id.
          - table : table name
          - schema: CREATE TABLE ... statement ("" if none is stored)
        Rows are ordered by directory path, then filename, then the order
        in which sqlite_master returns the tables.

    Raises
    ------
    sqlite3.Error
        Propagated unchanged if a matching file cannot be opened or
        queried (for example, it is not a valid SQLite database).
    """
    # A single query returns both the name and the DDL, so no per-table
    # follow-up query is needed. In LIKE, `_` is a one-character wildcard;
    # it is escaped here so that only the literal "sqlite_" prefix is
    # excluded and user tables such as "sqlitefoo" are kept.
    tables_query = (
        "SELECT name, sql FROM sqlite_master "
        "WHERE type='table' AND name NOT LIKE 'sqlite\\_%' ESCAPE '\\';"
    )
    records = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting in place steers os.walk's descent; together with the
        # sorted filenames this makes the row order independent of the
        # filesystem's listing order.
        dirnames.sort()
        for fn in sorted(filenames):
            if not fn.lower().endswith(".sqlite"):
                continue

            path = os.path.join(dirpath, fn)
            db_id = os.path.splitext(fn)[0]

            conn = sqlite3.connect(path)
            try:
                for table, schema in conn.execute(tables_query):
                    records.append(
                        # `sql` can be NULL in sqlite_master; normalise to "".
                        {"db_id": db_id, "table": table, "schema": schema or ""}
                    )
            finally:
                # Always release the file handle, even if the query fails.
                conn.close()

    return pd.DataFrame(records, columns=["db_id", "table", "schema"])

In [2]:
# Directory that is searched recursively for .sqlite files.
root_dir = "spider_data/database"
# Destination CSV path. Set to None to print the DataFrame instead of
# writing a file.
output = "./schemas.csv"

df = gather_schemas(root_dir)

if output:
    # index=False keeps the positional index out of the CSV.
    df.to_csv(output, index=False)
    print(f"Wrote {len(df)} records to {output}")
else:
    print(df)

Wrote 873 records to ./schemas.csv
