# Dev Testing

In [0]:
# Three-part name of the table to inspect
table_name = "dq_dev.lmg_sandbox.config_driven_table_example"

# SQL template; the {table_name} placeholder is filled in right before execution
describe_extended_query = """
DESCRIBE EXTENDED
  {table_name}
"""

# Render the template, run it through Spark, and print the result
# (show is called with n=100 and truncate=False)
spark.sql(
    describe_extended_query.format_map({"table_name": table_name})
).show(n=100, truncate=False)

In [0]:
def describe_table_show(spark, fq_table: str):
    """Run ``DESCRIBE TABLE EXTENDED`` for a table, print the result, return it.

    Args:
        spark: Object exposing ``sql(query)``; the query result must provide
            ``printSchema()`` and ``show(...)``.
        fq_table: Table name interpolated verbatim into the SQL statement.

    Returns:
        The object returned by ``spark.sql`` for the DESCRIBE statement.
    """
    query = f"DESCRIBE TABLE EXTENDED {fq_table}"
    described = spark.sql(query)

    # First the schema of the DESCRIBE result itself ...
    print("=== Raw DataFrame Schema ===")
    described.printSchema()

    # ... then its rows, requested with n=100 and truncation disabled
    print("=== Raw DataFrame ===")
    described.show(n=100, truncate=False)

    return described

def dataframe_to_rowdicts(df):
    """Collect ``df`` and return its rows as a list of dicts, printing each one.

    Args:
        df: Object exposing ``collect()``; every collected element must
            provide ``asDict()``.

    Returns:
        A list with one ``asDict()`` result per collected row, in order.
    """
    rows = []
    for record in df.collect():
        rows.append(record.asDict())

    # Echo the converted rows so they can be inspected in the cell output
    print("=== Collected Rows ===")
    for row_dict in rows:
        print(row_dict)

    return rows

# Describe the example table; the helper prints the schema and rows as it runs
df = describe_table_show(
    spark,
    fq_table="dq_dev.lmg_sandbox.config_driven_table_example",
)

# Collect the result into a list of plain Python dicts (printed by the helper too)
rows = dataframe_to_rowdicts(df=df)

In [0]:
from typing import List, Dict, Any

def _primary_key_columns(definition: str) -> List[str]:
    """Return the column names listed in a ``PRIMARY KEY (...)`` definition."""
    if "`" in definition:
        # Splitting on "`" leaves the quoted identifiers at the odd positions,
        # so every column of a composite key is picked up, not just the first.
        return [name for name in definition.split("`")[1::2] if name]
    # Unquoted form: read the comma-separated names inside the first parentheses
    _, opened, rest = definition.partition("(")
    if not opened:
        return []
    inner = rest.partition(")")[0]
    return [name.strip() for name in inner.split(",") if name.strip()]


def parse_describe_table(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Convert the output of DESCRIBE TABLE EXTENDED (rows as dicts)
    into a nested dict matching YAML's structure.

    The rows are read as a sequence of sections. Everything before the first
    ``#``-prefixed heading is treated as a column definition; the headings
    ``# Partition Information``, ``# Detailed Table Information`` and
    ``# Constraints`` switch to the matching section, and any other heading
    starts a section that is ignored. A ``# col_name`` row is a repeated
    column header and never changes the current section.

    Args:
        rows: Dicts with the keys ``col_name``, ``data_type`` and ``comment``.
            Missing or ``None`` values are treated as empty strings.

    Returns:
        A dict with two keys:

        * ``table_level_values``: ``owner``, ``comment``, ``partitioned_by``,
          ``table_properties`` and ``primary_key``, plus ``catalog``,
          ``schema`` and ``table`` when the corresponding detail rows exist.
        * ``column_level_values``: one dict per column with ``name``,
          ``datatype``, ``comment`` and placeholder fields.
    """
    section_modes = {
        "# Partition Information": "partition",
        "# Detailed Table Information": "details",
        "# Constraints": "constraints",
    }
    # Detail rows that map one-to-one onto a table-level key
    detail_keys = {"Catalog": "catalog", "Database": "schema", "Table": "table"}

    table_level: Dict[str, Any] = {}
    columns: List[Dict[str, Any]] = []
    partitioned_by: List[str] = []
    primary_key: List[str] = []
    table_properties: Dict[str, str] = {}
    owner = None
    comment = None

    # State tracking: rows before the first heading describe columns
    mode = "columns"

    for row in rows:
        col_name = (row.get("col_name") or "").strip()
        data_type = (row.get("data_type") or "").strip()
        comm = (row.get("comment") or "").strip()

        if col_name == "# col_name":
            # Header row repeated under a section heading (for example right
            # after "# Partition Information"). It must not reset the mode,
            # otherwise the rows that follow it would be dropped.
            continue
        if col_name.startswith("#"):
            # Unknown headings start a section whose rows are skipped
            mode = section_modes.get(col_name, "skip")
            continue
        if not col_name:
            # Blank separator rows carry no information in any section
            continue

        if mode == "columns":
            columns.append({
                "name": col_name,
                "datatype": data_type,
                # A literal "NULL" comment is treated the same as no comment
                "comment": comm if comm and comm.upper() != "NULL" else "",
                # Placeholders for additional fields
                "nullable": None,
                "tags": {},
                "column_masking_rule": None,
                "default_value": None,
                "variable_value": None,
                "allowed_values": [],
                "column_check_constraints": {},
                "active": True,
            })
        elif mode == "partition":
            partitioned_by.append(col_name)
        elif mode == "details":
            if col_name in detail_keys:
                table_level[detail_keys[col_name]] = data_type
            elif col_name == "Owner":
                owner = data_type
            elif col_name == "Comment":
                comment = data_type
            elif col_name == "Table Properties":
                # Parse "[k1=v1,k2=v2]" into a dict.
                # NOTE: a property value that itself contains a comma is cut
                # at that comma, because the string is split on "," first.
                for prop in data_type.strip("[]").split(","):
                    if "=" in prop:
                        k, v = prop.split("=", 1)
                        table_properties[k.strip()] = v.strip()
            # Add more detail parsing as needed
        elif mode == "constraints" and data_type:
            # col_name holds the constraint name, data_type its definition;
            # only PRIMARY KEY definitions are used.
            if data_type.startswith("PRIMARY KEY"):
                primary_key.extend(_primary_key_columns(data_type))

    # Compose snapshot
    table_level["owner"] = owner
    table_level["comment"] = comment
    table_level["partitioned_by"] = partitioned_by
    table_level["table_properties"] = table_properties
    table_level["primary_key"] = primary_key

    # Final structure
    return {
        "table_level_values": table_level,
        "column_level_values": columns,
    }

# ---- Example usage ----
from pprint import pprint

# Build the nested snapshot from the collected DESCRIBE rows and pretty-print it
snapshot = parse_describe_table(rows=rows)
pprint(snapshot)