# Dev Testing

In [0]:
# Fully qualified name of the table this cell inspects.
table_name = "dq_dev.lmg_sandbox.config_driven_table_example"

# SQL template; the {table_name} placeholder is filled in with str.format below.
describe_extended_query = """
DESCRIBE EXTENDED
  {table_name}
"""

# Run the statement and print up to 100 rows without truncating cell values.
# NOTE(review): `spark` is not defined in this cell -- presumably the session
# object injected by the notebook runtime; confirm.
spark.sql(describe_extended_query.format(table_name=table_name)).show(truncate=False, n=100)

In [0]:
def describe_table_show(spark, fq_table: str):
    """
    Run DESCRIBE TABLE EXTENDED for ``fq_table`` and print what comes back.

    Prints a banner followed by the result's schema, then a second banner
    followed by up to 100 untruncated rows. The result object returned by
    ``spark.sql`` is handed back so callers can keep working with it.
    """
    described = spark.sql(f"DESCRIBE TABLE EXTENDED {fq_table}")

    # Schema first ...
    print("=== Raw DataFrame Schema ===")
    described.printSchema()

    # ... then the rows themselves.
    print("=== Raw DataFrame ===")
    described.show(truncate=False, n=100)

    return described

def dataframe_to_rowdicts(df):
    """
    Collect ``df`` and convert every row to a plain dict via ``asDict()``.

    Each resulting dict is printed on its own line after a banner, and the
    full list is returned.
    """
    collected = []
    for record in df.collect():
        collected.append(record.asDict())

    print("=== Collected Rows ===")
    for item in collected:
        print(item)
    return collected

# Run DESCRIBE TABLE EXTENDED on the example table; the helper prints the
# schema and rows and returns the DataFrame.
df = describe_table_show(spark, "dq_dev.lmg_sandbox.config_driven_table_example")

# Collect the DataFrame into a list of dicts (one per row); the helper also
# prints each dict.
rows = dataframe_to_rowdicts(df)

In [0]:
from typing import List, Dict, Any

def parse_describe_table(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Convert the output of DESCRIBE TABLE EXTENDED (rows as dicts)
    into a nested dict matching YAML's structure.

    The rows are read top to bottom. Rows before the first ``# ...`` header
    are column definitions; afterwards the current ``# ...`` header decides
    how a row is interpreted (partition columns, table details, constraints,
    or ignored for any other section).

    Args:
        rows: One dict per DESCRIBE row with ``col_name``, ``data_type`` and
            ``comment`` keys. Missing or ``None`` values are treated as "".

    Returns:
        ``{"table_level_values": {...}, "column_level_values": [...]}``.
        ``table_level_values`` always carries ``owner``, ``comment``,
        ``partitioned_by``, ``table_properties`` and ``primary_key``;
        ``catalog``/``schema``/``table`` are present only when the matching
        detail rows were seen.
    """
    section_modes = {
        "# Partition Information": "partition",
        "# Detailed Table Information": "details",
        "# Constraints": "constraints",
    }
    detail_fields = {"Catalog": "catalog", "Database": "schema", "Table": "table"}

    table_level = {}
    columns = []
    partitioned_by = []
    constraints = []
    table_properties = {}
    owner = None
    comment = None

    # State tracking: everything before the first header is a column row.
    mode = "columns"

    for row in rows:
        col_name = (row.get("col_name") or "").strip()
        data_type = (row.get("data_type") or "").strip()
        comm = (row.get("comment") or "").strip()

        # "# col_name" is the sub-header printed inside the partition section.
        # It must not be treated as a new section, otherwise the partition
        # columns that follow it would be skipped.
        if col_name == "# col_name":
            continue

        # Section transitions; unknown sections are ignored until the next header.
        if col_name.startswith("#"):
            mode = section_modes.get(col_name, "skip")
            continue

        if mode == "columns":
            if col_name:
                columns.append({
                    "name": col_name,
                    "datatype": data_type,
                    # A literal "NULL" comment means "no comment".
                    "comment": comm if comm and comm.upper() != "NULL" else "",
                    # Placeholders for additional fields
                    "nullable": None,
                    "tags": {},
                    "column_masking_rule": None,
                    "default_value": None,
                    "variable_value": None,
                    "allowed_values": [],
                    "column_check_constraints": {},
                    "active": True,
                })
        elif mode == "partition":
            if col_name:
                partitioned_by.append(col_name)
        elif mode == "details":
            if col_name in detail_fields:
                table_level[detail_fields[col_name]] = data_type
            elif col_name == "Owner":
                owner = data_type
            elif col_name == "Comment":
                comment = data_type
            elif col_name == "Table Properties":
                # Parse "[k1=v1,k2=v2]" into a dict; only the first "=" splits,
                # so values may themselves contain "=".
                for prop in data_type.strip("[]").split(","):
                    key, sep, value = prop.partition("=")
                    if sep:
                        table_properties[key.strip()] = value.strip()
        elif mode == "constraints":
            if col_name and data_type:
                constraints.append((col_name, data_type))

    # Compose snapshot
    table_level["owner"] = owner
    table_level["comment"] = comment
    table_level["partitioned_by"] = partitioned_by
    table_level["table_properties"] = table_properties

    # Collect every column of every PRIMARY KEY constraint.
    pk = []
    for _constraint_name, definition in constraints:
        if definition.startswith("PRIMARY KEY"):
            pk.extend(_primary_key_columns(definition))
    table_level["primary_key"] = pk

    # Final structure
    return {
        "table_level_values": table_level,
        "column_level_values": columns,
    }


def _primary_key_columns(definition: str) -> List[str]:
    """Return all column names listed in a ``PRIMARY KEY (...)`` definition."""
    if "`" in definition:
        # Back-quoted identifiers are the odd-numbered pieces of a split on "`".
        return [name for name in definition.split("`")[1::2] if name]
    start = definition.find("(")
    end = definition.rfind(")")
    if start == -1 or end < start:
        return []
    return [part.strip() for part in definition[start + 1:end].split(",") if part.strip()]

# ---- Example usage ----
# Parse the rows collected by the previous cell and pretty-print the result.
snapshot = parse_describe_table(rows)
from pprint import pprint
pprint(snapshot)

In [0]:
from typing import List, Dict
from pyspark.sql import SparkSession

def parse_fully_qualified_table(fq_table: str):
    """
    Break a dotted ``catalog.schema.table`` name into its three parts.

    Returns a ``(catalog, schema, table)`` tuple. Raises ``ValueError`` when
    splitting on "." does not yield exactly three pieces.
    """
    pieces = fq_table.split(".")
    if len(pieces) == 3:
        catalog, schema, table = pieces
        return catalog, schema, table
    raise ValueError(f"Expected catalog.schema.table, got: {fq_table}")

def spark_sql_to_rows(spark: SparkSession, sql: str) -> List[dict]:
    """Execute ``sql`` through ``spark.sql`` and return every collected row as a dict."""
    records = []
    for record in spark.sql(sql).collect():
        records.append(record.asDict())
    return records

def get_table_tags(spark: SparkSession, fq_table: str) -> Dict[str, str]:
    """
    Look up the tags attached to a table.

    Queries ``system.information_schema.table_tags`` for the catalog, schema
    and table named by ``fq_table`` and returns ``{tag_name: tag_value}``.
    If a tag name occurs more than once, the last row read wins.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT tag_name, tag_value
        FROM system.information_schema.table_tags
        WHERE catalog_name = '{catalog}'
          AND schema_name = '{schema}'
          AND table_name = '{table}'
    """
    tags = {}
    for record in spark_sql_to_rows(spark, query):
        tags[record['tag_name']] = record['tag_value']
    return tags

def get_column_tags(spark: SparkSession, fq_table: str) -> Dict[str, Dict[str, str]]:
    """
    Look up the tags attached to each column of a table.

    Queries ``system.information_schema.column_tags`` for the table named by
    ``fq_table`` and groups the result by column:
        {column_name: {tag_name: tag_value, ...}, ...}
    Columns without any tag row do not appear in the result.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT column_name, tag_name, tag_value
        FROM system.information_schema.column_tags
        WHERE catalog_name = '{catalog}'
          AND schema_name = '{schema}'
          AND table_name = '{table}'
    """
    tags_by_column = {}
    for record in spark_sql_to_rows(spark, query):
        column = record['column_name']
        tag_name = record['tag_name']
        tag_value = record['tag_value']
        # setdefault creates the per-column dict the first time a column is seen.
        tags_by_column.setdefault(column, {})[tag_name] = tag_value
    return tags_by_column

def get_row_filters(spark: SparkSession, fq_table: str) -> List[dict]:
    """
    Fetch the row filters registered for a table.

    Returns one dict per row of ``system.information_schema.row_filters``
    that matches ``fq_table``, with ``filter_name`` and ``target_columns`` keys.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT filter_name, target_columns
        FROM system.information_schema.row_filters
        WHERE table_catalog = '{catalog}'
          AND table_schema = '{schema}'
          AND table_name = '{table}'
    """
    filters = spark_sql_to_rows(spark, query)
    return filters

def get_constraint_table_usage(spark: SparkSession, fq_table: str) -> List[dict]:
    """
    Fetch the names of the constraints recorded against a table.

    Returns one ``{"constraint_name": ...}`` dict per matching row of
    ``system.information_schema.constraint_table_usage``.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT constraint_name
        FROM system.information_schema.constraint_table_usage
        WHERE table_catalog = '{catalog}'
          AND table_schema = '{schema}'
          AND table_name = '{table}'
    """
    usage_rows = spark_sql_to_rows(spark, query)
    return usage_rows

def get_constraint_column_usage(spark: SparkSession, fq_table: str) -> List[dict]:
    """
    Fetch the (column, constraint) pairs recorded against a table.

    Returns one dict per matching row of
    ``system.information_schema.constraint_column_usage`` with
    ``column_name`` and ``constraint_name`` keys.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT column_name, constraint_name
        FROM system.information_schema.constraint_column_usage
        WHERE table_catalog = '{catalog}'
          AND table_schema = '{schema}'
          AND table_name = '{table}'
    """
    usage_rows = spark_sql_to_rows(spark, query)
    return usage_rows
  
# Table whose metadata is printed by the calls below.
fq = "dq_dev.lmg_sandbox.config_driven_table_example"

# 1. Table tags ({tag_name: tag_value})
print("=== Table Tags ===")
print(get_table_tags(spark, fq))

# 2. Column tags ({column_name: {tag_name: tag_value}})
print("=== Column Tags ===")
print(get_column_tags(spark, fq))

# 3. Row filters (list of dicts)
print("=== Row Filters ===")
print(get_row_filters(spark, fq))

# 4. Table constraints (list of dicts with constraint_name)
print("=== Table Constraints ===")
print(get_constraint_table_usage(spark, fq))

# 5. Column constraints (list of dicts with column_name / constraint_name)
print("=== Column Constraints ===")
print(get_constraint_column_usage(spark, fq))

In [0]:
from typing import List, Dict, Any, Tuple
import re
from pyspark.sql import SparkSession

# --- Spark SQL helpers ---
def spark_sql_to_df(spark: SparkSession, sql: str):
    """
    Execute ``sql`` with ``spark.sql`` and hand back the resulting DataFrame.

    Any exception is reported on stdout together with the offending SQL and
    then re-raised unchanged.
    """
    try:
        frame = spark.sql(sql)
    except Exception as e:
        print(f"[ERROR] spark_sql_to_df failed: {e}\nSQL: {sql}")
        raise
    return frame

def spark_sql_to_rows(spark: SparkSession, sql: str):
    """
    Execute ``sql`` and return the collected rows as a list of dicts.

    Failures while running the query or collecting rows are printed with the
    SQL text and then re-raised unchanged.
    """
    try:
        records = []
        for record in spark.sql(sql).collect():
            records.append(record.asDict())
    except Exception as e:
        print(f"[ERROR] spark_sql_to_rows failed: {e}\nSQL: {sql}")
        raise
    return records

# --- Table introspection ---
def describe_table_to_rows(spark: SparkSession, fq_table: str) -> List[Dict[str, Any]]:
    """Run ``DESCRIBE TABLE EXTENDED`` on ``fq_table`` and return its rows as dicts."""
    return spark_sql_to_rows(spark, f"DESCRIBE TABLE EXTENDED {fq_table}")

def extract_columns(spark: SparkSession, fq_table: str) -> List[Dict[str, Any]]:
    """
    Return the column definitions of ``fq_table`` from DESCRIBE TABLE EXTENDED.

    Only the rows before the first ``# ...`` section header are column
    definitions. Reading stops at that header so that partition rows and
    detail rows (Catalog, Owner, ...) are not reported as columns.

    Args:
        spark: Session used to run the DESCRIBE statement.
        fq_table: Table name passed verbatim to DESCRIBE TABLE EXTENDED.

    Returns:
        One dict per column with ``name``, ``datatype`` and ``comment`` filled
        in from the DESCRIBE row; the remaining keys are placeholders.
    """
    rows = describe_table_to_rows(spark, fq_table)
    cols = []
    for row in rows:
        col_name = (row.get("col_name") or "").strip()
        if col_name.startswith("#"):
            # First section header: the column listing is over.
            break
        if not col_name:
            # Blank separator row.
            continue
        data_type = (row.get("data_type") or "").strip()
        comm = (row.get("comment") or "").strip()
        cols.append({
            "name": col_name,
            "datatype": data_type,
            # A literal "NULL" comment means "no comment".
            "comment": comm if comm and comm.upper() != "NULL" else "",
            "nullable": None,
            "tags": {},
            "column_masking_rule": None,
            "default_value": None,
            "variable_value": None,
            "allowed_values": [],
            "column_check_constraints": {},
            "active": True,
        })
    return cols

def extract_partitioned_by(spark: SparkSession, fq_table: str) -> List[str]:
    """
    Return the partition column names of ``fq_table``, in DESCRIBE order.

    Reads the rows that follow ``# Partition Information`` until a blank row
    or the next ``# ...`` section header. The ``# col_name`` sub-header that
    opens the section is skipped rather than treated as the end of it.

    Args:
        spark: Session used to run the DESCRIBE statement.
        fq_table: Table name passed verbatim to DESCRIBE TABLE EXTENDED.

    Returns:
        List of partition column names; empty when there is no partition section.
    """
    rows = describe_table_to_rows(spark, fq_table)
    collecting = False
    partition_cols = []
    for row in rows:
        col_name = (row.get("col_name") or "").strip()
        if not collecting:
            collecting = col_name == "# Partition Information"
            continue
        if col_name == "# col_name" or col_name == "# Partition Information":
            # Header rows inside the section carry no column.
            continue
        if not col_name or col_name.startswith("#"):
            break
        partition_cols.append(col_name)
    return partition_cols

def extract_details(spark: SparkSession, fq_table: str) -> Dict[str, Any]:
    """
    Read the "# Detailed Table Information" section of DESCRIBE TABLE EXTENDED.

    Returns a dict that always contains ``owner``, ``comment`` (both ``None``
    when not seen) and ``table_properties``; ``catalog``, ``schema`` and
    ``table`` are added only when the corresponding rows are present.
    Reading stops at the first blank row or ``# ...`` header after the section start.
    """
    name_fields = {"Catalog": "catalog", "Database": "schema", "Table": "table"}
    info = {}
    table_props = {}
    table_owner = None
    table_comment = None
    inside = False

    for entry in describe_table_to_rows(spark, fq_table):
        label = (entry.get("col_name") or "").strip()
        value = (entry.get("data_type") or "").strip()

        if label == "# Detailed Table Information":
            inside = True
            continue
        if not inside:
            continue
        if not label or label.startswith("#"):
            break

        if label in name_fields:
            info[name_fields[label]] = value
        elif label == "Owner":
            table_owner = value
        elif label == "Comment":
            table_comment = value
        elif label == "Table Properties":
            # "[k1=v1,k2=v2]" -> {"k1": "v1", "k2": "v2"}; split on the first "=" only.
            for chunk in value.strip("[]").split(","):
                key, sep, val = chunk.partition("=")
                if sep:
                    table_props[key.strip()] = val.strip()

    info["owner"] = table_owner
    info["comment"] = table_comment
    info["table_properties"] = table_props
    return info

def extract_constraints(spark: SparkSession, fq_table: str) -> List[Tuple[str, str]]:
    """
    Read the "# Constraints" section of DESCRIBE TABLE EXTENDED.

    Returns ``(col_name, data_type)`` tuples for every row of that section
    whose ``data_type`` is non-empty. The section ends at the first blank row
    or ``# ...`` header.
    """
    found = []
    active = False
    for entry in describe_table_to_rows(spark, fq_table):
        name = (entry.get("col_name") or "").strip()
        definition = (entry.get("data_type") or "").strip()
        if name == "# Constraints":
            active = True
        elif active:
            if name == "" or name.startswith("#"):
                break
            if definition:
                found.append((name, definition))
    return found

def extract_primary_key(spark: SparkSession, fq_table: str) -> List[str]:
    """
    Return the primary-key column names of ``fq_table``.

    For every constraint whose definition contains "PRIMARY KEY", the text
    inside the first pair of parentheses is split on commas and stripped of
    backticks. Returns an empty list when no such constraint is found.
    """
    key_columns = []
    for _name, definition in extract_constraints(spark, fq_table):
        if "PRIMARY KEY" not in definition:
            continue
        match = re.search(r"\((.*?)\)", definition)
        if not match:
            continue
        key_columns.extend(
            part.strip().replace("`", "") for part in match.group(1).split(",")
        )
    return key_columns

def parse_describe_table(spark: SparkSession, fq_table: str) -> Dict[str, Any]:
    """
    Assemble the table-level and column-level snapshot for ``fq_table``.

    Combines the results of extract_details, extract_columns,
    extract_partitioned_by and extract_primary_key (called in that order)
    into ``{"table_level_values": ..., "column_level_values": ...}``.
    """
    table_level = extract_details(spark, fq_table)
    column_level = extract_columns(spark, fq_table)
    table_level["partitioned_by"] = extract_partitioned_by(spark, fq_table)
    table_level["primary_key"] = extract_primary_key(spark, fq_table)
    return {
        "table_level_values": table_level,
        "column_level_values": column_level,
    }

# ---- TEST ALL FUNCTIONS ----
# Calls every extractor defined in this cell against one table and prints the
# raw return value under a banner.

fq_table = "dq_dev.lmg_sandbox.config_driven_table_example"   # <--- update as needed

# Column definitions (list of dicts)
print("==== COLUMNS ====")
print(extract_columns(spark, fq_table))

# Partition column names (list of str)
print("==== PARTITIONED BY ====")
print(extract_partitioned_by(spark, fq_table))

# Catalog/schema/table/owner/comment/table_properties (dict)
print("==== DETAILS ====")
print(extract_details(spark, fq_table))

# (name, definition) tuples from the "# Constraints" section
print("==== CONSTRAINTS ====")
print(extract_constraints(spark, fq_table))

# Primary-key column names (list of str)
print("==== PRIMARY KEY ====")
print(extract_primary_key(spark, fq_table))

# Everything combined into one nested dict
print("==== FULL SNAPSHOT ====")
from pprint import pprint
pprint(parse_describe_table(spark, fq_table))

In [0]:
def describe_table_to_rows(spark, fq_table: str):
    """Execute DESCRIBE TABLE EXTENDED for ``fq_table``; return each row as a dict."""
    described = spark.sql(f"DESCRIBE TABLE EXTENDED {fq_table}")
    records = []
    for record in described.collect():
        records.append(record.asDict())
    return records


from typing import List, Dict, Any, Optional
import re

def extract_columns(describe_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Extract just the column definitions from DESCRIBE output.

    Column rows come first in the output. Reading stops at the first
    ``# ...`` section header (partition, detail or any other section) so that
    rows such as "Catalog" or "Owner" are never reported as columns, even for
    tables without a partition section. Blank separator rows are skipped.

    Args:
        describe_rows: DESCRIBE rows as dicts with ``col_name``, ``data_type``
            and ``comment`` keys (missing/None values are treated as "").

    Returns:
        One ``{"name", "datatype", "comment"}`` dict per column. A missing
        comment, or the literal text "NULL", becomes "".
    """
    columns = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if col_name.startswith("#"):
            # First section header: the column listing is over.
            break
        if not col_name:
            continue
        data_type = (row.get("data_type") or "").strip()
        comment = (row.get("comment") or "").strip()
        columns.append({
            "name": col_name,
            "datatype": data_type,
            "comment": comment if comment and comment.upper() != "NULL" else "",
            # only these three fields are real here
        })
    return columns

def extract_partitioned_by(describe_rows: List[Dict[str, Any]]) -> List[str]:
    """
    Extract partition columns (if any).

    Reads the rows that follow ``# Partition Information`` until a blank row
    or the next ``# ...`` section header. The ``# col_name`` sub-header that
    opens the section is skipped instead of being mistaken for the end of it.

    Args:
        describe_rows: DESCRIBE rows as dicts with a ``col_name`` key.

    Returns:
        Partition column names in DESCRIBE order; empty when the output has
        no partition section.
    """
    collecting = False
    partition_cols = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if not collecting:
            collecting = col_name == "# Partition Information"
            continue
        if col_name == "# col_name" or col_name == "# Partition Information":
            # Header rows inside the section carry no column.
            continue
        if not col_name or col_name.startswith("#"):
            break
        partition_cols.append(col_name)
    return partition_cols

def extract_table_details(describe_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Pull catalog, schema, table, owner, comment and table properties out of
    the "# Detailed Table Information" section.

    Keys other than ``table_properties`` appear only when the matching row
    was seen. The section ends at the first blank row or ``# ...`` header.
    """
    field_for_label = {
        "Catalog": "catalog",
        "Database": "schema",
        "Table": "table",
        "Owner": "owner",
        "Comment": "comment",
    }
    info = {}
    props = {}
    inside = False
    for entry in describe_rows:
        label = (entry.get("col_name") or "").strip()
        value = (entry.get("data_type") or "").strip()
        if label == "# Detailed Table Information":
            inside = True
        elif inside:
            if label == "" or label.startswith("#"):
                break
            if label in field_for_label:
                info[field_for_label[label]] = value
            elif label == "Table Properties":
                # "[k1=v1,k2=v2]" -> dict; only the first "=" separates key and value.
                for chunk in value.strip("[]").split(","):
                    key, sep, val = chunk.partition("=")
                    if sep:
                        props[key.strip()] = val.strip()
    info["table_properties"] = props
    return info

def extract_constraints(describe_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Collect the rows of the "# Constraints" section as ``{"name", "type"}`` dicts.

    ``name`` is the row's ``col_name`` and ``type`` its ``data_type``; rows
    with an empty ``data_type`` are dropped. The section ends at the first
    blank row or ``# ...`` header.
    """
    found = []
    active = False
    for entry in describe_rows:
        name = (entry.get("col_name") or "").strip()
        definition = (entry.get("data_type") or "").strip()
        if name == "# Constraints":
            active = True
        elif active:
            if name == "" or name.startswith("#"):
                break
            if definition:
                found.append({"name": name, "type": definition})
    return found

def extract_primary_key(describe_rows: List[Dict[str, Any]]) -> Optional[List[str]]:
    """
    Return the columns of the first parsable PRIMARY KEY constraint, or ``None``.

    A constraint qualifies when its ``type`` contains "PRIMARY KEY" and has a
    parenthesised column list; that list is split on commas and stripped of
    backticks.
    """
    for constraint in extract_constraints(describe_rows):
        definition = constraint["type"]
        if "PRIMARY KEY" not in definition:
            continue
        match = re.search(r"\((.*?)\)", definition)
        if match is None:
            continue
        names = match.group(1).split(",")
        return [name.strip().replace("`", "") for name in names]
    return None


###########################################################################


from typing import List, Dict
from pyspark.sql import SparkSession

def parse_fully_qualified_table(fq_table: str):
    """
    Split ``catalog.schema.table`` on "." and return ``(catalog, schema, table)``.

    Raises ``ValueError`` unless the split yields exactly three pieces.
    """
    pieces = fq_table.split(".")
    if len(pieces) == 3:
        return tuple(pieces)
    raise ValueError(f"Expected catalog.schema.table, got: {fq_table}")

def spark_sql_to_rows(spark: SparkSession, sql: str) -> List[dict]:
    """Run ``sql`` via ``spark.sql``, collect the result, and return the rows as dicts."""
    collected = spark.sql(sql).collect()
    return [record.asDict() for record in collected]

def get_table_tags(spark: SparkSession, fq_table: str) -> Dict[str, str]:
    """
    Return the tags of ``fq_table`` as ``{tag_name: tag_value}``.

    Data comes from ``system.information_schema.table_tags``; when a tag name
    repeats, the last row read wins.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT tag_name, tag_value
        FROM system.information_schema.table_tags
        WHERE catalog_name = '{catalog}'
          AND schema_name = '{schema}'
          AND table_name = '{table}'
    """
    tags = {}
    for record in spark_sql_to_rows(spark, query):
        tags[record['tag_name']] = record['tag_value']
    return tags

def get_column_tags(spark: SparkSession, fq_table: str) -> Dict[str, Dict[str, str]]:
    """
    Return the tags of every tagged column of ``fq_table``:
        {column_name: {tag_name: tag_value, ...}, ...}

    Data comes from ``system.information_schema.column_tags``; columns with
    no tag rows are absent from the result.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT column_name, tag_name, tag_value
        FROM system.information_schema.column_tags
        WHERE catalog_name = '{catalog}'
          AND schema_name = '{schema}'
          AND table_name = '{table}'
    """
    grouped = {}
    for record in spark_sql_to_rows(spark, query):
        column = record['column_name']
        tag_name = record['tag_name']
        tag_value = record['tag_value']
        # Create the per-column dict on first sight of the column.
        grouped.setdefault(column, {})[tag_name] = tag_value
    return grouped

def get_row_filters(spark: SparkSession, fq_table: str) -> List[dict]:
    """
    Return the row filters of ``fq_table`` as dicts with ``filter_name`` and
    ``target_columns`` keys, read from ``system.information_schema.row_filters``.
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT filter_name, target_columns
        FROM system.information_schema.row_filters
        WHERE table_catalog = '{catalog}'
          AND table_schema = '{schema}'
          AND table_name = '{table}'
    """
    filters = spark_sql_to_rows(spark, query)
    return filters

def get_constraint_table_usage(spark: SparkSession, fq_table: str) -> List[dict]:
    """
    Return the constraint names recorded for ``fq_table`` in
    ``system.information_schema.constraint_table_usage`` (one dict per row).
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT constraint_name
        FROM system.information_schema.constraint_table_usage
        WHERE table_catalog = '{catalog}'
          AND table_schema = '{schema}'
          AND table_name = '{table}'
    """
    usage_rows = spark_sql_to_rows(spark, query)
    return usage_rows

def get_constraint_column_usage(spark: SparkSession, fq_table: str) -> List[dict]:
    """
    Return the (column_name, constraint_name) rows recorded for ``fq_table`` in
    ``system.information_schema.constraint_column_usage`` (one dict per row).
    """
    catalog, schema, table = parse_fully_qualified_table(fq_table)
    query = f"""
        SELECT column_name, constraint_name
        FROM system.information_schema.constraint_column_usage
        WHERE table_catalog = '{catalog}'
          AND table_schema = '{schema}'
          AND table_name = '{table}'
    """
    usage_rows = spark_sql_to_rows(spark, query)
    return usage_rows

In [0]:
# --- Run these in order to see exactly what you get ---
# DESCRIBE TABLE EXTENDED is executed once; every extractor below works on
# the same in-memory list of row dicts.
fq = "dq_dev.lmg_sandbox.config_driven_table_example"
rows = describe_table_to_rows(spark, fq)

print("\n--- Columns ---")
print(extract_columns(rows))

print("\n--- Partitioned By ---")
print(extract_partitioned_by(rows))

print("\n--- Table Details ---")
print(extract_table_details(rows))

print("\n--- Constraints ---")
print(extract_constraints(rows))

# Prints None when no PRIMARY KEY constraint is found.
print("\n--- Primary Key ---")
print(extract_primary_key(rows))


###################################################################
# The remaining calls query the system.information_schema views.


# 1. Table tags
print("=== Table Tags ===")
print(get_table_tags(spark, fq))

# 2. Column tags
print("=== Column Tags ===")
print(get_column_tags(spark, fq))

# 3. Row filters
print("=== Row Filters ===")
print(get_row_filters(spark, fq))

# 4. Table constraints
print("=== Table Constraints ===")
print(get_constraint_table_usage(spark, fq))

# 5. Column constraints
print("=== Column Constraints ===")
print(get_constraint_column_usage(spark, fq))

In [0]:
# --- Config: Slim and Correct ---
# One entry per metadata kind:
#   "table"      - information_schema view to query
#   "columns"    - columns to select
#   "where_keys" - (filter column, index into (catalog, schema, table)) pairs.
#                  The tag views name their filter columns catalog_name /
#                  schema_name, the other views table_catalog / table_schema,
#                  so the filter columns have to be configured per view.
SNAPSHOT_QUERIES = {
    "table_tags": {
        "table": "system.information_schema.table_tags",
        "columns": ["tag_name", "tag_value"],
        "where_keys": [("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
    },
    "column_tags": {
        "table": "system.information_schema.column_tags",
        "columns": ["column_name", "tag_name", "tag_value"],
        "where_keys": [("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
    },
    "row_filters": {
        "table": "system.information_schema.row_filters",
        "columns": ["filter_name", "target_columns"],
        "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    },
    "constraint_table_usage": {
        "table": "system.information_schema.constraint_table_usage",
        "columns": ["constraint_name"],
        "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    },
    "constraint_column_usage": {
        "table": "system.information_schema.constraint_column_usage",
        "columns": ["column_name", "constraint_name"],
        "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    },
}

def parse_fully_qualified_table_name(fq_table: str):
    """
    Splits 'catalog.schema.table' into catalog, schema, table.

    Raises:
        ValueError: if splitting on "." does not give exactly three parts.
    """
    parts = fq_table.split(".")
    if len(parts) != 3:
        raise ValueError("Expected format: catalog.schema.table")
    return parts[0], parts[1], parts[2]

def build_metadata_sql(kind: str, fq_table: str) -> str:
    """
    Builds SQL for the given metadata kind and table.

    Args:
        kind: Key into SNAPSHOT_QUERIES (KeyError if unknown).
        fq_table: Table as 'catalog.schema.table'.

    Returns:
        A SELECT over the configured view, filtered to the table using the
        view's own filter columns. Entries without "where_keys" fall back to
        table_catalog / table_schema / table_name.
    """
    name_parts = parse_fully_qualified_table_name(fq_table)
    config = SNAPSHOT_QUERIES[kind]
    columns = ", ".join(config["columns"])
    where_keys = config.get(
        "where_keys",
        [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    )
    predicates = "\n          AND ".join(
        f"{column} = '{name_parts[index]}'" for column, index in where_keys
    )
    return f"""
        SELECT {columns}
        FROM {config['table']}
        WHERE {predicates}
    """

# --- Table to inspect ---
table_name = "dq_dev.lmg_sandbox.config_driven_table_example"

# --- Run and print all queries ---
# For each configured kind: print a banner, print the generated SQL, then run
# it and print every returned row as a dict.
for kind in SNAPSHOT_QUERIES:
    print(f"\n--- {kind.upper()} ---")
    sql = build_metadata_sql(kind, table_name)
    print(f"SQL: {sql.strip()}")
    try:
        df = spark.sql(sql)
        rows = df.collect()
        for row in rows:
            print(row.asDict())
    except Exception as e:
        # Best effort: a failing query is reported and the loop moves on to
        # the next kind.
        print(f"[ERROR] {e}")

In [0]:
# Get describe extended output as a list of dicts (describe_rows)




# --- Your extract functions (copy these in above) ---

def extract_columns(describe_rows):
    """
    Return the column definitions found at the top of DESCRIBE output.

    Reading stops at the first ``# ...`` section header, so rows from the
    partition or detail sections (e.g. "Catalog", "Owner") are never reported
    as columns, including for tables without a partition section. Blank
    separator rows are skipped.

    Each result is ``{"name", "datatype", "comment"}``; a missing comment or
    the literal text "NULL" becomes "".
    """
    columns = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if col_name.startswith("#"):
            # First section header: the column listing is over.
            break
        if not col_name:
            continue
        data_type = (row.get("data_type") or "").strip()
        comment = (row.get("comment") or "").strip()
        columns.append({
            "name": col_name,
            "datatype": data_type,
            "comment": comment if comment and comment.upper() != "NULL" else "",
        })
    return columns

def extract_partitioned_by(describe_rows):
    """
    Return the partition column names listed in DESCRIBE output.

    Reads the rows after ``# Partition Information`` until a blank row or the
    next ``# ...`` section header. The ``# col_name`` sub-header that opens
    the section is skipped instead of being mistaken for the end of it.
    Returns an empty list when there is no partition section.
    """
    collecting = False
    partition_cols = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if not collecting:
            collecting = col_name == "# Partition Information"
            continue
        if col_name == "# col_name" or col_name == "# Partition Information":
            # Header rows inside the section carry no column.
            continue
        if not col_name or col_name.startswith("#"):
            break
        partition_cols.append(col_name)
    return partition_cols

def extract_table_details(describe_rows):
    """
    Pull owner, comment and table properties out of the
    "# Detailed Table Information" section of DESCRIBE output.

    ``owner`` and ``comment`` appear only when their rows were seen;
    ``table_properties`` is always present. The section ends at the first
    blank row or ``# ...`` header.
    """
    field_for_label = {"Owner": "owner", "Comment": "comment"}
    info = {}
    props = {}
    inside = False
    for entry in describe_rows:
        label = (entry.get("col_name") or "").strip()
        value = (entry.get("data_type") or "").strip()
        if label == "# Detailed Table Information":
            inside = True
        elif inside:
            if label == "" or label.startswith("#"):
                break
            if label in field_for_label:
                info[field_for_label[label]] = value
            elif label == "Table Properties":
                # "[k1=v1,k2=v2]" -> dict; only the first "=" separates key and value.
                for chunk in value.strip("[]").split(","):
                    key, sep, val = chunk.partition("=")
                    if sep:
                        props[key.strip()] = val.strip()
    info["table_properties"] = props
    return info

def extract_constraints(describe_rows):
    """
    Collect the rows of the "# Constraints" section as ``{"name", "type"}`` dicts.

    Rows with an empty ``data_type`` are dropped; the section ends at the
    first blank row or ``# ...`` header.
    """
    found = []
    active = False
    for entry in describe_rows:
        name = (entry.get("col_name") or "").strip()
        definition = (entry.get("data_type") or "").strip()
        if name == "# Constraints":
            active = True
        elif active:
            if name == "" or name.startswith("#"):
                break
            if definition:
                found.append({"name": name, "type": definition})
    return found

# --- Run each extraction and print results ---
# NOTE(review): `describe_rows` is not assigned anywhere in this cell; it has
# to exist already in the session (a list of DESCRIBE row dicts) or the first
# call below raises NameError.
print("--- COLUMNS ---")
print(extract_columns(describe_rows))

print("\n--- PARTITIONED BY ---")
print(extract_partitioned_by(describe_rows))

print("\n--- TABLE DETAILS ---")
print(extract_table_details(describe_rows))

print("\n--- CONSTRAINTS ---")
print(extract_constraints(describe_rows))

In [0]:
import re
from typing import List, Dict, Any
from pyspark.sql import SparkSession

# --- SYSTEM TABLE SNAPSHOT QUERIES ---
# Per metadata kind:
#   "table"      - information_schema view to query
#   "columns"    - columns placed in the SELECT list
#   "where_keys" - (filter column, index) pairs; the index selects which part
#                  of (catalog, schema, table) the column is compared against.
# The tag views filter on catalog_name/schema_name, the others on
# table_catalog/table_schema.
SNAPSHOT_QUERIES = {
    "table_tags": {
        "table": "system.information_schema.table_tags",
        "columns": ["catalog_name", "schema_name", "table_name", "tag_name", "tag_value"],
        "where_keys": [("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
    },
    "column_tags": {
        "table": "system.information_schema.column_tags",
        "columns": ["catalog_name", "schema_name", "table_name", "column_name", "tag_name", "tag_value"],
        "where_keys": [("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
    },
    "row_filters": {
        "table": "system.information_schema.row_filters",
        "columns": ["table_catalog", "table_schema", "table_name", "filter_name", "target_columns"],
        "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    },
    "constraint_table_usage": {
        "table": "system.information_schema.constraint_table_usage",
        "columns": ["constraint_catalog", "constraint_schema", "constraint_name"],
        "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    },
    "constraint_column_usage": {
        "table": "system.information_schema.constraint_column_usage",
        "columns": ["column_name", "constraint_name"],
        "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    },
}

def parse_fq_table(fq_table: str):
    """
    Split ``catalog.schema.table`` on "." into a ``(catalog, schema, table)`` tuple.

    Raises ``ValueError`` unless the split yields exactly three pieces.
    """
    pieces = fq_table.split(".")
    if len(pieces) == 3:
        catalog, schema, table = pieces
        return catalog, schema, table
    raise ValueError("Expected format: catalog.schema.table")

def build_metadata_sql(kind: str, fq_table: str) -> str:
    """
    Build the single-line SELECT for one metadata kind and one table.

    ``kind`` selects the entry of SNAPSHOT_QUERIES; its ``where_keys`` decide
    which filter column is compared with the catalog, schema and table parts
    of ``fq_table``. The predicates are joined with AND.
    """
    settings = SNAPSHOT_QUERIES[kind]
    name_parts = parse_fq_table(fq_table)

    predicates = []
    for column, position in settings["where_keys"]:
        predicates.append(f"{column} = '{name_parts[position]}'")
    where_sql = " AND ".join(predicates)

    select_list = ", ".join(settings["columns"])
    return f"SELECT {select_list} FROM {settings['table']} WHERE {where_sql}"

def get_metadata_snapshot(spark: SparkSession, fq_table: str) -> Dict[str, List[Dict[str, Any]]]:
    """
    Run every query configured in SNAPSHOT_QUERIES for ``fq_table``.

    Returns a dict keyed by metadata kind. On success the value is the list
    of row dicts; when building or running a query raises, the value is the
    string ``"[ERROR] <message>"`` instead and the remaining kinds still run.
    """
    snapshot = {}
    for kind in SNAPSHOT_QUERIES:
        try:
            query = build_metadata_sql(kind, fq_table)
            collected = spark.sql(query).collect()
            snapshot[kind] = [record.asDict() for record in collected]
        except Exception as e:
            # Best effort: record the failure for this kind and keep going.
            snapshot[kind] = f"[ERROR] {e}"
    return snapshot

# --- DESCRIBE TABLE EXTENDED PARSERS ---
def get_describe_rows(spark: SparkSession, fq_table: str) -> List[Dict[str, Any]]:
    """Run ``DESCRIBE EXTENDED`` on ``fq_table`` and return the collected rows as dicts."""
    described = spark.sql(f"DESCRIBE EXTENDED {fq_table}")
    records = []
    for record in described.collect():
        records.append(record.asDict())
    return records

def extract_columns(describe_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Return the column definitions found at the top of DESCRIBE output.

    Reading stops at the first ``# ...`` section header, so rows from the
    partition or detail sections (e.g. "Catalog", "Owner") are never reported
    as columns, including for tables without a partition section. Blank
    separator rows are skipped.

    Args:
        describe_rows: DESCRIBE rows as dicts with ``col_name``, ``data_type``
            and ``comment`` keys (missing/None values are treated as "").

    Returns:
        One ``{"name", "datatype", "comment"}`` dict per column. A missing
        comment, or the literal text "NULL", becomes "".
    """
    columns = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if col_name.startswith("#"):
            # First section header: the column listing is over.
            break
        if not col_name:
            continue
        data_type = (row.get("data_type") or "").strip()
        comment = (row.get("comment") or "").strip()
        columns.append({
            "name": col_name,
            "datatype": data_type,
            "comment": comment if comment and comment.upper() != "NULL" else "",
        })
    return columns

def extract_partitioned_by(describe_rows: List[Dict[str, Any]]) -> List[str]:
    """
    Return the partition column names listed in DESCRIBE output.

    Reads the rows after ``# Partition Information`` until a blank row or the
    next ``# ...`` section header. The ``# col_name`` sub-header that opens
    the section is skipped instead of being mistaken for the end of it.

    Args:
        describe_rows: DESCRIBE rows as dicts with a ``col_name`` key.

    Returns:
        Partition column names in DESCRIBE order; empty when the output has
        no partition section.
    """
    collecting = False
    partition_cols = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if not collecting:
            collecting = col_name == "# Partition Information"
            continue
        if col_name == "# col_name" or col_name == "# Partition Information":
            # Header rows inside the section carry no column.
            continue
        if not col_name or col_name.startswith("#"):
            break
        partition_cols.append(col_name)
    return partition_cols

def extract_table_details(describe_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Read owner, comment and table properties from "# Detailed Table Information".

    Args:
        describe_rows: DESCRIBE EXTENDED rows as dicts with "col_name" and
            "data_type" keys.

    Returns:
        A dict that always contains "table_properties" (possibly empty) and
        contains "owner" / "comment" only when those rows are present.
    """
    details = {}
    table_properties = {}
    in_details = False
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        data_type = (row.get("data_type") or "").strip()
        if col_name == "# Detailed Table Information":
            in_details = True
            continue
        if not in_details:
            continue
        if not col_name or col_name.startswith("#"):
            break
        if col_name == "Owner":
            details["owner"] = data_type
        elif col_name == "Comment":
            details["comment"] = data_type
        elif col_name == "Table Properties":
            # The value looks like "[k1=v1,k2=v2]". A property value may itself
            # contain commas (e.g. a CHECK expression "x IN ('a', 'b')"), so a
            # fragment without "=" is glued back onto the previous value
            # instead of being dropped.
            last_key = None
            for fragment in data_type.strip("[]").split(","):
                if "=" in fragment:
                    k, v = fragment.split("=", 1)
                    last_key = k.strip()
                    table_properties[last_key] = v.strip()
                elif last_key is not None and fragment.strip():
                    joined = f"{table_properties[last_key]},{fragment}"
                    table_properties[last_key] = joined.rstrip()
    details["table_properties"] = table_properties
    return details

def extract_constraints(describe_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Collect the rows listed under the "# Constraints" header.

    Each row with both a name ("col_name") and a definition ("data_type")
    becomes {"name": ..., "type": ...}. Collection stops at the first blank
    row or the next "#" header.
    """
    found = []
    section_open = False
    for entry in describe_rows:
        label = (entry.get("col_name") or "").strip()
        definition = (entry.get("data_type") or "").strip()
        if label == "# Constraints":
            section_open = True
            continue
        if not section_open:
            continue
        if label == "" or label.startswith("#"):
            break
        if definition:
            found.append({"name": label, "type": definition})
    return found

# ------------------------------------------------------------------

# --- Example usage, all prints at the bottom: ---
table_name = "dq_dev.lmg_sandbox.config_driven_table_example"
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Gather both raw inputs first: the Unity Catalog system-table rows and the
# DESCRIBE EXTENDED rows.
uc_metadata = get_metadata_snapshot(spark, table_name)
describe_rows = get_describe_rows(spark, table_name)

# --- Now print results ---
for kind, rows in uc_metadata.items():
    print(f"\n--- {kind.upper()} ---")
    if isinstance(rows, str) and rows.startswith("[ERROR]"):
        # A failed query is stored as an "[ERROR] ..." string; show it as-is.
        print(rows)
        continue
    if not rows:
        print("No rows found.")
        continue
    for row in rows:
        print(row)

# Each parsed section prints its value, or a placeholder when it is empty.
print("\n--- COLUMNS ---")
columns = extract_columns(describe_rows)
print(columns or "No columns found.")

print("\n--- PARTITIONED BY ---")
partitioned_by = extract_partitioned_by(describe_rows)
print(partitioned_by or "No partitions found.")

print("\n--- TABLE DETAILS ---")
table_details = extract_table_details(describe_rows)
print(table_details or "No table details found.")

print("\n--- CONSTRAINTS ---")
constraints = extract_constraints(describe_rows)
print(constraints or "No constraints found.")

In [0]:
def build_table_metadata_snapshot(
    fq_table: str,
    uc_metadata: Dict[str, List[Dict[str, Any]]],
    describe_rows: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Assemble one nested snapshot dict for a table from its raw metadata.

    Args:
        fq_table: Table name in catalog.schema.table form.
        uc_metadata: Output of get_metadata_snapshot, keyed by query kind.
            That function stores an "[ERROR] ..." string instead of a row
            list when a query fails; such entries are treated as empty here.
        describe_rows: DESCRIBE EXTENDED rows as dicts.

    Returns:
        {"table": {...}} holding the name parts, owner, comment, properties,
        tags, check constraints, row filters, partition columns, primary key
        and a "columns" dict keyed by 1-based position.

    Raises:
        ValueError: From parse_fq_table when fq_table does not have three parts.
    """
    def _rows(kind: str) -> List[Dict[str, Any]]:
        # Iterating an error string would yield characters and then crash on
        # row["..."], so anything that is not a list counts as "no rows".
        value = uc_metadata.get(kind, [])
        return value if isinstance(value, list) else []

    catalog, schema, table = parse_fq_table(fq_table)

    # Table tags
    table_tags = {row["tag_name"]: row["tag_value"] for row in _rows("table_tags")}

    # Table properties, owner, comment
    details = extract_table_details(describe_rows)
    table_properties = details.get("table_properties", {})

    # Table check constraints: properties whose key starts with "delta.constraints"
    table_check_constraints = {
        k: {"name": k, "expression": v}
        for k, v in table_properties.items()
        if k.startswith("delta.constraints")
    }

    # Row filters
    row_filters = [
        {"filter_name": row["filter_name"], "target_columns": row["target_columns"]}
        for row in _rows("row_filters")
    ]

    # Partition columns and constraints from DESCRIBE EXTENDED
    partitioned_by = extract_partitioned_by(describe_rows)
    constraints = extract_constraints(describe_rows)

    # Primary key: column list inside the parentheses of a PRIMARY KEY
    # constraint. If several match, the last one wins.
    pk = []
    for c in constraints:
        if "PRIMARY KEY" in c["type"]:
            m = re.search(r"\((.*?)\)", c["type"])
            if m:
                pk = [col.strip().replace("`", "") for col in m.group(1).split(",")]

    columns_raw = extract_columns(describe_rows)

    # Column tags, grouped by column name
    col_tag_lookup = {}
    for row in _rows("column_tags"):
        col_tag_lookup.setdefault(row["column_name"], {})[row["tag_name"]] = row["tag_value"]

    # Constraint names per column (from constraint_column_usage); the
    # constraint expression is not available from that table.
    col_constraint_lookup = {}
    for row in _rows("constraint_column_usage"):
        cons = row["constraint_name"]
        col_constraint_lookup.setdefault(row["column_name"], {})[cons] = {"name": cons}

    # Columns keyed by 1-based position
    columns = {}
    for idx, col in enumerate(columns_raw, start=1):
        colname = col["name"]
        columns[idx] = {
            "column_name": colname,
            "datatype": col["datatype"],
            "comment": col["comment"],
            "nullable": None,  # Could be extracted if needed
            "masking_rule": None,  # Could be extracted if needed
            "column_tags": col_tag_lookup.get(colname, {}),
            "column_check_constraints": col_constraint_lookup.get(colname, {}),
        }

    return {
        "table": {
            "fully_qualified_name": fq_table,
            "catalog": catalog,
            "schema": schema,
            "table": table,
            "owner": details.get("owner", ""),
            "comment": details.get("comment", ""),
            "table_properties": table_properties,
            "table_tags": table_tags,
            "table_check_constraints": table_check_constraints,
            "row_filters": row_filters,
            "partitioned_by": partitioned_by,
            "primary_key": pk,
            "columns": columns,
        }
    }

# --- Usage Example ---
table_name = "dq_dev.lmg_sandbox.config_driven_table_example"
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Fetch the two raw inputs, then merge them into one snapshot dict.
uc_metadata = get_metadata_snapshot(spark, table_name)
describe_rows = get_describe_rows(spark, table_name)
snapshot = build_table_metadata_snapshot(table_name, uc_metadata, describe_rows)

import pprint
pprint.pprint(snapshot, width=120)

### Snapshot

#### Table

In [0]:
import re
from typing import List, Dict, Any
from pyspark.sql import SparkSession


############################################################
#                   Start of First Section
############################################################

# --- SYSTEM TABLE SNAPSHOT QUERIES ---
# One entry per information_schema query: the system table to read, the
# columns to select, and (filter column, index) pairs where the index selects
# catalog (0), schema (1) or table (2) from the parsed table name.
SNAPSHOT_QUERIES = {
    "table_tags": dict(
        table="system.information_schema.table_tags",
        columns=["catalog_name", "schema_name", "table_name", "tag_name", "tag_value"],
        where_keys=[("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
    ),
    "column_tags": dict(
        table="system.information_schema.column_tags",
        columns=["catalog_name", "schema_name", "table_name", "column_name", "tag_name", "tag_value"],
        where_keys=[("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
    ),
    "row_filters": dict(
        table="system.information_schema.row_filters",
        columns=["table_catalog", "table_schema", "table_name", "filter_name", "target_columns"],
        where_keys=[("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    ),
    "constraint_table_usage": dict(
        table="system.information_schema.constraint_table_usage",
        columns=["constraint_catalog", "constraint_schema", "constraint_name"],
        where_keys=[("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    ),
    "constraint_column_usage": dict(
        table="system.information_schema.constraint_column_usage",
        columns=["column_name", "constraint_name"],
        where_keys=[("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
    ),
}

def parse_fq_table(fq_table: str):
    """
    Split a fully qualified table name into (catalog, schema, table).

    Whitespace around each part is removed.

    Raises:
        ValueError: If the name does not consist of exactly three non-empty,
            dot-separated parts (for example "a.b" or "a..c").
    """
    parts = [part.strip() for part in fq_table.split(".")]
    # An empty part would end up as an empty identifier in the generated SQL,
    # so reject it here together with a wrong part count.
    if len(parts) != 3 or not all(parts):
        raise ValueError(f"Expected format: catalog.schema.table (got {fq_table!r})")
    return parts[0], parts[1], parts[2]

def build_metadata_sql(kind: str, fq_table: str) -> str:
    """
    Build the SELECT statement for one SNAPSHOT_QUERIES entry.

    The WHERE clause compares each configured filter column with the matching
    part (catalog, schema or table) of `fq_table`, joined with AND.
    """
    spec = SNAPSHOT_QUERIES[kind]
    name_parts = list(parse_fq_table(fq_table))
    predicates = " AND ".join(
        f"{column} = '{name_parts[position]}'"
        for column, position in spec["where_keys"]
    )
    select_list = ", ".join(spec["columns"])
    return f"SELECT {select_list} FROM {spec['table']} WHERE {predicates}"

def get_metadata_snapshot(spark: SparkSession, fq_table: str) -> Dict[str, List[Dict[str, Any]]]:
    """
    Run every query configured in SNAPSHOT_QUERIES for one table.

    Returns a dict keyed by query kind. Each value is the list of result rows
    converted to dicts; if building or running a query raises, the value is
    the string "[ERROR] <exception text>" instead.
    """
    def _fetch(kind: str):
        # Failures are captured per kind so one bad query does not stop the rest.
        try:
            statement = build_metadata_sql(kind, fq_table)
            return [record.asDict() for record in spark.sql(statement).collect()]
        except Exception as exc:
            return f"[ERROR] {exc}"

    return {kind: _fetch(kind) for kind in SNAPSHOT_QUERIES}

# --- DESCRIBE TABLE EXTENDED PARSERS ---
def get_describe_rows(spark: SparkSession, fq_table: str) -> List[Dict[str, Any]]:
    """Run DESCRIBE EXTENDED on `fq_table` and return each result row as a dict."""
    described = spark.sql(f"DESCRIBE EXTENDED {fq_table}")
    return [record.asDict() for record in described.collect()]

def extract_columns(describe_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Return the column definitions found at the top of DESCRIBE EXTENDED rows.

    Args:
        describe_rows: Rows as dicts with "col_name", "data_type" and
            "comment" keys (missing or None values are treated as empty).

    Returns:
        One {"name", "datatype", "comment"} dict per column, in row order.
        A missing comment, or the literal text "NULL", becomes "".
    """
    columns = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if not col_name:
            continue
        if col_name.startswith("#"):
            # The first "#" header after the column list opens another section
            # (partitions, detailed table information, constraints, ...).
            # Stopping here keeps rows such as "Owner" or "Table Properties"
            # from being reported as columns of unpartitioned tables.
            if columns:
                break
            # A header seen before any column is only skipped.
            continue
        data_type = (row.get("data_type") or "").strip()
        comment = (row.get("comment") or "").strip()
        columns.append({
            "name": col_name,
            "datatype": data_type,
            "comment": "" if comment.upper() == "NULL" else comment,
        })
    return columns

def extract_partitioned_by(describe_rows: List[Dict[str, Any]]) -> List[str]:
    """
    Return the partition column names listed under "# Partition Information".

    Args:
        describe_rows: DESCRIBE EXTENDED rows as dicts keyed by "col_name".

    Returns:
        Partition column names in row order; [] when the section is absent.
    """
    collecting = False
    partition_cols = []
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        if col_name == "# Partition Information":
            collecting = True
            continue
        if not collecting:
            continue
        # The section starts with a "# col_name" header row. It must be skipped
        # before the generic "#" check below, otherwise the loop ends before
        # any partition column is read.
        if col_name == "# col_name":
            continue
        if not col_name or col_name.startswith("#"):
            break
        partition_cols.append(col_name)
    return partition_cols

def extract_table_details(describe_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Read owner, comment and table properties from "# Detailed Table Information".

    Args:
        describe_rows: DESCRIBE EXTENDED rows as dicts with "col_name" and
            "data_type" keys.

    Returns:
        A dict that always contains "table_properties" (possibly empty) and
        contains "owner" / "comment" only when those rows are present.
    """
    details = {}
    table_properties = {}
    in_details = False
    for row in describe_rows:
        col_name = (row.get("col_name") or "").strip()
        data_type = (row.get("data_type") or "").strip()
        if col_name == "# Detailed Table Information":
            in_details = True
            continue
        if not in_details:
            continue
        if not col_name or col_name.startswith("#"):
            break
        if col_name == "Owner":
            details["owner"] = data_type
        elif col_name == "Comment":
            details["comment"] = data_type
        elif col_name == "Table Properties":
            # The value looks like "[k1=v1,k2=v2]". A property value may itself
            # contain commas (e.g. a CHECK expression "x IN ('a', 'b')"), so a
            # fragment without "=" is glued back onto the previous value
            # instead of being dropped.
            last_key = None
            for fragment in data_type.strip("[]").split(","):
                if "=" in fragment:
                    k, v = fragment.split("=", 1)
                    last_key = k.strip()
                    table_properties[last_key] = v.strip()
                elif last_key is not None and fragment.strip():
                    joined = f"{table_properties[last_key]},{fragment}"
                    table_properties[last_key] = joined.rstrip()
    details["table_properties"] = table_properties
    return details

def extract_constraints(describe_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Collect the rows listed under the "# Constraints" header.

    Each row with both a name ("col_name") and a definition ("data_type")
    becomes {"name": ..., "type": ...}. Collection stops at the first blank
    row or the next "#" header.
    """
    found = []
    section_open = False
    for entry in describe_rows:
        label = (entry.get("col_name") or "").strip()
        definition = (entry.get("data_type") or "").strip()
        if label == "# Constraints":
            section_open = True
            continue
        if not section_open:
            continue
        if label == "" or label.startswith("#"):
            break
        if definition:
            found.append({"name": label, "type": definition})
    return found

# ------------------------------------------------------------------
# NOTE: the example below is wrapped in a bare triple-quoted string, so it is
# evaluated as a string expression and none of its statements run.
"""
# --- Example usage, all prints at the bottom: ---
table_name = "dq_dev.lmg_sandbox.config_driven_table_example"
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Fetch Unity Catalog system metadata (raw output)
uc_metadata = get_metadata_snapshot(spark, table_name)

# Fetch DESCRIBE EXTENDED rows (raw output)
describe_rows = get_describe_rows(spark, table_name)

# --- Now print results ---
for kind, rows in uc_metadata.items():
    print(f"\n--- {kind.upper()} ---")
    if isinstance(rows, str) and rows.startswith("[ERROR]"):
        print(rows)
    elif not rows:
        print("No rows found.")
    else:
        for row in rows:
            print(row)

print("\n--- COLUMNS ---")
columns = extract_columns(describe_rows)
print(columns if columns else "No columns found.")

print("\n--- PARTITIONED BY ---")
partitioned_by = extract_partitioned_by(describe_rows)
print(partitioned_by if partitioned_by else "No partitions found.")

print("\n--- TABLE DETAILS ---")
table_details = extract_table_details(describe_rows)
print(table_details if table_details else "No table details found.")

print("\n--- CONSTRAINTS ---")
constraints = extract_constraints(describe_rows)
print(constraints if constraints else "No constraints found.")
"""

############################################################
#                   End of First Section
############################################################                


# -------------------------------------------------------- #


############################################################
#                   Start of Second Section
############################################################
def build_table_metadata_snapshot(
    fq_table: str,
    uc_metadata: Dict[str, List[Dict[str, Any]]],
    describe_rows: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """
    Assemble one nested snapshot dict for a table from its raw metadata.

    Args:
        fq_table: Table name in catalog.schema.table form.
        uc_metadata: Output of get_metadata_snapshot, keyed by query kind.
            That function stores an "[ERROR] ..." string instead of a row
            list when a query fails; such entries are treated as empty here.
        describe_rows: DESCRIBE EXTENDED rows as dicts.

    Returns:
        {"table": {...}} holding the name parts, owner, comment, properties,
        tags, check constraints, row filters, partition columns, primary key
        and a "columns" dict keyed by 1-based position.

    Raises:
        ValueError: From parse_fq_table when fq_table does not have three parts.
    """
    def _rows(kind: str) -> List[Dict[str, Any]]:
        # Iterating an error string would yield characters and then crash on
        # row["..."], so anything that is not a list counts as "no rows".
        value = uc_metadata.get(kind, [])
        return value if isinstance(value, list) else []

    catalog, schema, table = parse_fq_table(fq_table)

    # Table tags
    table_tags = {row["tag_name"]: row["tag_value"] for row in _rows("table_tags")}

    # Table properties, owner, comment
    details = extract_table_details(describe_rows)
    table_properties = details.get("table_properties", {})

    # Table check constraints: properties whose key starts with "delta.constraints"
    table_check_constraints = {
        k: {"name": k, "expression": v}
        for k, v in table_properties.items()
        if k.startswith("delta.constraints")
    }

    # Row filters
    row_filters = [
        {"filter_name": row["filter_name"], "target_columns": row["target_columns"]}
        for row in _rows("row_filters")
    ]

    # Partition columns and constraints from DESCRIBE EXTENDED
    partitioned_by = extract_partitioned_by(describe_rows)
    constraints = extract_constraints(describe_rows)

    # Primary key: column list inside the parentheses of a PRIMARY KEY
    # constraint. If several match, the last one wins.
    pk = []
    for c in constraints:
        if "PRIMARY KEY" in c["type"]:
            m = re.search(r"\((.*?)\)", c["type"])
            if m:
                pk = [col.strip().replace("`", "") for col in m.group(1).split(",")]

    columns_raw = extract_columns(describe_rows)

    # Column tags, grouped by column name
    col_tag_lookup = {}
    for row in _rows("column_tags"):
        col_tag_lookup.setdefault(row["column_name"], {})[row["tag_name"]] = row["tag_value"]

    # Constraint names per column (from constraint_column_usage); the
    # constraint expression is not available from that table.
    col_constraint_lookup = {}
    for row in _rows("constraint_column_usage"):
        cons = row["constraint_name"]
        col_constraint_lookup.setdefault(row["column_name"], {})[cons] = {"name": cons}

    # Columns keyed by 1-based position
    columns = {}
    for idx, col in enumerate(columns_raw, start=1):
        colname = col["name"]
        columns[idx] = {
            "column_name": colname,
            "datatype": col["datatype"],
            "comment": col["comment"],
            "nullable": None,  # Could be extracted if needed
            "masking_rule": None,  # Could be extracted if needed
            "column_tags": col_tag_lookup.get(colname, {}),
            "column_check_constraints": col_constraint_lookup.get(colname, {}),
        }

    return {
        "table": {
            "fully_qualified_name": fq_table,
            "catalog": catalog,
            "schema": schema,
            "table": table,
            "owner": details.get("owner", ""),
            "comment": details.get("comment", ""),
            "table_properties": table_properties,
            "table_tags": table_tags,
            "table_check_constraints": table_check_constraints,
            "row_filters": row_filters,
            "partitioned_by": partitioned_by,
            "primary_key": pk,
            "columns": columns,
        }
    }

# --- Usage Example ---
table_name = "dq_dev.lmg_sandbox.config_driven_table_example"
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Fetch the two raw inputs, then merge them into one snapshot dict.
uc_metadata = get_metadata_snapshot(spark, table_name)
describe_rows = get_describe_rows(spark, table_name)
snapshot = build_table_metadata_snapshot(table_name, uc_metadata, describe_rows)

import pprint
pprint.pprint(snapshot, width=120)

############################################################
#                   End of Second Section
############################################################

In [0]:
import re
from typing import Any, Dict, List, Optional
from pyspark.sql import SparkSession


class TableSnapshot:
    """
    Collect live metadata for one table into a plain dictionary.

    Combines rows returned by the information_schema queries configured in
    SNAPSHOT_QUERIES with the parsed output of DESCRIBE EXTENDED.
    """

    # One entry per information_schema query: the system table to read, the
    # columns to select, and (filter column, index) pairs where the index
    # selects catalog (0), schema (1) or table (2).
    SNAPSHOT_QUERIES = {
        "table_tags": {
            "table": "system.information_schema.table_tags",
            "columns": ["catalog_name", "schema_name", "table_name", "tag_name", "tag_value"],
            "where_keys": [("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
        },
        "column_tags": {
            "table": "system.information_schema.column_tags",
            "columns": ["catalog_name", "schema_name", "table_name", "column_name", "tag_name", "tag_value"],
            "where_keys": [("catalog_name", 0), ("schema_name", 1), ("table_name", 2)],
        },
        "row_filters": {
            "table": "system.information_schema.row_filters",
            "columns": ["table_catalog", "table_schema", "table_name", "filter_name", "target_columns"],
            "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
        },
        "constraint_table_usage": {
            "table": "system.information_schema.constraint_table_usage",
            "columns": ["constraint_catalog", "constraint_schema", "constraint_name"],
            "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
        },
        "constraint_column_usage": {
            "table": "system.information_schema.constraint_column_usage",
            "columns": ["column_name", "constraint_name"],
            "where_keys": [("table_catalog", 0), ("table_schema", 1), ("table_name", 2)],
        },
    }

    # The annotation is quoted so the class can be defined without resolving
    # SparkSession at definition time.
    def __init__(self, spark: "SparkSession", fq_table: str):
        """
        Args:
            spark: Session used to run the metadata queries.
            fq_table: Table name in catalog.schema.table form.

        Raises:
            ValueError: If fq_table does not have three non-empty parts.
        """
        self.spark = spark
        self.fq_table = fq_table
        self.catalog, self.schema, self.table = self._parse_fq_table(fq_table)

    def _parse_fq_table(self, fq_table: str):
        """Split "catalog.schema.table" into its three whitespace-trimmed parts."""
        parts = [part.strip() for part in fq_table.split(".")]
        # An empty part would end up as an empty identifier in the generated
        # SQL, so reject it together with a wrong part count.
        if len(parts) != 3 or not all(parts):
            raise ValueError(f"Expected format: catalog.schema.table (got {fq_table!r})")
        return parts[0], parts[1], parts[2]

    def _build_metadata_sql(self, kind: str) -> str:
        """Build the SELECT statement for one SNAPSHOT_QUERIES entry."""
        config = self.SNAPSHOT_QUERIES[kind]
        table_vars = [self.catalog, self.schema, self.table]
        where_clauses = [
            f"{col_name} = '{table_vars[idx]}'"
            for col_name, idx in config["where_keys"]
        ]
        columns = ", ".join(config["columns"])
        return f"SELECT {columns} FROM {config['table']} WHERE {' AND '.join(where_clauses)}"

    def _get_metadata_snapshot(self) -> Dict[str, List[Dict[str, Any]]]:
        """
        Run every configured query and return its rows as dicts, keyed by kind.

        A query that fails yields an empty list for its kind; the failure is
        logged as a warning instead of being dropped silently.
        """
        import logging

        logger = logging.getLogger(__name__)
        results = {}
        for kind in self.SNAPSHOT_QUERIES:
            try:
                sql = self._build_metadata_sql(kind)
                df = self.spark.sql(sql)
                results[kind] = [row.asDict() for row in df.collect()]
            except Exception as exc:
                logger.warning(
                    "Metadata query %r failed for %s: %s", kind, self.fq_table, exc
                )
                results[kind] = []
        return results

    def _get_describe_rows(self) -> List[Dict[str, Any]]:
        """Run DESCRIBE EXTENDED on the table and return each row as a dict."""
        sql = f"DESCRIBE EXTENDED {self.fq_table}"
        df = self.spark.sql(sql)
        return [row.asDict() for row in df.collect()]

    def _extract_columns(self, describe_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Return {"name", "datatype", "comment"} dicts for the leading column rows.

        A missing comment, or the literal text "NULL", becomes "".
        """
        columns = []
        for row in describe_rows:
            col_name = (row.get("col_name") or "").strip()
            if not col_name:
                continue
            if col_name.startswith("#"):
                # The first "#" header after the column list opens another
                # section; stopping here keeps rows such as "Owner" from being
                # reported as columns of unpartitioned tables.
                if columns:
                    break
                continue
            data_type = (row.get("data_type") or "").strip()
            comment = (row.get("comment") or "").strip()
            columns.append({
                "name": col_name,
                "datatype": data_type,
                "comment": "" if comment.upper() == "NULL" else comment,
            })
        return columns

    def _extract_partitioned_by(self, describe_rows: List[Dict[str, Any]]) -> List[str]:
        """Return the column names listed under "# Partition Information"."""
        collecting = False
        partition_cols = []
        for row in describe_rows:
            col_name = (row.get("col_name") or "").strip()
            if col_name == "# Partition Information":
                collecting = True
                continue
            if not collecting:
                continue
            # Skip the "# col_name" header before the generic "#" check,
            # otherwise the loop ends before any partition column is read.
            if col_name == "# col_name":
                continue
            if not col_name or col_name.startswith("#"):
                break
            partition_cols.append(col_name)
        return partition_cols

    def _extract_table_details(self, describe_rows: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Read owner, comment and table properties from the detailed-info section.

        The result always contains "table_properties"; "owner" and "comment"
        are present only when their rows exist.
        """
        details = {}
        table_properties = {}
        in_details = False
        for row in describe_rows:
            col_name = (row.get("col_name") or "").strip()
            data_type = (row.get("data_type") or "").strip()
            if col_name == "# Detailed Table Information":
                in_details = True
                continue
            if not in_details:
                continue
            if not col_name or col_name.startswith("#"):
                break
            if col_name == "Owner":
                details["owner"] = data_type
            elif col_name == "Comment":
                details["comment"] = data_type
            elif col_name == "Table Properties":
                # The value looks like "[k1=v1,k2=v2]". A value may itself
                # contain commas, so a fragment without "=" is glued back onto
                # the previous value instead of being dropped.
                last_key = None
                for fragment in data_type.strip("[]").split(","):
                    if "=" in fragment:
                        k, v = fragment.split("=", 1)
                        last_key = k.strip()
                        table_properties[last_key] = v.strip()
                    elif last_key is not None and fragment.strip():
                        joined = f"{table_properties[last_key]},{fragment}"
                        table_properties[last_key] = joined.rstrip()
        details["table_properties"] = table_properties
        return details

    def _extract_constraints(self, describe_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]:
        """Return {"name", "type"} dicts for the rows under "# Constraints"."""
        constraints = []
        in_constraints = False
        for row in describe_rows:
            col_name = (row.get("col_name") or "").strip()
            data_type = (row.get("data_type") or "").strip()
            if col_name == "# Constraints":
                in_constraints = True
                continue
            if in_constraints:
                if not col_name or col_name.startswith("#"):
                    break
                if data_type:
                    constraints.append({"name": col_name, "type": data_type})
        return constraints

    def _build_columns(self, columns_raw: List[Dict[str, Any]], col_tags: Dict[str, Dict[str, Any]], col_checks: Dict[str, Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
        """Build the per-column dicts, keyed by 1-based position."""
        columns = {}
        for idx, col in enumerate(columns_raw, start=1):
            name = col["name"]
            columns[idx] = {
                "name": name,
                "datatype": col["datatype"],
                "nullable": None,  # Could be filled with extended logic
                "active": True,
                "comment": col.get("comment", ""),
                "tags": col_tags.get(name, {}),
                "column_masking_rule": "",  # No masking in snapshot, set empty
                "column_check_constraints": col_checks.get(name, {}),
            }
        return columns

    def build_table_metadata_dict(self) -> Dict[str, Any]:
        """
        Query the table and return its metadata as one flat dictionary.

        Returns:
            A dict with the name parts, keys, partition columns, tags, row
            filters, check constraints, properties, comment, owner and a
            "columns" dict keyed by 1-based position.
        """
        # Pull data
        uc_metadata = self._get_metadata_snapshot()
        describe_rows = self._get_describe_rows()

        # Parse table tags
        table_tags = {row["tag_name"]: row["tag_value"] for row in uc_metadata.get("table_tags", [])}

        # Extract table details
        details = self._extract_table_details(describe_rows)
        table_properties = details.get("table_properties", {})

        # Table check constraints: properties whose key starts with "delta.constraints"
        table_check_constraints = {
            k: {"name": k, "expression": v}
            for k, v in table_properties.items()
            if k.startswith("delta.constraints")
        }

        # Row filters, keyed by filter name; "expression" is filled from the
        # target_columns value of the row_filters query.
        row_filters = {}
        for row in uc_metadata.get("row_filters", []):
            fname = row.get("filter_name")
            if fname:
                row_filters[fname] = {
                    "name": fname,
                    "expression": row.get("target_columns", "")
                }

        # Partition columns and constraints
        partitioned_by = self._extract_partitioned_by(describe_rows)
        constraints = self._extract_constraints(describe_rows)

        # Primary key: column list inside the parentheses of a PRIMARY KEY
        # constraint. If several match, the last one wins.
        pk = []
        for c in constraints:
            if "PRIMARY KEY" in c["type"]:
                m = re.search(r"\((.*?)\)", c["type"])
                if m:
                    pk = [col.strip().replace("`", "") for col in m.group(1).split(",")]

        # Raw columns
        columns_raw = self._extract_columns(describe_rows)

        # Column tags, grouped by column name
        col_tag_lookup = {}
        for row in uc_metadata.get("column_tags", []):
            col_tag_lookup.setdefault(row["column_name"], {})[row["tag_name"]] = row["tag_value"]

        # Constraint names per column; no expression parsing here
        col_constraint_lookup = {}
        for row in uc_metadata.get("constraint_column_usage", []):
            cons = row["constraint_name"]
            col_constraint_lookup.setdefault(row["column_name"], {})[cons] = {"name": cons}

        # Compose columns dict keyed by 1-based index
        columns = self._build_columns(columns_raw, col_tag_lookup, col_constraint_lookup)

        return {
            "full_table_name": self.fq_table,
            "catalog": self.catalog,
            "schema": self.schema,
            "table": self.table,
            "primary_key": pk,
            "foreign_keys": self._get_foreign_keys(uc_metadata),
            "unique_keys": self._get_unique_keys(uc_metadata),
            "partitioned_by": partitioned_by,
            "tags": table_tags,
            "row_filters": row_filters,
            "table_check_constraints": table_check_constraints,
            "table_properties": table_properties,
            "comment": details.get("comment", ""),
            "owner": details.get("owner", ""),
            "columns": columns,
        }

    def _get_foreign_keys(self, uc_metadata: Dict[str, List[Dict[str, Any]]]) -> Dict[str, Any]:
        """Placeholder: foreign keys are not parsed yet, so this returns {}."""
        # Could be enhanced to parse foreign keys if available from metadata
        return {}

    def _get_unique_keys(self, uc_metadata: Dict[str, List[Dict[str, Any]]]) -> List[List[str]]:
        """Placeholder: unique keys are not parsed yet, so this returns []."""
        # Could be enhanced to parse unique keys if available from metadata
        return []


# Usage example:
# spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()
# table_snapshot = TableSnapshot(spark, "dq_dev.lmg_sandbox.config_driven_table_example")
# metadata_dict = table_snapshot.build_table_metadata_dict()
# pprint.pprint(metadata_dict, width=120)

In [0]:
# Usage example:
# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()
table_snapshot = TableSnapshot(spark, "dq_dev.lmg_sandbox.config_driven_table_example")
# Runs the metadata queries and DESCRIBE EXTENDED, then merges the results.
metadata_dict = table_snapshot.build_table_metadata_dict()
print(metadata_dict)
#pprint.pprint(metadata_dict, width=120)

#### Yaml

In [0]:
import yaml
from typing import Any, Dict, List, Optional

class TableSchemaConfig:
    """
    Loader for YAML DDL config files. Exposes all config blocks with a clean API.
    Handles dynamic env, catalog suffixes, and nested constraints/keys.

    A key that is missing from the YAML, or present with a null value, yields
    the property's empty default ("" / [] / {}).
    """

    def __init__(self, config_path: str, env: Optional[str] = None):
        """
        Args:
            config_path: Path of the YAML file to load.
            env: Optional environment name; appended to the catalog when the
                catalog ends with "_".

        Raises:
            ValueError: If the file cannot be found or parsed, or if its
                top-level node is not a mapping.
        """
        self.config_path = config_path
        self._env = env
        self._config: Dict[str, Any] = {}
        self.load_config()

    def load_config(self) -> None:
        """Read and parse the YAML file into the internal config dict."""
        try:
            with open(self.config_path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
        except (FileNotFoundError, yaml.YAMLError) as e:
            raise ValueError(f"Error loading YAML configuration from {self.config_path}: {e}") from e
        # An empty document parses to None; treat it as an empty config so the
        # properties below keep working.
        if loaded is None:
            loaded = {}
        if not isinstance(loaded, dict):
            raise ValueError(
                f"Error loading YAML configuration from {self.config_path}: "
                "top-level YAML node must be a mapping"
            )
        self._config = loaded

    def _get(self, key: str, default: Any) -> Any:
        """Return the config value for `key`, or `default` if missing or null."""
        value = self._config.get(key)
        return default if value is None else value

    @property
    def catalog(self) -> str:
        return self._get("catalog", "")

    @property
    def schema(self) -> str:
        return self._get("schema", "")

    @property
    def table(self) -> str:
        return self._get("table", "")

    @property
    def env(self) -> Optional[str]:
        return self._env

    @property
    def full_table_name(self) -> str:
        """catalog.schema.table, with env appended to a catalog ending in "_"."""
        cat = self.catalog.strip()
        sch = self.schema.strip()
        tbl = self.table.strip()
        env = self.env
        cat_full = f"{cat}{env}" if cat.endswith("_") and env else cat
        return f"{cat_full}.{sch}.{tbl}"

    @property
    def owner(self) -> str:
        return self._get("owner", "")

    @property
    def tags(self) -> Dict[str, Any]:
        return self._get("tags", {})

    @property
    def properties(self) -> Dict[str, Any]:
        return self._get("properties", {})

    @property
    def table_comment(self) -> str:
        comment = self.properties.get("comment")
        return "" if comment is None else comment

    @property
    def table_properties(self) -> Dict[str, Any]:
        props = self.properties.get("table_properties")
        return {} if props is None else props

    @property
    def primary_key(self) -> List[str]:
        """Primary key columns; a scalar value is wrapped in a list."""
        pk = self._get("primary_key", [])
        return pk if isinstance(pk, list) else [pk]

    @property
    def partitioned_by(self) -> List[str]:
        """Partition columns; a scalar value is wrapped in a list."""
        pb = self._get("partitioned_by", [])
        return pb if isinstance(pb, list) else [pb]

    @property
    def unique_keys(self) -> List[List[str]]:
        return self._get("unique_keys", [])

    @property
    def foreign_keys(self) -> Dict[str, Any]:
        return self._get("foreign_keys", {})

    @property
    def table_check_constraints(self) -> Dict[str, Any]:
        return self._get("table_check_constraints", {})

    @property
    def row_filters(self) -> Dict[str, Any]:
        return self._get("row_filters", {})

    @property
    def columns(self) -> List[Dict[str, Any]]:
        """Column definitions ordered by the integer value of their keys."""
        cols_dict = self._get("columns", {})
        # Sort the items directly so keys such as "010" or 10 are ordered
        # numerically without being looked up again under a re-formatted key.
        ordered = sorted(cols_dict.items(), key=lambda item: int(item[0]))
        return [spec for _, spec in ordered]

    def build_table_metadata_dict(self) -> Dict[str, Any]:
        """Return the config as a flat dict with "columns" keyed by 1-based position."""
        # Return dict with keys in the exact order you want — relies on Python 3.7+ insertion order preservation
        result = {
            "full_table_name": self.full_table_name,
            "catalog": self.catalog,
            "schema": self.schema,
            "table": self.table,
            "primary_key": self.primary_key or [],
            "foreign_keys": self.foreign_keys or {},
            "unique_keys": self.unique_keys or [],
            "partitioned_by": self.partitioned_by or [],
            "tags": self.tags or {},
            "row_filters": self.row_filters or {},
            "table_check_constraints": self.table_check_constraints or {},
            "table_properties": self.table_properties or {},
            "comment": self.table_comment,
            "owner": self.owner,
            "columns": {},
        }

        # Numbered columns with requested keys & order
        for idx, col in enumerate(self.columns, 1):
            result["columns"][idx] = {
                "name": col.get("name", ""),
                "datatype": col.get("datatype", ""),
                "nullable": col.get("nullable", True),
                "active": col.get("active", True),
                "comment": col.get("comment", ""),
                "tags": col.get("tags", {}),
                "column_masking_rule": col.get("column_masking_rule", ""),
                "column_check_constraints": col.get("column_check_constraints", {}),
            }

        return result

    def describe(self) -> None:
        """Print a human-readable summary of the config (dev/test helper)."""
        print(f"Table: {self.full_table_name}")
        print(f"  Owner: {self.owner}")
        print(f"  Tags: {self.tags}")
        print(f"  Primary Key: {self.primary_key}")
        print(f"  Partitioned By: {self.partitioned_by}")
        print(f"  Unique Keys: {self.unique_keys}")
        print(f"  Foreign Keys: {self.foreign_keys}")
        print(f"  Table Check Constraints: {self.table_check_constraints}")
        print(f"  Row Filters: {self.row_filters}")
        print(f"  Table Properties: {self.table_properties}")
        print("  Columns:")
        for i, col in enumerate(self.columns, 1):
            print(
                f"    {i}: {col.get('name','')} ({col.get('datatype','')}, nullable={col.get('nullable', True)}) | "
                f"comment={col.get('comment','')}, tags={col.get('tags',{})}, active={col.get('active', True)}"
            )
            ccc = col.get("column_check_constraints", {})
            if ccc:
                print(f"      Column Check Constraints: {ccc}")

In [0]:

# Path of the example YAML config to load.
yaml_path = "layker/resources/example.yaml"

# env is appended to the catalog name when the configured catalog ends with "_".
cfg = TableSchemaConfig(yaml_path, env="dev")
table_meta = cfg.build_table_metadata_dict()
print(table_meta)

#import pprint
#pprint.pprint(table_meta, width=120)