In [0]:
# Notebook parameters: the dataset to audit and the base path of the bronze layer.
dbutils.widgets.text("dataset_name", "")
dbutils.widgets.text("bronze_path", "")

dataset_name = dbutils.widgets.get("dataset_name")
bronze_base_path = dbutils.widgets.get("bronze_path")

# Both widgets default to "", which would otherwise build a path such as "//"
# or "/<dataset>/" and make Spark read the wrong location (or fail with an
# unrelated-looking error). Fail fast with a message naming the missing input.
missing_params = [
    widget_name
    for widget_name, widget_value in (
        ("dataset_name", dataset_name),
        ("bronze_path", bronze_base_path),
    )
    if not widget_value.strip()
]
if missing_params:
    raise ValueError(
        f"Missing required notebook parameter(s): {', '.join(missing_params)}"
    )

# Trailing slashes on the base path are dropped so the join never yields "//".
path = f"{bronze_base_path.rstrip('/')}/{dataset_name}/"
df = spark.read.format("parquet").load(path)

# normalize column names (lower-case) so comparisons against the audit
# history are case-insensitive
df = df.toDF(*[c.lower() for c in df.columns])

In [0]:
# Snapshot of the loaded frame as (column name, Spark type string) pairs,
# in schema order.
current_schema = list(df.dtypes)

In [0]:
from pyspark.sql.functions import col

# Audit history table that this notebook both reads (here) and appends to.
# NOTE(review): "audit_tabales" looks like a misspelling of "audit_tables";
# it is kept as-is because it must match the name the table is written under.
audit_table_name = "config_catalog.audit_tabales.dq_schema_null_audit"

if spark.catalog.tableExists(audit_table_name):
    # Distinct column names previously recorded for this dataset.
    prev_schema_df = (
        spark.table(audit_table_name)
        .filter(col("dataset_name") == dataset_name)
        .select("column_name")
        .distinct()
    )
else:
    # First run: the table has not been created yet, so reading it would
    # raise. Use an empty history with the same single column instead;
    # every current column is then treated as new.
    prev_schema_df = spark.createDataFrame([], "column_name string")

In [0]:
# Column names in this load vs. names already recorded for the dataset.
current_cols = {name for name, _dtype in current_schema}
previous_cols = {row.column_name for row in prev_schema_df.collect()}

# Schema drift in both directions.
new_columns = current_cols.difference(previous_cols)
removed_columns = previous_cols.difference(current_cols)

In [0]:
from pyspark.sql.functions import count, when, lit, current_timestamp, input_file_name, current_date
from datetime import datetime, date

# Profile the dataset in ONE Spark job: the total row count plus a null count
# per column. (Filtering and counting column by column would rescan the data
# once for every column.)
#
# Column names are backtick-quoted so that a name containing a dot or space is
# resolved as a literal top-level column rather than parsed as struct access.
# Aggregates are aliased by position to avoid any clash with real column names.
null_count_exprs = [
    count(
        when(col("`" + name.replace("`", "``") + "`").isNull(), 1)
    ).alias(f"nulls_{idx}")
    for idx, (name, _dtype) in enumerate(current_schema)
]
profile = df.agg(count(lit(1)).alias("total_rows"), *null_count_exprs).first()

total_count = profile[0]

# One date/timestamp for the whole batch so all rows of a run share it.
audit_date = date.today()
ingestion_time = datetime.now()

# One row per column present in the current load. Position idx + 1 in
# `profile` holds that column's null count (position 0 is the total).
audit_rows = [
    (
        dataset_name,
        name,
        dtype,
        profile[idx + 1],
        total_count,
        name in new_columns,
        False,  # a column that is present cannot be a removed column
        audit_date,
        ingestion_time,
        None,  # source_file is not populated
    )
    for idx, (name, dtype) in enumerate(current_schema)
]

# Removed columns are by definition absent from `current_schema`, so they need
# their own rows; otherwise is_removed_column could never be True. There is no
# data to profile for them, hence no data type and no null count.
audit_rows.extend(
    (
        dataset_name,
        name,
        None,
        None,
        total_count,
        False,
        True,
        audit_date,
        ingestion_time,
        None,
    )
    for name in sorted(removed_columns)
)

In [0]:
from pyspark.sql.types import *

# Layout of one audit record. Every field is nullable.
schema = (
    StructType()
    .add("dataset_name", StringType(), True)
    .add("column_name", StringType(), True)
    .add("data_type", StringType(), True)
    .add("null_count", LongType(), True)
    .add("total_count", LongType(), True)
    .add("is_new_column", BooleanType(), True)
    .add("is_removed_column", BooleanType(), True)
    .add("audit_date", DateType(), True)
    .add("ingestion_time", TimestampType(), True)
    .add("source_file", StringType(), True)
)

In [0]:
# Always materialise `audit_df`, even when there is nothing to report.
# Leaving it unassigned in the empty case makes the following write step fail
# with a NameError, or, on a reused kernel, append a stale frame left over
# from an earlier run. An explicit schema makes an empty list a valid input.
if not audit_rows:
    print("No audit rows generated")

audit_df = spark.createDataFrame(audit_rows, schema)

In [0]:
# Append this run's audit records to the history table. The write is skipped
# when no rows were generated: in that case `audit_df` may be undefined or may
# still hold a frame from a previous run on the same kernel.
if audit_rows:
    audit_df.write.format("delta") \
        .mode("append") \
        .saveAsTable("config_catalog.audit_tabales.dq_schema_null_audit")
else:
    print("No audit rows to write; skipping append")