# 02_metadata_layer
Purpose: Load metadata, validate Bronze tables, apply rules, generate a quality report.
Author: Janak  


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from delta import *
import json
import os

# Storage locations: the JSON metadata definition and the root folder of the
# Bronze Delta tables.
metadata_path = "src/metadata/metadata_schema.json"
bronze_base = "/tmp/delta/bronze"

# Reuse the active Spark session when there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

print("Spark initialized, Bronze path:", bronze_base)


In [0]:
# Load the metadata JSON that describes each Bronze table.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale (JSON interchange text is UTF-8).
with open(metadata_path, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Bare expression: the notebook renders the loaded metadata as the cell output.
metadata


In [0]:
# Print a short overview (columns, owner, freshness, rule types) per table
separator = "-" * 50

for table in metadata:
    details = metadata[table]
    print(f"Table: {table}")
    print("  Columns:", [*details['schema'].keys()])
    print("  Owner:", details.get("owner"))
    print("  Freshness Hours:", details.get("freshness_hours"))
    # Tables without a "quality_rules" entry simply list no rules.
    print("  Quality Rules:", [*details.get("quality_rules", {}).keys()])
    print(separator)


In [0]:
# Read every table named in the metadata from its Bronze Delta folder and keep
# the DataFrames keyed by table name; a 5-row preview is shown for each one.
bronze_dfs = {}

for table in metadata:
    delta_path = f"{bronze_base}/{table}"
    print(f"Loading Bronze table: {table} from {delta_path}")
    bronze_dfs[table] = spark.read.format("delta").load(delta_path)
    df = bronze_dfs[table]
    display(df.limit(5))


In [0]:
def validate_not_null(df, cols):
    """Count NULL values in each of the given columns.

    All columns are evaluated in a single aggregation, so the table is
    scanned once instead of once per column.

    Args:
        df: Spark DataFrame to inspect.
        cols: Names of the columns that must not contain NULLs.

    Returns:
        dict mapping each column name to its number of NULL rows; an empty
        dict when ``cols`` is empty.
    """
    cols = list(cols)
    if not cols:
        # Nothing to check, and agg() needs at least one expression.
        return {}

    # Local import keeps this helper independent of the file's wildcard import.
    from pyspark.sql.functions import count, when

    # count() skips NULL inputs and when() without otherwise() yields NULL for
    # non-matching rows, so each expression counts exactly the NULL rows.
    null_counts = df.agg(
        *[count(when(df[c].isNull(), 1)) for c in cols]
    ).first()

    # The aggregate row is positional, in the same order as ``cols``.
    return dict(zip(cols, null_counts))

def validate_positive(df, cols):
    """Count negative values in each of the given columns.

    Only values strictly below zero are counted; zeros and NULLs are not
    reported by this check. All columns are evaluated in a single
    aggregation, so the table is scanned once instead of once per column.

    Args:
        df: Spark DataFrame to inspect.
        cols: Names of the numeric columns to check.

    Returns:
        dict mapping each column name to its number of negative rows; an
        empty dict when ``cols`` is empty.
    """
    cols = list(cols)
    if not cols:
        # Nothing to check, and agg() needs at least one expression.
        return {}

    # Local import keeps this helper independent of the file's wildcard import.
    from pyspark.sql.functions import col, count, when

    # when() without otherwise() yields NULL unless the value is negative, and
    # count() skips NULLs, so each expression counts only the negative rows.
    negative_counts = df.agg(
        *[count(when(col(c) < 0, 1)) for c in cols]
    ).first()

    # The aggregate row is positional, in the same order as ``cols``.
    return dict(zip(cols, negative_counts))

def validate_enum(df, enum_dict):
    """Count values that fall outside each column's accepted set.

    NULL values are not counted as invalid: the membership test evaluates to
    NULL for them, which does not satisfy the condition. All columns are
    evaluated in a single aggregation, so the table is scanned once instead
    of once per column.

    Args:
        df: Spark DataFrame to inspect.
        enum_dict: Mapping of column name to the list of accepted values.

    Returns:
        dict mapping each column name to its number of rows holding a value
        outside the accepted list; an empty dict when ``enum_dict`` is empty.
    """
    if not enum_dict:
        # Nothing to check, and agg() needs at least one expression.
        return {}

    # Local import keeps this helper independent of the file's wildcard import.
    from pyspark.sql.functions import col, count, when

    col_names = list(enum_dict)
    # when() without otherwise() yields NULL unless the value is outside the
    # accepted list, and count() skips NULLs.
    invalid_exprs = [
        count(when(~col(name).isin(enum_dict[name]), 1)) for name in col_names
    ]
    invalid_counts = df.agg(*invalid_exprs).first()

    # The aggregate row is positional, in the same order as ``col_names``.
    return dict(zip(col_names, invalid_counts))

def cast_schema(df, schema_dict):
    """Cast the listed columns of ``df`` to their declared types.

    Args:
        df: Spark DataFrame whose columns should be cast.
        schema_dict: Mapping of column name to the target type passed to
            ``Column.cast``.

    Returns:
        The DataFrame with every column named in ``schema_dict`` replaced by
        its cast version; ``df`` itself when ``schema_dict`` is empty.
    """
    casted = df
    for name in schema_dict:
        target_type = schema_dict[name]
        # withColumn with an existing name replaces that column in place.
        casted = casted.withColumn(name, col(name).cast(target_type))
    return casted


In [0]:
# Rule name as it appears in the metadata -> validator implementing it.
# The insertion order fixes the order in which results are recorded.
rule_validators = {
    "not_null": validate_not_null,
    "positive_values": validate_positive,
    "accepted_values": validate_enum,
}

validation_results = {}

for table, meta in metadata.items():
    print(f"Validating table: {table}")
    df = bronze_dfs[table]

    # Schema enforcement: cast the Bronze columns to the declared types
    # before any rule is evaluated.
    expected_schema = meta["schema"]
    df_casted = cast_schema(df, expected_schema)

    # Run only the rule types that this table's metadata configures.
    rules = meta.get("quality_rules", {})
    table_results = {
        rule_name: validator(df_casted, rules[rule_name])
        for rule_name, validator in rule_validators.items()
        if rule_name in rules
    }

    validation_results[table] = table_results
    print("Validation:", table_results)
    print("-" * 80)


In [0]:
# Normalize results: flatten the nested dicts
# {table: {rule_type: {column: issue_count}}} into one row per check.
rows = []

for table, checks in validation_results.items():
    for rule_type, result_dict in checks.items():
        for col_name, issue_count in result_dict.items():
            rows.append((table, rule_type, col_name, issue_count))

# An explicit schema is supplied instead of bare column names: with names only,
# Spark has to infer the types from the data and fails when `rows` is empty
# (for example when no table defines any quality rule).
validation_df = spark.createDataFrame(
    rows,
    "table_name string, rule_type string, column_name string, issue_count long",
)

display(validation_df)


In [0]:
# Persist the quality report as a Delta table; "overwrite" mode replaces
# whatever an earlier run stored at the same path.
quality_report_path = "/tmp/delta/quality_report"

report_writer = validation_df.write.format("delta").mode("overwrite")
report_writer.save(quality_report_path)
print("Saved quality report to:", quality_report_path)


In [0]:
# The fire emoji (U+1F525) is written as an escape sequence: the literal
# characters had been corrupted into mojibake (UTF-8 bytes decoded as cp1252),
# and escapes keep this source line ASCII-only.
print("\U0001F525 QUALITY REPORT SUMMARY \U0001F525")

# Total issues per (table, rule type), largest totals first.
# NOTE: `sum` and `desc` here are the Spark column functions from the wildcard
# import of pyspark.sql.functions, not Python builtins.
summary = (
    validation_df.groupBy("table_name", "rule_type")
    .agg(sum("issue_count").alias("total_issues"))
    .orderBy(desc("total_issues"))
)

display(summary)


In [0]:
# Expose the report to Spark SQL and list every check, highest issue count
# first.
validation_df.createOrReplaceTempView("quality_report_view")

ranked_issues_sql = """
SELECT table_name, rule_type, column_name, issue_count
FROM quality_report_view
ORDER BY issue_count DESC
"""
spark.sql(ranked_issues_sql).show()
