In [0]:
from pyspark.sql import functions as F
# Load the three gold-layer tables exercised by the data-quality checks.
# Each name goes through the same Delta reader chain; the generator yields the
# DataFrames in the order of the names on the left-hand side.
dim_customer, dim_product, fact_sales = (
    spark.read.format("delta").table(qualified_name)
    for qualified_name in (
        "workspace.gold.dim_customers",
        "workspace.gold.dim_products",
        "workspace.gold.fact_sales",
    )
)

# ==========================================
# TEST 1: Completeness (No Null Primary Keys)
# ==========================================
# Every row of dim_customer / dim_product must have its key column populated.
print("Running Test 1: Completeness...")

# Count the rows whose key column is NULL in each dimension table.
null_customers = dim_customer.filter(F.col("customer_id").isNull()).count()
null_products = dim_product.filter(F.col("product_id").isNull()).count()

# The failure messages name the columns that are filtered above
# (customer_id / product_id) so a failure points at the field that was checked.
assert null_customers == 0, f"Completeness Failed: Found {null_customers} null customer_id(s) in dim_customer!"
assert null_products == 0, f"Completeness Failed: Found {null_products} null product_id(s) in dim_product!"

In [0]:
print("Running Test 2: Uniqueness...")

# If every row of dim_customer has its own customer_id, the total row count
# and the number of distinct customer_id values must be the same.
customer_id_column = dim_customer.select(F.col("customer_id"))
total_cust_rows = dim_customer.count()
distinct_cust_keys = customer_id_column.distinct().count()

assert distinct_cust_keys == total_cust_rows, (
    "Uniqueness Failed: dim_customer has duplicate customer_id! "
    f"Total rows: {total_cust_rows}, Distinct IDs: {distinct_cust_keys}"
)

In [0]:
print("Running Test 3: Validity...")

# sales_amount in fact_sales must never be negative. Rows where the column is
# NULL do not satisfy `< 0`, so they are not counted by this check.
negative_sales = fact_sales.filter(F.col("sales_amount") < 0).count()
assert negative_sales == 0, f"Validity Failed: Found {negative_sales} row(s) in fact_sales with negative sales!"

# Same rule for the `price` column of fact_sales.
# NOTE(review): this check reads fact_sales.price, not a cost column on the
# product dimension. If product cost is meant to be validated as well, that
# needs its own check against dim_product — confirm the intended column.
# The variable keeps its existing name in case other cells read it.
negative_costs = fact_sales.filter(F.col("price") < 0).count()
assert negative_costs == 0, f"Validity Failed: Found {negative_costs} row(s) in fact_sales with negative price!"

In [0]:
print("Running Test 4: Volume...")

# Filters over an empty table count zero offending rows, so an empty fact_sales
# has to be rejected explicitly: require at least one row.
sales_row_count = fact_sales.count()
assert sales_row_count > 0, "Volume Failed: fact_sales table is completely empty!"

# Reached only when no assertion above raised. "\u2705" is the check-mark emoji,
# written as an escape so the banner survives a re-save of the notebook in a
# different text encoding.
print("\u2705 All Data Quality Checks Passed Successfully! Data is ready for BI consumption.")