# Data Validation and Cross-Reference Checks 

**This notebook performs comprehensive data validation across all staging tables and applies business rules.**

In [0]:
# Fully qualified names of the staging tables used by this notebook and of the
# validation results table. Every name shares the same catalog/schema prefix.
_catalog_schema = "`event-driven-catalog`.default"

# NOTE(review): the orders table is `order_stage` (singular), unlike its
# siblings — keep as-is unless the table is renamed upstream.
orders_stage = f"{_catalog_schema}.order_stage"
customers_stage = f"{_catalog_schema}.customers_stage"
products_stage = f"{_catalog_schema}.products_stage"
inventory_stage = f"{_catalog_schema}.inventory_stage"
shipping_stage = f"{_catalog_schema}.shipping_stage"
validation_results_table = f"{_catalog_schema}.validation_results"

print("Starting comprehensive data validation process...")

Starting comprehensive data validation process...


In [0]:
# Import required libraries
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime
import json

# Load each staging table into its own DataFrame, then report the row counts.
# Any failure is logged and re-raised so the notebook run stops here.
try:
    # spark.table(name) is shorthand for spark.read.table(name)
    df_orders = spark.table(orders_stage)
    df_customers = spark.table(customers_stage)
    df_products = spark.table(products_stage)
    df_inventory = spark.table(inventory_stage)
    df_shipping = spark.table(shipping_stage)

    print("Successfully loaded all staging tables")

    # Each count() below triggers a Spark action on the corresponding table
    print(f"Orders: {df_orders.count()} records")
    print(f"Customers: {df_customers.count()} records")
    print(f"Products: {df_products.count()} records")
    print(f"Inventory: {df_inventory.count()} records")
    print(f"Shipping: {df_shipping.count()} records")

except Exception as e:
    print(f"Error loading staging tables: {str(e)}")
    raise


Successfully loaded all staging tables
Orders: 20 records
Customers: 20 records
Products: 20 records
Inventory: 20 records
Shipping: 20 records


In [0]:
# Referential checks between orders and customers, plus an order-amount sanity check
try:
    # Orders whose customer_id has no match in the customers table
    orphaned_orders = df_orders.join(df_customers, on="customer_id", how="left_anti")
    orphaned_orders_count = orphaned_orders.count()

    # Customers whose customer_id never appears in the orders table
    orphaned_customers = df_customers.join(df_orders, on="customer_id", how="left_anti")
    orphaned_customers_count = orphaned_customers.count()

    print(f"Orphaned orders (no valid customer): {orphaned_orders_count}")
    print(f"Orphaned customers (no orders): {orphaned_customers_count}")

    # Amounts outside the inclusive range [1, 10000] are treated as unreasonable.
    # Rows with a NULL order_amount evaluate to NULL here and are not counted.
    unreasonable_orders = df_orders.where(
        ~F.col("order_amount").between(1, 10000)
    )
    unreasonable_orders_count = unreasonable_orders.count()

    print(f"Orders with unreasonable amounts: {unreasonable_orders_count}")

except Exception as e:
    print(f"Error in orders-customers validation: {str(e)}")
    raise


Orphaned orders (no valid customer): 0
Orphaned customers (no orders): 0
Orders with unreasonable amounts: 0


In [0]:
# Referential checks between orders and products (both directions)
try:
    # Orders whose product_id has no match in the products table
    orphaned_orders_products = df_orders.join(
        df_products, on="product_id", how="left_anti"
    )
    orphaned_orders_products_count = orphaned_orders_products.count()

    # Products whose product_id never appears in the orders table
    orphaned_products = df_products.join(
        df_orders, on="product_id", how="left_anti"
    )
    orphaned_products_count = orphaned_products.count()

    print(f"Orders with invalid products: {orphaned_orders_products_count}")
    print(f"Products without orders: {orphaned_products_count}")

except Exception as e:
    print(f"Error in orders-products validation: {str(e)}")
    raise


Orders with invalid products: 0
Products without orders: 0


In [0]:
# Referential checks between orders and shipping, plus a shipping-cost sanity check
try:
    # Orders whose order_id has no matching shipping row
    orders_without_shipping = df_orders.join(
        df_shipping, on="order_id", how="left_anti"
    )
    orders_without_shipping_count = orders_without_shipping.count()

    # Shipping rows whose order_id has no matching order
    shipping_without_orders = df_shipping.join(
        df_orders, on="order_id", how="left_anti"
    )
    shipping_without_orders_count = shipping_without_orders.count()

    print(f"Orders without shipping: {orders_without_shipping_count}")
    print(f"Shipping without orders: {shipping_without_orders_count}")

    # Costs outside the inclusive range [0, 100] are treated as unreasonable.
    # Rows with a NULL shipping_cost evaluate to NULL here and are not counted.
    unreasonable_shipping = df_shipping.where(
        ~F.col("shipping_cost").between(0, 100)
    )
    unreasonable_shipping_count = unreasonable_shipping.count()

    print(f"Shipping with unreasonable costs: {unreasonable_shipping_count}")

except Exception as e:
    print(f"Error in orders-shipping validation: {str(e)}")
    raise


Orders without shipping: 0
Shipping without orders: 0
Shipping with unreasonable costs: 0


In [0]:
# Referential checks between products and inventory (both directions)
try:
    # Products whose product_id has no matching inventory row
    products_without_inventory = df_products.join(
        df_inventory, on="product_id", how="left_anti"
    )
    products_without_inventory_count = products_without_inventory.count()

    # Inventory rows whose product_id has no matching product
    inventory_without_products = df_inventory.join(
        df_products, on="product_id", how="left_anti"
    )
    inventory_without_products_count = inventory_without_products.count()

    print(f"Products without inventory: {products_without_inventory_count}")
    print(f"Inventory without products: {inventory_without_products_count}")

except Exception as e:
    print(f"Error in products-inventory validation: {str(e)}")
    raise


Products without inventory: 0
Inventory without products: 0


In [0]:
# Business Rules Validation
#
# Counts the rows that violate two business rules and prints both totals.
# Any failure is logged and re-raised so the notebook run stops here.
try:
    # Rule 1: Premium customers should have higher order values.
    # Join orders to their customer and keep only the "premium" tier.
    premium_customers_orders = (
        df_orders.join(df_customers, "customer_id", "inner")
        .filter(F.col("customer_tier") == "premium")
    )

    # Premium orders below 100 are flagged as low-value
    low_value_premium_orders = premium_customers_orders.filter(F.col("order_amount") < 100)
    low_value_premium_count = low_value_premium_orders.count()

    # Rule 2: Orders should be processed within business hours (8 AM - 6 PM).
    # The window is treated as [08:00, 18:00): hour() == 18 covers 18:00-18:59,
    # which is after 6 PM, so the upper bound must be ">= 18" rather than "> 18".
    # NOTE(review): hour() is evaluated in the Spark session time zone — confirm
    # that matches the intended business time zone.
    order_hour = F.hour(F.col("created_timestamp"))
    orders_outside_hours = df_orders.filter(
        (order_hour < 8) | (order_hour >= 18)
    )
    orders_outside_hours_count = orders_outside_hours.count()

    print(f"Premium customers with low-value orders: {low_value_premium_count}")
    print(f"Orders outside business hours: {orders_outside_hours_count}")

except Exception as e:
    print(f"Error in business rules validation: {str(e)}")
    raise


Premium customers with low-value orders: 1
Orders outside business hours: 9
