# Data Quality Validation

This notebook runs the SQL data quality checks stored in `sql/validation/` (orders, customers, and deliveries) against the SQLite database at `data/ecommerce.db`.

## Validation Categories
- Duplicate detection
- Value validation (NULLs, invalid enums, impossible values)
- Temporal inconsistencies
- Referential integrity
- Business logic validation


In [None]:
import re
import sqlite3

import pandas as pd

# Connect to database.
# The connection is opened through a SQLite URI with mode=rw so that a wrong
# path raises sqlite3.OperationalError right here. A plain
# sqlite3.connect(db_path) would silently create a new, empty database file and
# every check below would then only report "no such table" errors.
# NOTE: db_path is interpolated into a URI, so a path containing "?", "#" or "%"
# would need percent-encoding first.
db_path = "data/ecommerce.db"
conn = sqlite3.connect(f"file:{db_path}?mode=rw", uri=True)
print(f"Connected to database: {db_path}")


## Run Orders Validation Checks


In [None]:
# Read the orders validation SQL script
with open("sql/validation/orders_checks.sql", "r") as f:
    orders_checks = f.read()

# Split the script into one chunk per check. A chunk starts at a comment line
# beginning with "-- Check" and runs up to the next such line; any text before
# the first "-- Check" line is ignored.
checks = []
current_check = ""
for line in orders_checks.split('\n'):
    if line.strip().startswith('-- Check'):
        if current_check:
            checks.append(current_check)
        current_check = line + '\n'
    elif current_check:
        current_check += line + '\n'
if current_check:
    checks.append(current_check)

# Run each check
print("=" * 60)
print("ORDERS VALIDATION CHECKS")
print("=" * 60)
for check in checks:
    # Skip chunks that do not contain the word SELECT anywhere (case-insensitive).
    if 'SELECT' not in check.upper():
        continue

    # Every chunk begins with its own "-- Check" header line, so the first line
    # of the chunk is the check's name. It is printed before the query runs so
    # that an error message below can be attributed to a specific check.
    check_name = check.split('\n', 1)[0]
    print(f"\n{check_name}")

    try:
        result = pd.read_sql_query(check, conn)
    except Exception as e:
        # Deliberately broad: one broken query must not stop the remaining checks.
        print(f"Error running check: {e}")
        continue

    print(f"Rows returned: {len(result)}")
    if len(result) > 0:
        # A check passes when its query returns no rows; any row is a finding.
        print("⚠️  ISSUE DETECTED:")
        print(result.head(10))
    else:
        print("✓ Passed")


In [None]:
# Read the customers validation SQL script
with open("sql/validation/customers_checks.sql", "r") as f:
    customers_checks = f.read()

# Drop whole-line "--" comments before searching, so that the word "select"
# appearing in a comment cannot be mistaken for the start of a query.
customers_sql = "\n".join(
    line for line in customers_checks.split("\n")
    if not line.lstrip().startswith("--")
)

# Extract the SELECT statements. SELECT must be a whole word (so identifiers
# that merely contain "select" do not start a match), and a statement ends at
# the next ";" or at the end of the script, so a final query without a
# trailing ";" is still executed instead of being silently skipped.
select_queries = [
    query.strip()
    for query in re.findall(
        r'\bSELECT\b.*?(?:;|\Z)', customers_sql, re.DOTALL | re.IGNORECASE
    )
]

print("=" * 60)
print("CUSTOMERS VALIDATION CHECKS")
print("=" * 60)
for i, query in enumerate(select_queries, 1):
    # Print the check number first so that an error below can be attributed to
    # a specific check.
    print(f"\nCheck {i}")
    try:
        result = pd.read_sql_query(query, conn)
    except Exception as e:
        # Deliberately broad: one broken query must not stop the remaining checks.
        print(f"Error: {e}")
        continue

    print(f"Rows returned: {len(result)}")
    if len(result) > 0:
        # A check passes when its query returns no rows; any row is a finding.
        print("⚠️  ISSUE DETECTED:")
        print(result.head(10))
    else:
        print("✓ Passed")


## Run Deliveries Validation Checks


In [None]:
# Read the deliveries validation SQL script
with open("sql/validation/deliveries_checks.sql", "r") as f:
    deliveries_checks = f.read()

# Drop whole-line "--" comments before searching, so that the word "select"
# appearing in a comment cannot be mistaken for the start of a query.
deliveries_sql = "\n".join(
    line for line in deliveries_checks.split("\n")
    if not line.lstrip().startswith("--")
)

# Extract the SELECT statements. SELECT must be a whole word (so identifiers
# that merely contain "select" do not start a match), and a statement ends at
# the next ";" or at the end of the script, so a final query without a
# trailing ";" is still executed instead of being silently skipped.
select_queries = [
    query.strip()
    for query in re.findall(
        r'\bSELECT\b.*?(?:;|\Z)', deliveries_sql, re.DOTALL | re.IGNORECASE
    )
]

print("=" * 60)
print("DELIVERIES VALIDATION CHECKS")
print("=" * 60)
for i, query in enumerate(select_queries, 1):
    # Print the check number first so that an error below can be attributed to
    # a specific check.
    print(f"\nCheck {i}")
    try:
        result = pd.read_sql_query(query, conn)
    except Exception as e:
        # Deliberately broad: one broken query must not stop the remaining checks.
        print(f"Error: {e}")
        continue

    print(f"Rows returned: {len(result)}")
    if len(result) > 0:
        # A check passes when its query returns no rows; any row is a finding.
        print("⚠️  ISSUE DETECTED:")
        print(result.head(10))
    else:
        print("✓ Passed")


## Summary

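All validation checks completed. Review any issues detected above, and also any lines beginning with "Error": a check whose query failed to execute is only reported as an error and does not stop the notebook, so it has not actually validated anything.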


In [None]:
# Close the SQLite connection opened at the top of the notebook. Once this has
# run, the validation cells above cannot query the database again until the
# connection cell is re-executed.
conn.close()
print("Database connection closed")
