# Test Suite: Bronze Job Registration and Load Manifest

**Purpose:** Validate bronze.load_jobs metadata registry population and configuration

**Scope:**
- Job registration completeness (all bronze tables registered)
- File path correctness (proper formatting, no trailing slashes)
- Load order sequencing (CRM before ERP)
- Table discovery accuracy
- Configuration dependency validation
- Idempotency verification

**Testing Strategy:**
- Registration validation (all tables have job records)
- Path validation (correct base paths, proper CSV extensions)
- Order validation (CRM = 0-999, ERP = 1000+)
- Naming convention validation (source_dataset pattern)
- Configuration validation (etl_config values used correctly)
- Idempotency validation (re-runs don't create duplicates)

**Prerequisites:**
- PostgreSQL server running
- sql_retail_analytics_warehouse database exists
- bronze schema exists
- `setup/seed/02_register_bronze_jobs.sql` has been executed
- Bronze tables created (crm_*, erp_*)
- Connection credentials available (the password is read from the `POSTGRES_PASSWORD` environment variable)
- Required packages: psycopg2, pytest, ipytest, pandas

## Setup: Import Dependencies & Configure Connection

In [None]:
import os
import psycopg2
from psycopg2 import sql
import pytest
import ipytest
import pandas as pd
import re

# Configure ipytest for notebook usage.
# NOTE(review): autoconfig() presumably applies ipytest's default notebook
# settings that the `%%ipytest` cell magics used below rely on — confirm
# against the installed ipytest version.
ipytest.autoconfig()

# Database connection parameters.
# Every value can be overridden through an environment variable so the suite
# can be pointed at a non-local server without editing the notebook; the
# fallbacks are the values that used to be hard-coded here.
DB_CONFIG = {
    'host': os.getenv('POSTGRES_HOST', 'localhost'),
    'database': os.getenv('POSTGRES_DB', 'sql_retail_analytics_warehouse'),
    'user': os.getenv('POSTGRES_USER', 'postgres'),
    'password': os.getenv('POSTGRES_PASSWORD', 'your_password_here'),
}

# Expected source systems (prefix of the table name after "bronze.")
EXPECTED_SOURCES = ['crm', 'erp']

# Load order boundaries: CRM jobs occupy 0..CRM_MAX_ORDER, ERP jobs start at
# ERP_MIN_ORDER, so every CRM job sorts before every ERP job.
CRM_MAX_ORDER = 999
ERP_MIN_ORDER = 1000

print("✅ Dependencies imported successfully")

## Fixtures: Database Connections

In [None]:
@pytest.fixture(scope='module')
def db_connection():
    """Yield one psycopg2 connection, built from ``DB_CONFIG``, per module.

    The connection is switched to autocommit before it is handed to the
    tests and is closed during fixture teardown.
    """
    warehouse = psycopg2.connect(**DB_CONFIG)
    # Autocommit: every statement takes effect immediately, no commit() calls.
    warehouse.autocommit = True
    yield warehouse
    warehouse.close()

@pytest.fixture(scope='module')
def db_cursor(db_connection):
    """Yield a single cursor opened on ``db_connection``; closed on teardown."""
    warehouse_cursor = db_connection.cursor()
    yield warehouse_cursor
    warehouse_cursor.close()

# Notebook feedback only: shows that this cell ran and the fixtures above were defined.
print("✅ Fixtures defined")

## Test Suite 1: Job Registration Existence

In [None]:
%%ipytest -vv

def test_load_jobs_table_exists(db_cursor):
    """The registry table ``bronze.load_jobs`` is listed in the catalog."""
    catalog_lookup = """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name = 'load_jobs'
    """
    db_cursor.execute(catalog_lookup)

    (matches,) = db_cursor.fetchone()
    assert matches == 1, "bronze.load_jobs table must exist"

def test_jobs_were_registered(db_cursor):
    """The registry contains at least one job row."""
    row_count_sql = """
        SELECT COUNT(*) FROM bronze.load_jobs
    """
    db_cursor.execute(row_count_sql)

    (registered,) = db_cursor.fetchone()
    assert registered > 0, "At least one job should be registered"

def test_all_bronze_tables_have_jobs(db_cursor):
    """Every bronze base table (except the two metadata tables) has a job row."""
    # Physical tables, qualified the same way the registry stores them
    tables_sql = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_type = 'BASE TABLE'
        AND table_name NOT IN ('load_jobs', 'load_log')
        ORDER BY table_name
    """
    db_cursor.execute(tables_sql)
    physical_tables = {f"bronze.{name}" for (name,) in db_cursor.fetchall()}

    # Names that already have a job
    jobs_sql = """
        SELECT table_name
        FROM bronze.load_jobs
        ORDER BY table_name
    """
    db_cursor.execute(jobs_sql)
    job_targets = {name for (name,) in db_cursor.fetchall()}

    # Anything physical but not registered is a gap
    missing_jobs = physical_tables - job_targets
    assert not missing_jobs, (
        f"These bronze tables are missing job records: {missing_jobs}"
    )

def test_no_extra_jobs_for_nonexistent_tables(db_cursor):
    """No job row points at a table that is absent from the catalog."""
    # No parameters are passed, so the %I format specifiers reach the server as-is.
    orphan_sql = """
        SELECT lj.table_name
        FROM bronze.load_jobs lj
        WHERE NOT EXISTS (
            SELECT 1
            FROM information_schema.tables t
            WHERE format('%I.%I', t.table_schema, t.table_name) = lj.table_name
        )
    """
    db_cursor.execute(orphan_sql)

    orphaned_jobs = [name for (name,) in db_cursor.fetchall()]
    assert not orphaned_jobs, (
        f"These jobs reference non-existent tables: {orphaned_jobs}"
    )

## Test Suite 2: File Path Validation

In [None]:
%%ipytest -vv

def test_all_enabled_jobs_have_file_paths(db_cursor):
    """Enabled jobs must carry a file path that is neither NULL nor blank."""
    blank_path_sql = """
        SELECT table_name
        FROM bronze.load_jobs
        WHERE is_enabled = TRUE
        AND (file_path IS NULL OR TRIM(file_path) = '')
    """
    db_cursor.execute(blank_path_sql)

    missing_paths = [name for (name,) in db_cursor.fetchall()]
    assert not missing_paths, (
        f"These enabled jobs have missing file paths: {missing_paths}"
    )

def test_file_paths_end_with_csv(db_cursor):
    """Every non-NULL file path ends in ``.csv`` (compared case-insensitively)."""
    non_csv_sql = """
        SELECT table_name, file_path
        FROM bronze.load_jobs
        WHERE file_path IS NOT NULL
        AND NOT file_path ILIKE '%.csv'
    """
    db_cursor.execute(non_csv_sql)

    invalid_extensions = db_cursor.fetchall()
    assert not invalid_extensions, (
        f"These jobs have invalid file extensions: {invalid_extensions}"
    )

def test_file_paths_no_trailing_slashes_before_filename(db_cursor):
    """No file path contains a doubled slash such as ``//filename.csv``."""
    double_slash_sql = """
        SELECT table_name, file_path
        FROM bronze.load_jobs
        WHERE file_path LIKE '%//%'
    """
    db_cursor.execute(double_slash_sql)

    double_slash_paths = db_cursor.fetchall()
    assert not double_slash_paths, (
        f"These jobs have double slashes in paths: {double_slash_paths}"
    )

def _jobs_outside_base_path(cursor, table_pattern, base_path):
    """Return (table_name, file_path) rows whose path does not start with base_path.

    Args:
        cursor: open database cursor.
        table_pattern: LIKE pattern selecting the jobs to check. It is passed
            as a parameter so its ``%`` wildcard is not mistaken for a
            placeholder by the driver.
        base_path: prefix every selected job's file_path must start with.
    """
    # strpos(...) = 1 is a literal prefix test. LIKE would treat '_', '%' and
    # '\' inside the configured base path as pattern metacharacters.
    cursor.execute("""
        SELECT table_name, file_path
        FROM bronze.load_jobs
        WHERE table_name LIKE %s
        AND strpos(file_path, %s) <> 1
        ORDER BY table_name
    """, (table_pattern, base_path))
    return cursor.fetchall()


def test_file_paths_use_correct_base_paths(db_cursor):
    """Verify file paths start with correct base paths from etl_config."""
    # Get base paths from config (one row, NULL for a key that is not configured)
    db_cursor.execute("""
        SELECT
            MAX(CASE WHEN config_key = 'base_path_crm' THEN config_value END) AS base_crm,
            MAX(CASE WHEN config_key = 'base_path_erp' THEN config_value END) AS base_erp
        FROM public.etl_config
    """)

    base_crm, base_erp = db_cursor.fetchone()

    # Fail with a clear message instead of comparing paths against "None"
    assert base_crm is not None, "base_path_crm must be configured in etl_config"
    assert base_erp is not None, "base_path_erp must be configured in etl_config"

    # Verify CRM jobs use CRM base path
    wrong_crm_paths = _jobs_outside_base_path(db_cursor, 'bronze.crm_%', base_crm)
    assert not wrong_crm_paths, \
        f"CRM jobs should use base path '{base_crm}': {wrong_crm_paths}"

    # Verify ERP jobs use ERP base path
    wrong_erp_paths = _jobs_outside_base_path(db_cursor, 'bronze.erp_%', base_erp)
    assert not wrong_erp_paths, \
        f"ERP jobs should use base path '{base_erp}': {wrong_erp_paths}"

def test_file_paths_match_table_naming_convention(db_cursor):
    """Verify table bronze.<source>_<dataset> loads from a file named <dataset>.csv.

    Only table names matching the crm/erp pattern are checked; other names are
    ignored here. The comparison is made on the last path component, so a file
    whose name merely ends with the dataset name (e.g. ``x<dataset>.csv``) is
    rejected, and a NULL path fails with a readable message instead of an
    AttributeError.
    """
    db_cursor.execute("""
        SELECT table_name, file_path
        FROM bronze.load_jobs
        ORDER BY table_name
    """)

    table_pattern = re.compile(r'bronze\.(crm|erp)_(.*)')

    for table_name, file_path in db_cursor.fetchall():
        match = table_pattern.match(table_name)
        if match is None:
            continue

        expected_filename = f"{match.group(2)}.csv"

        assert file_path is not None, \
            f"Table '{table_name}' has no file path; expected a file named '{expected_filename}'"

        # Last path component, accepting both '/' and '\' as separators
        actual_filename = re.split(r'[\\/]', file_path)[-1]
        assert actual_filename == expected_filename, \
            f"Table '{table_name}' should map to file ending with '{expected_filename}', got: {file_path}"

## Test Suite 3: Load Order Sequencing

In [None]:
%%ipytest -vv

def test_load_order_exists_for_all_jobs(db_cursor):
    """No job row has a NULL ``load_order``."""
    unordered_sql = """
        SELECT table_name
        FROM bronze.load_jobs
        WHERE load_order IS NULL
    """
    db_cursor.execute(unordered_sql)

    missing_order = [name for (name,) in db_cursor.fetchall()]
    assert not missing_order, (
        f"These jobs are missing load_order: {missing_order}"
    )

def test_crm_tables_have_low_load_order(db_cursor):
    """Verify CRM tables have load_order 0-999 (0..CRM_MAX_ORDER)."""
    # Parameters are passed with this query, so the driver scans it for
    # placeholders: the literal LIKE wildcard must be written as '%%',
    # otherwise execute() raises on the unsupported format character.
    db_cursor.execute("""
        SELECT table_name, load_order
        FROM bronze.load_jobs
        WHERE table_name LIKE 'bronze.crm_%%'
        AND (load_order < 0 OR load_order > %s)
    """, (CRM_MAX_ORDER,))

    invalid_crm_order = db_cursor.fetchall()
    assert not invalid_crm_order, \
        f"CRM tables should have load_order 0-{CRM_MAX_ORDER}: {invalid_crm_order}"

def test_erp_tables_have_high_load_order(db_cursor):
    """Verify ERP tables have load_order 1000+ (>= ERP_MIN_ORDER)."""
    # Parameters are passed with this query, so the driver scans it for
    # placeholders: the literal LIKE wildcard must be written as '%%',
    # otherwise execute() raises on the unsupported format character.
    db_cursor.execute("""
        SELECT table_name, load_order
        FROM bronze.load_jobs
        WHERE table_name LIKE 'bronze.erp_%%'
        AND load_order < %s
    """, (ERP_MIN_ORDER,))

    invalid_erp_order = db_cursor.fetchall()
    assert not invalid_erp_order, \
        f"ERP tables should have load_order >= {ERP_MIN_ORDER}: {invalid_erp_order}"

def test_crm_loads_before_erp(db_cursor):
    """The highest CRM ``load_order`` is below the lowest ERP ``load_order``.

    Nothing is asserted when either source system has no jobs (the aggregate
    is NULL in that case).
    """
    def scalar(query):
        # Run an aggregate query and hand back its single value
        db_cursor.execute(query)
        return db_cursor.fetchone()[0]

    max_crm_order = scalar("""
        SELECT MAX(load_order) AS max_crm_order
        FROM bronze.load_jobs
        WHERE table_name LIKE 'bronze.crm_%'
    """)

    min_erp_order = scalar("""
        SELECT MIN(load_order) AS min_erp_order
        FROM bronze.load_jobs
        WHERE table_name LIKE 'bronze.erp_%'
    """)

    if max_crm_order is None or min_erp_order is None:
        return

    assert max_crm_order < min_erp_order, (
        f"All CRM jobs (max={max_crm_order}) should load before ERP jobs (min={min_erp_order})"
    )

def test_no_duplicate_load_orders(db_cursor):
    """No two jobs share the same ``load_order`` value."""
    tie_sql = """
        SELECT load_order, COUNT(*) as count
        FROM bronze.load_jobs
        GROUP BY load_order
        HAVING COUNT(*) > 1
    """
    db_cursor.execute(tie_sql)

    duplicates = db_cursor.fetchall()
    assert not duplicates, (
        f"These load_order values are duplicated: {duplicates}"
    )

## Test Suite 4: Table Naming Convention Compliance

In [None]:
%%ipytest -vv

def test_all_jobs_follow_naming_pattern(db_cursor):
    """Each registered name has the form ``bronze.<crm|erp>_<dataset>``."""
    all_names_sql = """
        SELECT table_name
        FROM bronze.load_jobs
    """
    db_cursor.execute(all_names_sql)

    # bronze.{source}_{dataset}, with a non-empty dataset part
    naming_rule = re.compile(r'^bronze\.(crm|erp)_.+$')

    for row in db_cursor.fetchall():
        table_name = row[0]
        assert naming_rule.match(table_name), (
            f"Table name '{table_name}' doesn't follow bronze.source_dataset pattern"
        )

def test_table_names_have_schema_qualifier(db_cursor):
    """Every registered name starts with the ``bronze.`` schema prefix."""
    unqualified_sql = """
        SELECT table_name
        FROM bronze.load_jobs
        WHERE table_name NOT LIKE 'bronze.%'
    """
    db_cursor.execute(unqualified_sql)

    unqualified = [name for (name,) in db_cursor.fetchall()]
    assert not unqualified, (
        f"These table names are not schema-qualified: {unqualified}"
    )

def test_only_expected_source_systems(db_cursor):
    """The source prefixes found in the registry are all in ``EXPECTED_SOURCES``."""
    # Source = text between "bronze." and the first underscore
    sources_sql = """
        SELECT DISTINCT
            SPLIT_PART(REPLACE(table_name, 'bronze.', ''), '_', 1) AS source
        FROM bronze.load_jobs
        ORDER BY source
    """
    db_cursor.execute(sources_sql)

    found_sources = {source for (source,) in db_cursor.fetchall()}
    unexpected = found_sources.difference(EXPECTED_SOURCES)

    assert not unexpected, (
        f"Unexpected source systems found: {unexpected}. Expected: {EXPECTED_SOURCES}"
    )

## Test Suite 5: Configuration Dependency Validation

In [None]:
%%ipytest -vv

def test_etl_config_table_exists(db_cursor):
    """The dependency table ``public.etl_config`` is listed in the catalog."""
    catalog_lookup = """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'public'
        AND table_name = 'etl_config'
    """
    db_cursor.execute(catalog_lookup)

    (matches,) = db_cursor.fetchone()
    assert matches == 1, "public.etl_config table must exist"

def test_base_path_crm_configured(db_cursor):
    """``base_path_crm`` has a row in etl_config with a non-NULL, non-blank value."""
    lookup_sql = """
        SELECT config_value
        FROM public.etl_config
        WHERE config_key = 'base_path_crm'
    """
    db_cursor.execute(lookup_sql)

    config_row = db_cursor.fetchone()
    assert config_row is not None, "base_path_crm must be configured in etl_config"

    (configured_path,) = config_row
    assert configured_path is not None, "base_path_crm value cannot be NULL"
    assert configured_path.strip() != "", "base_path_crm cannot be empty"

def test_base_path_erp_configured(db_cursor):
    """``base_path_erp`` has a row in etl_config with a non-NULL, non-blank value."""
    lookup_sql = """
        SELECT config_value
        FROM public.etl_config
        WHERE config_key = 'base_path_erp'
    """
    db_cursor.execute(lookup_sql)

    config_row = db_cursor.fetchone()
    assert config_row is not None, "base_path_erp must be configured in etl_config"

    (configured_path,) = config_row
    assert configured_path is not None, "base_path_erp value cannot be NULL"
    assert configured_path.strip() != "", "base_path_erp cannot be empty"

def test_base_paths_no_trailing_slashes(db_cursor):
    """Neither configured base path ends with ``/`` (project convention)."""
    trailing_sql = """
        SELECT config_key, config_value
        FROM public.etl_config
        WHERE config_key IN ('base_path_crm', 'base_path_erp')
        AND config_value LIKE '%/'
    """
    db_cursor.execute(trailing_sql)

    trailing_slashes = db_cursor.fetchall()
    assert not trailing_slashes, (
        f"Base paths should NOT have trailing slashes: {trailing_slashes}"
    )

def test_seed_load_jobs_procedure_exists(db_cursor):
    """A routine named ``seed_load_jobs`` exists in the ``setup`` schema."""
    routine_sql = """
        SELECT COUNT(*)
        FROM pg_proc p
        JOIN pg_namespace n ON p.pronamespace = n.oid
        WHERE n.nspname = 'setup'
        AND p.proname = 'seed_load_jobs'
    """
    db_cursor.execute(routine_sql)

    (matches,) = db_cursor.fetchone()
    assert matches > 0, "setup.seed_load_jobs() procedure must exist"

## Test Suite 6: Job Metadata Quality

In [None]:
%%ipytest -vv

def test_no_duplicate_table_names(db_cursor):
    """No table name appears more than once in the registry."""
    repeated_sql = """
        SELECT table_name, COUNT(*) as count
        FROM bronze.load_jobs
        GROUP BY table_name
        HAVING COUNT(*) > 1
    """
    db_cursor.execute(repeated_sql)

    duplicates = db_cursor.fetchall()
    assert not duplicates, (
        f"Duplicate table entries found: {duplicates}"
    )

def test_is_enabled_is_boolean(db_cursor):
    """No job row has a NULL ``is_enabled`` flag."""
    null_flag_sql = """
        SELECT table_name, is_enabled
        FROM bronze.load_jobs
        WHERE is_enabled IS NULL
    """
    db_cursor.execute(null_flag_sql)

    null_enabled = db_cursor.fetchall()
    assert not null_enabled, (
        f"These jobs have NULL is_enabled values: {null_enabled}"
    )

def test_at_least_one_job_enabled(db_cursor):
    """At least one job row has ``is_enabled = TRUE``."""
    enabled_sql = """
        SELECT COUNT(*)
        FROM bronze.load_jobs
        WHERE is_enabled = TRUE
    """
    db_cursor.execute(enabled_sql)

    (enabled_count,) = db_cursor.fetchone()
    assert enabled_count > 0, "At least one job should be enabled"

def test_table_name_is_primary_key(db_cursor):
    """Verify the primary key of bronze.load_jobs is exactly (table_name).

    All key columns are fetched in key order and compared as a list, so a
    composite primary key that merely contains table_name is rejected.
    """
    db_cursor.execute("""
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON kcu.constraint_schema = tc.constraint_schema
         AND kcu.constraint_name = tc.constraint_name
         AND kcu.table_schema = tc.table_schema
         AND kcu.table_name = tc.table_name
        WHERE tc.table_schema = 'bronze'
        AND tc.table_name = 'load_jobs'
        AND tc.constraint_type = 'PRIMARY KEY'
        ORDER BY kcu.ordinal_position
    """)

    pk_columns = [column for (column,) in db_cursor.fetchall()]

    # No rows at all means the table has no primary key constraint
    assert pk_columns, "bronze.load_jobs should have a primary key"
    assert pk_columns == ['table_name'], \
        f"Primary key should be on table_name column, got: {pk_columns}"

## Test Suite 7: Idempotency Validation

In [None]:
%%ipytest -vv

def test_re_registration_is_safe(db_cursor):
    """Calling ``setup.seed_load_jobs()`` again leaves the job count unchanged."""
    def job_count():
        # Current number of rows in the registry
        db_cursor.execute("SELECT COUNT(*) FROM bronze.load_jobs")
        return db_cursor.fetchone()[0]

    count_before = job_count()

    # Second registration pass
    db_cursor.execute("CALL setup.seed_load_jobs()")

    count_after = job_count()

    assert count_before == count_after, (
        f"Job count changed after re-registration: {count_before} → {count_after}"
    )

def test_upsert_updates_existing_records(db_cursor):
    """Verify re-registration updates existing records (doesn't just ignore).

    One job's file_path is overwritten with a placeholder, the seeder is
    re-run, and the path is expected to return to its original value. The
    test is skipped when the registry is empty.
    """
    placeholder_path = 'TEMP_TEST_PATH.csv'

    # Pick a deterministic sample job
    db_cursor.execute("""
        SELECT table_name, file_path
        FROM bronze.load_jobs
        ORDER BY table_name
        LIMIT 1
    """)

    sample_job = db_cursor.fetchone()
    if sample_job is None:
        pytest.skip("bronze.load_jobs is empty; nothing to re-register")

    table_name, original_path = sample_job

    # Temporarily modify the record
    db_cursor.execute("""
        UPDATE bronze.load_jobs
        SET file_path = %s
        WHERE table_name = %s
    """, (placeholder_path, table_name))

    try:
        # Re-run seeder (should restore correct path)
        db_cursor.execute("CALL setup.seed_load_jobs()")

        db_cursor.execute("""
            SELECT file_path
            FROM bronze.load_jobs
            WHERE table_name = %s
        """, (table_name,))
        restored_path = db_cursor.fetchone()[0]
    finally:
        # On an autocommit connection the placeholder is already committed.
        # If the seeder failed or did not overwrite it, put the original path
        # back so a failing run does not leave the registry corrupted.
        db_cursor.execute("""
            UPDATE bronze.load_jobs
            SET file_path = %s
            WHERE table_name = %s
            AND file_path = %s
        """, (original_path, table_name, placeholder_path))

    assert restored_path != placeholder_path, \
        "Re-registration should update existing records"
    assert restored_path == original_path, \
        f"Path should be restored to original: {original_path}"

def test_primary_key_prevents_duplicates(db_cursor):
    """Verify primary key constraint prevents duplicate table entries.

    An existing row is re-inserted inside an explicit transaction that is
    always rolled back, so the registry is left untouched whether or not the
    constraint fires. The test is skipped when the registry is empty.
    """
    # Local import so the errors submodule is guaranteed to be loaded
    from psycopg2 import errors as pg_errors

    # Pick a deterministic sample job
    db_cursor.execute("""
        SELECT table_name, file_path, is_enabled, load_order
        FROM bronze.load_jobs
        ORDER BY table_name
        LIMIT 1
    """)

    existing_job = db_cursor.fetchone()
    if existing_job is None:
        pytest.skip("bronze.load_jobs is empty; nothing to duplicate")

    connection = db_cursor.connection
    was_autocommit = connection.autocommit
    if was_autocommit:
        # Leave autocommit so a successful insert can be undone below
        connection.autocommit = False

    try:
        db_cursor.execute("""
            INSERT INTO bronze.load_jobs (table_name, file_path, is_enabled, load_order)
            VALUES (%s, %s, %s, %s)
        """, existing_job)
    except pg_errors.UniqueViolation:
        # Expected: the constraint rejected the duplicate
        pass
    else:
        pytest.fail("Primary key should prevent duplicate inserts")
    finally:
        # Discard the insert (or clear the failed transaction), then restore
        # the connection mode the other tests rely on
        connection.rollback()
        if was_autocommit:
            connection.autocommit = True

## Summary: Run All Tests

In [None]:
# Run all tests in this notebook
# NOTE(review): presumably runs every test function defined in the cells above
# in a single pytest session with '-vv' verbosity — confirm against the
# ipytest documentation.
ipytest.run('-vv')

## Manual Inspection: Job Registry Details

In [None]:
# Connect to warehouse database
conn = psycopg2.connect(**DB_CONFIG)

try:
    # Get comprehensive job registry information.
    # The filename is taken with regexp_replace (strip everything up to the
    # last '/') because a negative SPLIT_PART index needs PostgreSQL 14+.
    df_jobs = pd.read_sql("""
        SELECT
            table_name,
            file_path,
            is_enabled,
            load_order,
            CASE
                WHEN load_order < 1000 THEN 'CRM'
                ELSE 'ERP'
            END AS source_system,
            regexp_replace(file_path, '^.*/', '') AS filename
        FROM bronze.load_jobs
        ORDER BY load_order
    """, conn)

    print("\n📋 Registered Bronze Load Jobs:")
    display(df_jobs)

    # Get job count by source system
    df_job_counts = pd.read_sql("""
        SELECT
            CASE
                WHEN table_name LIKE 'bronze.crm_%' THEN 'CRM'
                WHEN table_name LIKE 'bronze.erp_%' THEN 'ERP'
                ELSE 'Other'
            END AS source_system,
            COUNT(*) AS job_count,
            SUM(CASE WHEN is_enabled THEN 1 ELSE 0 END) AS enabled_count,
            MIN(load_order) AS min_load_order,
            MAX(load_order) AS max_load_order
        FROM bronze.load_jobs
        GROUP BY source_system
        ORDER BY min_load_order
    """, conn)

    print("\n📊 Job Statistics by Source System:")
    display(df_job_counts)

    # Get configuration values used
    df_config = pd.read_sql("""
        SELECT
            config_key,
            config_value,
            CASE
                WHEN config_value LIKE '%/' THEN '⚠️  Has trailing slash'
                ELSE '✅ Correct format'
            END AS validation
        FROM public.etl_config
        WHERE config_key IN ('base_path_crm', 'base_path_erp')
        ORDER BY config_key
    """, conn)

    print("\n⚙️  Configuration Used for Job Registration:")
    display(df_config)

    # Get file path validation results.
    # The extension check is case-insensitive (ILIKE) so it agrees with
    # test_file_paths_end_with_csv.
    df_path_validation = pd.read_sql("""
        SELECT
            table_name,
            file_path,
            CASE
                WHEN file_path IS NULL THEN '❌ NULL path'
                WHEN file_path LIKE '%//%' THEN '❌ Double slashes'
                WHEN NOT file_path ILIKE '%.csv' THEN '❌ Wrong extension'
                ELSE '✅ Valid'
            END AS path_status
        FROM bronze.load_jobs
        ORDER BY
            CASE
                WHEN file_path IS NULL THEN 1
                WHEN file_path LIKE '%//%' THEN 2
                WHEN NOT file_path ILIKE '%.csv' THEN 3
                ELSE 4
            END,
            table_name
    """, conn)

    print("\n🔍 File Path Validation:")
    display(df_path_validation)

    # Check for any tables without jobs
    df_missing_jobs = pd.read_sql("""
        SELECT
            format('bronze.%I', t.table_name) AS table_name,
            '⚠️  No job registered' AS status
        FROM information_schema.tables t
        WHERE t.table_schema = 'bronze'
        AND t.table_type = 'BASE TABLE'
        AND t.table_name NOT IN ('load_jobs', 'load_log')
        AND NOT EXISTS (
            SELECT 1
            FROM bronze.load_jobs lj
            WHERE lj.table_name = format('bronze.%I', t.table_name)
        )
        ORDER BY table_name
    """, conn)

    print("\n⚠️  Tables Missing Job Registration:")
    if not df_missing_jobs.empty:
        display(df_missing_jobs)
    else:
        print("   ✅ All bronze tables have job registrations")
finally:
    # Always release the connection, even when one of the queries fails
    conn.close()

print("\n✅ Inspection complete")

## Load Order Visualization

In [None]:
# Visual representation of load order
conn = psycopg2.connect(**DB_CONFIG)
try:
    # Filename via regexp_replace (strip everything up to the last '/'):
    # a negative SPLIT_PART index needs PostgreSQL 14+.
    df_order = pd.read_sql("""
        SELECT
            load_order,
            table_name,
            regexp_replace(file_path, '^.*/', '') AS file,
            is_enabled,
            CASE
                WHEN load_order < 1000 THEN '🟦 CRM'
                ELSE '🟨 ERP'
            END AS source
        FROM bronze.load_jobs
        ORDER BY load_order
    """, conn)
finally:
    # The data is fully loaded into the DataFrame; release the connection
    # whether or not the query succeeded
    conn.close()

print("\n📈 Load Execution Sequence:")
print("=" * 80)
for job in df_order.itertuples(index=False):
    status = "✅" if job.is_enabled else "⏸️"
    # A NULL load_order reaches pandas as a missing value (the column is then
    # no longer integer-typed), which the 'd' format code rejects.
    if pd.notna(job.load_order):
        order_label = f"{int(job.load_order):4d}"
    else:
        order_label = "   ?"
    print(f"{status} [{order_label}] {job.source} | {job.table_name:30s} ← {job.file}")
print("=" * 80)