# Test Suite: Bronze Data Tables DDL

**Purpose:** Validate the creation and structure of bronze layer data tables

**Scope:**
- Table existence (all 6 bronze data tables)
- Schema ownership and location
- Column definitions and data types
- Table naming convention compliance
- Absence of constraints and indexes (raw ingestion layer)
- Source system separation (CRM vs ERP)

**Testing Strategy:**
- Existence validation (all 6 tables created)
- Structure validation (correct columns and types)
- Naming validation (matches CSV file conventions)
- Constraint validation (no PKs, FKs, unique or check constraints, or indexes)
- Isolation validation (tables in bronze schema only)

**Prerequisites:**
- PostgreSQL server running
- sql_retail_analytics_warehouse database exists
- bronze schema exists
- `scripts/bronze/ddl_bronze_tables.sql` has been executed
- Connection credentials available
- Required packages: psycopg2, pytest, ipytest, pandas

## Setup: Import Dependencies & Configure Connection

In [None]:
import os
import psycopg2
from psycopg2 import sql
import pytest
import ipytest
import pandas as pd

# Apply ipytest's automatic notebook configuration
ipytest.autoconfig()

# Connection settings for the warehouse database; the password is read from
# the POSTGRES_PASSWORD environment variable, with a placeholder fallback
DB_CONFIG = dict(
    host='localhost',
    database='sql_retail_analytics_warehouse',
    user='postgres',
    password=os.getenv('POSTGRES_PASSWORD', 'your_password_here'),
)

# Bronze data tables expected per source system (load_jobs and load_log are not listed)
EXPECTED_CRM_TABLES = ['crm_cust_info', 'crm_prd_info', 'crm_sales_details']
EXPECTED_ERP_TABLES = ['erp_CUST_AZ12', 'erp_LOC_A101', 'erp_PX_CAT_G1V2']
EXPECTED_ALL_TABLES = [*EXPECTED_CRM_TABLES, *EXPECTED_ERP_TABLES]

print("✅ Dependencies imported successfully")

## Fixtures: Database Connections

In [None]:
@pytest.fixture(scope='module')
def db_connection():
    """Provide a module-scoped autocommit connection to the warehouse database.

    Yields:
        The psycopg2 connection opened with DB_CONFIG. It is closed once the
        fixture is torn down.
    """
    warehouse = psycopg2.connect(**DB_CONFIG)
    warehouse.autocommit = True
    yield warehouse
    warehouse.close()

@pytest.fixture(scope='module')
def db_cursor(db_connection):
    """Provide a module-scoped cursor created from the shared connection.

    Args:
        db_connection: The connection supplied by the db_connection fixture.

    Yields:
        A cursor on that connection. It is closed once the fixture is torn down.
    """
    shared_cursor = db_connection.cursor()
    yield shared_cursor
    shared_cursor.close()

# Confirmation message printed once the fixture definitions above have run
print("✅ Fixtures defined")

## Test Suite 1: Table Existence

In [None]:
%%ipytest

def test_all_six_bronze_tables_exist(db_cursor):
    """Check that the bronze schema holds exactly six data tables.

    Only base tables are counted; load_jobs and load_log are left out.
    """
    db_cursor.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_type = 'BASE TABLE'
        AND table_name NOT IN ('load_jobs', 'load_log')
        ORDER BY table_name
    """)

    records = db_cursor.fetchall()
    found = [record[0] for record in records]
    assert len(found) == 6, f"Expected 6 bronze data tables, found {len(found)}: {found}"

def test_crm_cust_info_exists(db_cursor):
    """Check that information_schema lists exactly one bronze.crm_cust_info entry."""
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name = 'crm_cust_info'
    """)

    # COUNT(*) returns a single one-column row
    (matches,) = db_cursor.fetchone()
    assert matches == 1, "bronze.crm_cust_info table must exist"

def test_crm_prd_info_exists(db_cursor):
    """Check that information_schema lists exactly one bronze.crm_prd_info entry."""
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name = 'crm_prd_info'
    """)

    # COUNT(*) returns a single one-column row
    (matches,) = db_cursor.fetchone()
    assert matches == 1, "bronze.crm_prd_info table must exist"

def test_crm_sales_details_exists(db_cursor):
    """Check that information_schema lists exactly one bronze.crm_sales_details entry."""
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name = 'crm_sales_details'
    """)

    # COUNT(*) returns a single one-column row
    (matches,) = db_cursor.fetchone()
    assert matches == 1, "bronze.crm_sales_details table must exist"

def test_erp_CUST_AZ12_exists(db_cursor):
    """Check that information_schema lists exactly one bronze.erp_CUST_AZ12 entry.

    The name is compared exactly as written, including letter case.
    """
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name = 'erp_CUST_AZ12'
    """)

    # COUNT(*) returns a single one-column row
    (matches,) = db_cursor.fetchone()
    assert matches == 1, "bronze.erp_CUST_AZ12 table must exist"

def test_erp_LOC_A101_exists(db_cursor):
    """Check that information_schema lists exactly one bronze.erp_LOC_A101 entry.

    The name is compared exactly as written, including letter case.
    """
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name = 'erp_LOC_A101'
    """)

    # COUNT(*) returns a single one-column row
    (matches,) = db_cursor.fetchone()
    assert matches == 1, "bronze.erp_LOC_A101 table must exist"

def test_erp_PX_CAT_G1V2_exists(db_cursor):
    """Check that information_schema lists exactly one bronze.erp_PX_CAT_G1V2 entry.

    The name is compared exactly as written, including letter case.
    """
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name = 'erp_PX_CAT_G1V2'
    """)

    # COUNT(*) returns a single one-column row
    (matches,) = db_cursor.fetchone()
    assert matches == 1, "bronze.erp_PX_CAT_G1V2 table must exist"

## Test Suite 2: CRM Table Structure

In [None]:
%%ipytest

def test_crm_cust_info_columns(db_cursor):
    """Check each expected crm_cust_info column exists with the expected data type.

    Columns outside the expected set are not rejected by this test.
    """
    expected_columns = {
        'customer_id': 'integer',
        'customer_key': 'character varying',
        'customer_first_name': 'character varying',
        'customer_last_name': 'character varying',
        'customer_material_status': 'character varying',
        'customer_gender': 'character varying',
        'customer_create_date': 'date'
    }

    db_cursor.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'bronze'
        AND table_name = 'crm_cust_info'
        ORDER BY ordinal_position
    """)

    # Each row is a (column_name, data_type) pair
    actual = dict(db_cursor.fetchall())

    for name, wanted in expected_columns.items():
        assert name in actual, f"Column '{name}' missing from crm_cust_info"
        found = actual[name]
        assert found == wanted, \
            f"Column '{name}' has wrong type: expected {wanted}, got {found}"

def test_crm_prd_info_columns(db_cursor):
    """Check each expected crm_prd_info column exists with the expected data type.

    Columns outside the expected set are not rejected by this test.
    """
    expected_columns = {
        'product_id': 'integer',
        'product_key': 'character varying',
        'product_nm': 'character varying',
        'product_cost': 'integer',
        'product_line': 'character varying',
        'product_start_date': 'timestamp without time zone',
        'product_end_date': 'timestamp without time zone'
    }

    db_cursor.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'bronze'
        AND table_name = 'crm_prd_info'
        ORDER BY ordinal_position
    """)

    # Each row is a (column_name, data_type) pair
    actual = dict(db_cursor.fetchall())

    for name, wanted in expected_columns.items():
        assert name in actual, f"Column '{name}' missing from crm_prd_info"
        found = actual[name]
        assert found == wanted, \
            f"Column '{name}' has wrong type: expected {wanted}, got {found}"

def test_crm_sales_details_columns(db_cursor):
    """Check each expected crm_sales_details column exists with the expected data type.

    Columns outside the expected set are not rejected by this test.
    """
    expected_columns = {
        'sales_order_number': 'character varying',
        'sales_product_key': 'character varying',
        'sales_customer_id': 'integer',
        'sales_order_date': 'timestamp without time zone',
        'sales_shipping_date': 'date',
        'sales_due_date': 'date',
        'sales_sales': 'integer',
        'sales_quantity': 'integer',
        'sales_price': 'integer'
    }

    db_cursor.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'bronze'
        AND table_name = 'crm_sales_details'
        ORDER BY ordinal_position
    """)

    # Each row is a (column_name, data_type) pair
    actual = dict(db_cursor.fetchall())

    for name, wanted in expected_columns.items():
        assert name in actual, f"Column '{name}' missing from crm_sales_details"
        found = actual[name]
        assert found == wanted, \
            f"Column '{name}' has wrong type: expected {wanted}, got {found}"

## Test Suite 3: ERP Table Structure

In [None]:
%%ipytest

def test_erp_CUST_AZ12_columns(db_cursor):
    """Check each expected erp_CUST_AZ12 column exists with the expected data type.

    Columns outside the expected set are not rejected by this test.
    """
    expected_columns = {
        'cid': 'character varying',
        'date_of_birth': 'date',
        'gender': 'character varying'
    }

    db_cursor.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'bronze'
        AND table_name = 'erp_CUST_AZ12'
        ORDER BY ordinal_position
    """)

    # Each row is a (column_name, data_type) pair
    actual = dict(db_cursor.fetchall())

    for name, wanted in expected_columns.items():
        assert name in actual, f"Column '{name}' missing from erp_CUST_AZ12"
        found = actual[name]
        assert found == wanted, \
            f"Column '{name}' has wrong type: expected {wanted}, got {found}"

def test_erp_LOC_A101_columns(db_cursor):
    """Check each expected erp_LOC_A101 column exists with the expected data type.

    Columns outside the expected set are not rejected by this test.
    """
    expected_columns = {
        'cid': 'character varying',
        'country': 'character varying'
    }

    db_cursor.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'bronze'
        AND table_name = 'erp_LOC_A101'
        ORDER BY ordinal_position
    """)

    # Each row is a (column_name, data_type) pair
    actual = dict(db_cursor.fetchall())

    for name, wanted in expected_columns.items():
        assert name in actual, f"Column '{name}' missing from erp_LOC_A101"
        found = actual[name]
        assert found == wanted, \
            f"Column '{name}' has wrong type: expected {wanted}, got {found}"

def test_erp_PX_CAT_G1V2_columns(db_cursor):
    """Check each expected erp_PX_CAT_G1V2 column exists with the expected data type.

    Columns outside the expected set are not rejected by this test.
    """
    expected_columns = {
        'id': 'character varying',
        'category': 'character varying',
        'subcategory': 'character varying',
        'maintenance': 'character varying'
    }

    db_cursor.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'bronze'
        AND table_name = 'erp_PX_CAT_G1V2'
        ORDER BY ordinal_position
    """)

    # Each row is a (column_name, data_type) pair
    actual = dict(db_cursor.fetchall())

    for name, wanted in expected_columns.items():
        assert name in actual, f"Column '{name}' missing from erp_PX_CAT_G1V2"
        found = actual[name]
        assert found == wanted, \
            f"Column '{name}' has wrong type: expected {wanted}, got {found}"

## Test Suite 4: Table Naming Convention

In [None]:
%%ipytest

def test_all_tables_in_bronze_schema(db_cursor):
    """Verify every expected data table is present in the bronze schema.

    The lookup is by table name only, so it can return one row per schema
    that holds a table of that name. All of those schemas are collected (in a
    fixed order) rather than reading a single arbitrary row, which keeps the
    outcome independent of row order when another schema reuses the name.
    """
    for table in EXPECTED_ALL_TABLES:
        db_cursor.execute("""
            SELECT table_schema
            FROM information_schema.tables
            WHERE table_name = %s
            ORDER BY table_schema
        """, (table,))

        schemas = [row[0] for row in db_cursor.fetchall()]
        assert schemas, f"Table '{table}' not found"
        assert 'bronze' in schemas, \
            f"Table '{table}' should be in bronze schema, found in '{', '.join(schemas)}'"

def test_crm_tables_follow_naming_convention(db_cursor):
    """Verify CRM tables follow crm_ prefix convention.

    Exactly three bronze tables must start with the literal prefix ``crm_``,
    and each name in EXPECTED_CRM_TABLES must be among them.
    """
    # In LIKE, a bare '_' matches any single character; it is escaped here so
    # only names containing the literal underscore after 'crm' are selected.
    db_cursor.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name LIKE 'crm!_%' ESCAPE '!'
        ORDER BY table_name
    """)

    crm_tables = [row[0] for row in db_cursor.fetchall()]

    assert len(crm_tables) == 3, \
        f"Expected 3 CRM tables, found {len(crm_tables)}: {crm_tables}"

    for table in EXPECTED_CRM_TABLES:
        assert table in crm_tables, f"Expected CRM table '{table}' not found"

def test_erp_tables_follow_naming_convention(db_cursor):
    """Verify ERP tables follow erp_ prefix convention.

    Exactly three bronze tables must start with the literal prefix ``erp_``,
    and each name in EXPECTED_ERP_TABLES must be among them.
    """
    # In LIKE, a bare '_' matches any single character; it is escaped here so
    # only names containing the literal underscore after 'erp' are selected.
    db_cursor.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name LIKE 'erp!_%' ESCAPE '!'
        ORDER BY table_name
    """)

    erp_tables = [row[0] for row in db_cursor.fetchall()]

    assert len(erp_tables) == 3, \
        f"Expected 3 ERP tables, found {len(erp_tables)}: {erp_tables}"

    for table in EXPECTED_ERP_TABLES:
        assert table in erp_tables, f"Expected ERP table '{table}' not found"

## Test Suite 5: No Constraints (Raw Ingestion Layer)

In [None]:
%%ipytest

def test_no_primary_keys_on_bronze_tables(db_cursor):
    """Check that none of the expected bronze tables declares a PRIMARY KEY constraint."""
    pk_query = """
            SELECT COUNT(*)
            FROM information_schema.table_constraints
            WHERE table_schema = 'bronze'
            AND table_name = %s
            AND constraint_type = 'PRIMARY KEY'
        """

    for name in EXPECTED_ALL_TABLES:
        db_cursor.execute(pk_query, (name,))
        (found,) = db_cursor.fetchone()
        assert found == 0, \
            f"Table '{name}' should not have primary key (raw ingestion layer)"

def test_no_foreign_keys_on_bronze_tables(db_cursor):
    """Check that none of the expected bronze tables declares a FOREIGN KEY constraint."""
    fk_query = """
            SELECT COUNT(*)
            FROM information_schema.table_constraints
            WHERE table_schema = 'bronze'
            AND table_name = %s
            AND constraint_type = 'FOREIGN KEY'
        """

    for name in EXPECTED_ALL_TABLES:
        db_cursor.execute(fk_query, (name,))
        (found,) = db_cursor.fetchone()
        assert found == 0, \
            f"Table '{name}' should not have foreign keys (relationships in silver/gold)"

def test_no_unique_constraints_on_bronze_tables(db_cursor):
    """Check that none of the expected bronze tables declares a UNIQUE constraint."""
    unique_query = """
            SELECT COUNT(*)
            FROM information_schema.table_constraints
            WHERE table_schema = 'bronze'
            AND table_name = %s
            AND constraint_type = 'UNIQUE'
        """

    for name in EXPECTED_ALL_TABLES:
        db_cursor.execute(unique_query, (name,))
        (found,) = db_cursor.fetchone()
        assert found == 0, \
            f"Table '{name}' should not have unique constraints (duplicates allowed)"

def test_no_check_constraints_on_bronze_tables(db_cursor):
    """Check that none of the expected bronze tables has a CHECK-type constraint.

    NOTE(review): PostgreSQL may also report NOT NULL column constraints as
    CHECK rows in information_schema.table_constraints — confirm whether the
    DDL declares any NOT NULL columns.
    """
    check_query = """
            SELECT COUNT(*)
            FROM information_schema.table_constraints
            WHERE table_schema = 'bronze'
            AND table_name = %s
            AND constraint_type = 'CHECK'
        """

    for name in EXPECTED_ALL_TABLES:
        db_cursor.execute(check_query, (name,))
        (found,) = db_cursor.fetchone()
        assert found == 0, \
            f"Table '{name}' should not have check constraints (validation in silver)"

def test_no_indexes_on_bronze_tables(db_cursor):
    """Check that pg_indexes lists no index for any expected bronze table."""
    index_query = """
            SELECT COUNT(*)
            FROM pg_indexes
            WHERE schemaname = 'bronze'
            AND tablename = %s
        """

    for name in EXPECTED_ALL_TABLES:
        db_cursor.execute(index_query, (name,))
        (found,) = db_cursor.fetchone()
        assert found == 0, \
            f"Table '{name}' should not have indexes (raw ingestion prioritizes write speed)"

## Test Suite 6: Source System Separation

In [None]:
%%ipytest

def test_crm_and_erp_table_counts(db_cursor):
    """Verify correct distribution: 3 CRM tables, 3 ERP tables.

    Tables are attributed to a source system by their literal ``crm_`` or
    ``erp_`` name prefix within the bronze schema.
    """
    # In LIKE, a bare '_' matches any single character; both patterns escape
    # it so only names with the literal underscore are counted.

    # Count CRM tables
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name LIKE 'crm!_%' ESCAPE '!'
    """)
    crm_count = db_cursor.fetchone()[0]

    # Count ERP tables
    db_cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_name LIKE 'erp!_%' ESCAPE '!'
    """)
    erp_count = db_cursor.fetchone()[0]

    assert crm_count == 3, f"Expected 3 CRM tables, found {crm_count}"
    assert erp_count == 3, f"Expected 3 ERP tables, found {erp_count}"

def test_table_names_exact_match(db_cursor):
    """Check that the bronze data table names equal the expected names exactly.

    Base tables in the bronze schema (minus load_jobs and load_log) are
    compared, after sorting, with EXPECTED_ALL_TABLES; the comparison is
    case-sensitive.
    """
    db_cursor.execute("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'bronze'
        AND table_type = 'BASE TABLE'
        AND table_name NOT IN ('load_jobs', 'load_log')
        ORDER BY table_name
    """)

    # Sort both sides in Python so the comparison uses one ordering rule
    actual_tables = [record[0] for record in db_cursor.fetchall()]
    actual_tables.sort()
    expected_tables = list(EXPECTED_ALL_TABLES)
    expected_tables.sort()

    assert actual_tables == expected_tables, \
        f"Table names don't match exactly.\nExpected: {expected_tables}\nActual: {actual_tables}"

## Summary: Run All Tests

In [None]:
# Run all tests in this notebook
# '-v' is passed through as a test-run argument (verbose, per-test output)
ipytest.run('-v')

## Manual Inspection: Table Details

In [None]:
# Manual inspection of the bronze data tables: sizes, columns, constraints,
# indexes, and a per-source-system summary. The SQL text is defined first so
# the connection is only held open while the queries run, and it is closed
# in a `finally` clause even if a query or display call fails.

# Comprehensive table information
tables_sql = """
    SELECT
        table_name,
        CASE
            WHEN table_name LIKE 'crm_%' THEN 'CRM'
            WHEN table_name LIKE 'erp_%' THEN 'ERP'
            ELSE 'Other'
        END AS source_system,
        (
            SELECT COUNT(*)
            FROM information_schema.columns c
            WHERE c.table_schema = t.table_schema
            AND c.table_name = t.table_name
        ) AS column_count,
        pg_size_pretty(pg_total_relation_size(format('%I.%I', table_schema, table_name))) AS total_size
    FROM information_schema.tables t
    WHERE table_schema = 'bronze'
    AND table_type = 'BASE TABLE'
    AND table_name NOT IN ('load_jobs', 'load_log')
    ORDER BY source_system, table_name
"""

# Detailed column information for all six data tables
columns_sql = """
    SELECT
        table_name,
        column_name,
        data_type,
        character_maximum_length,
        is_nullable,
        column_default
    FROM information_schema.columns
    WHERE table_schema = 'bronze'
    AND table_name IN ('crm_cust_info', 'crm_prd_info', 'crm_sales_details',
                       'erp_CUST_AZ12', 'erp_LOC_A101', 'erp_PX_CAT_G1V2')
    ORDER BY table_name, ordinal_position
"""

# Any constraints on the data tables (should be none)
constraints_sql = """
    SELECT
        table_name,
        constraint_type,
        constraint_name
    FROM information_schema.table_constraints
    WHERE table_schema = 'bronze'
    AND table_name IN ('crm_cust_info', 'crm_prd_info', 'crm_sales_details',
                       'erp_CUST_AZ12', 'erp_LOC_A101', 'erp_PX_CAT_G1V2')
    ORDER BY table_name, constraint_type
"""

# Any indexes on the data tables (should be none)
indexes_sql = """
    SELECT
        tablename,
        indexname,
        indexdef
    FROM pg_indexes
    WHERE schemaname = 'bronze'
    AND tablename IN ('crm_cust_info', 'crm_prd_info', 'crm_sales_details',
                      'erp_CUST_AZ12', 'erp_LOC_A101', 'erp_PX_CAT_G1V2')
    ORDER BY tablename, indexname
"""

# Table and column totals per source system
summary_sql = """
    SELECT
        CASE
            WHEN table_name LIKE 'crm_%' THEN 'CRM'
            WHEN table_name LIKE 'erp_%' THEN 'ERP'
        END AS source_system,
        COUNT(*) AS table_count,
        SUM((
            SELECT COUNT(*)
            FROM information_schema.columns c
            WHERE c.table_schema = t.table_schema
            AND c.table_name = t.table_name
        )) AS total_columns
    FROM information_schema.tables t
    WHERE table_schema = 'bronze'
    AND table_type = 'BASE TABLE'
    AND table_name NOT IN ('load_jobs', 'load_log')
    GROUP BY source_system
    ORDER BY source_system
"""

# Connect to warehouse database
conn = psycopg2.connect(**DB_CONFIG)
try:
    df_tables = pd.read_sql(tables_sql, conn)

    print("\n📊 Bronze Data Tables:")
    display(df_tables)

    df_columns = pd.read_sql(columns_sql, conn)

    print("\n📋 Column Details:")
    display(df_columns)

    df_constraints = pd.read_sql(constraints_sql, conn)

    print("\n🔒 Constraints (should be empty):")
    if len(df_constraints) > 0:
        display(df_constraints)
        print("⚠️  WARNING: Bronze tables should not have constraints!")
    else:
        print("   ✅ No constraints found (correct for raw ingestion layer)")

    df_indexes = pd.read_sql(indexes_sql, conn)

    print("\n📇 Indexes (should be empty):")
    if len(df_indexes) > 0:
        display(df_indexes)
        print("⚠️  WARNING: Bronze tables should not have indexes!")
    else:
        print("   ✅ No indexes found (correct for write-optimized ingestion)")

    df_summary = pd.read_sql(summary_sql, conn)

    print("\n📈 Summary by Source System:")
    display(df_summary)
finally:
    # Release the connection whether or not the inspection succeeded
    conn.close()

print("\n✅ Inspection complete")