# Test Suite: Database Creation and Configuration

**Purpose:** Validate the sql_retail_analytics_warehouse database creation and configuration

**Scope:**
- Database existence and naming
- Encoding configuration (UTF-8)
- Locale settings (en_GB.UTF-8)
- Template configuration
- Ownership and privileges
- Connection validation

**Testing Strategy:**
- Existence validation (database created successfully)
- Configuration validation (encoding, collation, ctype)
- Ownership validation (correct owner assigned)
- Connection testing (can establish connections)
- Isolation testing (clean template, no extra objects)

**Prerequisites:**
- PostgreSQL server running
- `setup/create_db.sql` has been executed
- Connection credentials available (the password is read from the `POSTGRES_PASSWORD` environment variable)
- Required packages: psycopg2, pytest, ipytest, pandas

## Setup: Import Dependencies & Configure Connection

In [None]:
import os
import psycopg2
from psycopg2 import sql
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import pytest
import ipytest
import pandas as pd

# Configure ipytest for notebook usage
ipytest.autoconfig()

# Keyword arguments unpacked into psycopg2.connect(). The password is taken
# from the POSTGRES_PASSWORD environment variable; the literal fallback is a
# placeholder used only when that variable is unset.
DB_CONFIG = dict(
    host='localhost',
    user='postgres',
    password=os.environ.get('POSTGRES_PASSWORD', 'your_password_here'),
)

# Name of the database under test
TARGET_DB = 'sql_retail_analytics_warehouse'

print("✅ Dependencies imported successfully")

## Fixtures: Database Connections

In [None]:
@pytest.fixture(scope='module')
def postgres_connection():
    """Yield an autocommit connection to the ``postgres`` database.

    Intended for catalog queries. The fixture is module-scoped and closes
    the connection during teardown.
    """
    connection = psycopg2.connect(**DB_CONFIG, database='postgres')
    connection.autocommit = True
    yield connection
    connection.close()

@pytest.fixture(scope='module')
def postgres_cursor(postgres_connection):
    """Yield a cursor opened on ``postgres_connection``; closed on teardown."""
    catalog_cursor = postgres_connection.cursor()
    yield catalog_cursor
    catalog_cursor.close()

@pytest.fixture(scope='module')
def target_connection():
    """Yield an autocommit connection to the ``TARGET_DB`` database.

    The fixture is module-scoped and closes the connection during teardown.
    """
    connection = psycopg2.connect(**DB_CONFIG, database=TARGET_DB)
    connection.autocommit = True
    yield connection
    connection.close()

@pytest.fixture(scope='module')
def target_cursor(target_connection):
    """Yield a cursor opened on ``target_connection``; closed on teardown."""
    warehouse_cursor = target_connection.cursor()
    yield warehouse_cursor
    warehouse_cursor.close()

print("✅ Fixtures defined")

## Test Suite 1: Database Existence

In [None]:
%%ipytest -vv

def test_database_exists(postgres_cursor):
    """Check that exactly one ``pg_database`` row carries the target name."""
    lookup_sql = """
        SELECT COUNT(*)
        FROM pg_database
        WHERE datname = %s
    """
    postgres_cursor.execute(lookup_sql, (TARGET_DB,))

    # COUNT(*) always yields a single one-column row
    (matches,) = postgres_cursor.fetchone()
    assert matches == 1, f"Database '{TARGET_DB}' must exist"

def test_database_name_exact_match(postgres_cursor):
    """Check that the catalog name equals ``TARGET_DB`` exactly (case-sensitive)."""
    name_sql = """
        SELECT datname
        FROM pg_database
        WHERE datname = %s
    """
    postgres_cursor.execute(name_sql, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    assert row is not None, f"Database '{TARGET_DB}' not found"

    stored_name = row[0]
    assert stored_name == TARGET_DB, f"Database name mismatch: expected '{TARGET_DB}', got '{stored_name}'"

def test_database_is_accessible(target_cursor):
    """Check that a session on the target connection can run a query.

    Asks the server which database the session is attached to and compares
    it with ``TARGET_DB``.
    """
    target_cursor.execute("SELECT current_database()")
    (session_db,) = target_cursor.fetchone()
    assert session_db == TARGET_DB, f"Connected to wrong database: {session_db}"

## Test Suite 2: Encoding Configuration

In [None]:
%%ipytest -vv

def test_database_encoding_utf8(postgres_cursor):
    """Verify the target database uses UTF8 encoding.

    Reads the ``encoding`` column of the target's ``pg_database`` row and
    converts it to its name with ``pg_encoding_to_char``. A missing row is
    reported as an assertion failure instead of a ``TypeError``.
    """
    postgres_cursor.execute("""
        SELECT pg_encoding_to_char(encoding) AS encoding
        FROM pg_database
        WHERE datname = %s
    """, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    # fetchone() returns None when no row matched, i.e. the database is absent
    assert row is not None, f"Database '{TARGET_DB}' not found"

    encoding = row[0]
    assert encoding == 'UTF8', f"Expected UTF8 encoding, got '{encoding}'"

def test_database_encoding_from_connection(target_cursor):
    """Check ``server_encoding`` as reported inside a target-database session."""
    target_cursor.execute("SHOW server_encoding")
    (reported,) = target_cursor.fetchone()
    assert reported == 'UTF8', f"Server encoding should be UTF8, got '{reported}'"

def test_client_encoding_utf8(target_cursor):
    """Check ``client_encoding`` as reported inside a target-database session."""
    target_cursor.execute("SHOW client_encoding")
    (reported,) = target_cursor.fetchone()
    assert reported == 'UTF8', f"Client encoding should be UTF8, got '{reported}'"

## Test Suite 3: Locale Configuration

In [None]:
%%ipytest -vv

def test_database_collation_en_gb(postgres_cursor):
    """Verify the target database uses the en_GB.UTF-8 collation.

    Reads ``datcollate`` from the target's ``pg_database`` row. A missing row
    is reported as an assertion failure instead of a ``TypeError``.
    """
    postgres_cursor.execute("""
        SELECT datcollate
        FROM pg_database
        WHERE datname = %s
    """, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    # fetchone() returns None when no row matched, i.e. the database is absent
    assert row is not None, f"Database '{TARGET_DB}' not found"

    collation = row[0]
    assert collation == 'en_GB.UTF-8', f"Expected 'en_GB.UTF-8' collation, got '{collation}'"

def test_database_ctype_en_gb(postgres_cursor):
    """Verify the target database uses en_GB.UTF-8 character classification.

    Reads ``datctype`` from the target's ``pg_database`` row. A missing row
    is reported as an assertion failure instead of a ``TypeError``.
    """
    postgres_cursor.execute("""
        SELECT datctype
        FROM pg_database
        WHERE datname = %s
    """, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    # fetchone() returns None when no row matched, i.e. the database is absent
    assert row is not None, f"Database '{TARGET_DB}' not found"

    ctype = row[0]
    assert ctype == 'en_GB.UTF-8', f"Expected 'en_GB.UTF-8' ctype, got '{ctype}'"

def test_lc_collate_setting(target_cursor):
    """Verify the LC_COLLATE locale as seen from inside the target database.

    The read-only ``lc_collate`` server variable was removed in
    PostgreSQL 16, so ``SHOW lc_collate`` fails on newer servers. The value
    is read from the current database's ``pg_database.datcollate`` instead,
    which is available on old and new servers alike.
    """
    target_cursor.execute("""
        SELECT datcollate
        FROM pg_catalog.pg_database
        WHERE datname = current_database()
    """)
    (lc_collate,) = target_cursor.fetchone()
    assert lc_collate == 'en_GB.UTF-8', f"LC_COLLATE should be 'en_GB.UTF-8', got '{lc_collate}'"

def test_lc_ctype_setting(target_cursor):
    """Verify the LC_CTYPE locale as seen from inside the target database.

    The read-only ``lc_ctype`` server variable was removed in PostgreSQL 16,
    so ``SHOW lc_ctype`` fails on newer servers. The value is read from the
    current database's ``pg_database.datctype`` instead, which is available
    on old and new servers alike.
    """
    target_cursor.execute("""
        SELECT datctype
        FROM pg_catalog.pg_database
        WHERE datname = current_database()
    """)
    (lc_ctype,) = target_cursor.fetchone()
    assert lc_ctype == 'en_GB.UTF-8', f"LC_CTYPE should be 'en_GB.UTF-8', got '{lc_ctype}'"

## Test Suite 4: Template Configuration

In [None]:
%%ipytest -vv

def test_database_allows_connections(postgres_cursor):
    """Verify the target database accepts connections.

    Reads ``datallowconn`` from the target's ``pg_database`` row. A missing
    row is reported as an assertion failure instead of a ``TypeError``.
    """
    postgres_cursor.execute("""
        SELECT datallowconn
        FROM pg_database
        WHERE datname = %s
    """, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    # fetchone() returns None when no row matched, i.e. the database is absent
    assert row is not None, f"Database '{TARGET_DB}' not found"

    allows_conn = row[0]
    assert allows_conn is True, "Database should allow connections"

def test_database_not_a_template(postgres_cursor):
    """Verify the target database is not marked as a template.

    Reads ``datistemplate`` from the target's ``pg_database`` row. A missing
    row is reported as an assertion failure instead of a ``TypeError``.
    """
    postgres_cursor.execute("""
        SELECT datistemplate
        FROM pg_database
        WHERE datname = %s
    """, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    # fetchone() returns None when no row matched, i.e. the database is absent
    assert row is not None, f"Database '{TARGET_DB}' not found"

    is_template = row[0]
    assert is_template is False, "Database should not be a template"

def test_no_active_connections_limit(postgres_cursor):
    """Verify the target database has no connection limit (-1 = unlimited).

    Reads ``datconnlimit`` from the target's ``pg_database`` row. A missing
    row is reported as an assertion failure instead of a ``TypeError``.
    """
    postgres_cursor.execute("""
        SELECT datconnlimit
        FROM pg_database
        WHERE datname = %s
    """, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    # fetchone() returns None when no row matched, i.e. the database is absent
    assert row is not None, f"Database '{TARGET_DB}' not found"

    conn_limit = row[0]
    assert conn_limit == -1, f"Expected unlimited connections (-1), got {conn_limit}"

## Test Suite 5: Ownership and Privileges

In [None]:
%%ipytest -vv

def test_database_owner(postgres_cursor):
    """Verify the target database has a non-empty owner name.

    The owner is resolved from ``pg_database.datdba`` with
    ``pg_get_userbyid``. No specific role is required: the owner may be
    ``postgres`` or whichever user created the database, so only presence
    is asserted. A missing database row is reported as an assertion failure
    instead of a ``TypeError``.
    """
    postgres_cursor.execute("""
        SELECT pg_catalog.pg_get_userbyid(d.datdba) AS owner
        FROM pg_catalog.pg_database d
        WHERE d.datname = %s
    """, (TARGET_DB,))

    row = postgres_cursor.fetchone()
    # fetchone() returns None when no row matched, i.e. the database is absent
    assert row is not None, f"Database '{TARGET_DB}' not found"

    owner = row[0]
    assert owner is not None, "Database must have an owner"
    assert len(owner) > 0, "Owner name should not be empty"

def test_current_user_can_create_schema(target_cursor):
    """Verify the current user has privileges to create schemas.

    Creates a throwaway schema, checks that it is listed in
    ``information_schema.schemata``, and drops it again. The
    ``target_connection`` fixture runs in autocommit mode, so the schema is
    really created (nothing is rolled back); the drop therefore sits in a
    ``finally`` so it also runs when the assertion fails.
    """
    target_cursor.execute("""
        CREATE SCHEMA IF NOT EXISTS test_privilege_check
    """)

    try:
        # Verify it was created
        target_cursor.execute("""
            SELECT COUNT(*)
            FROM information_schema.schemata
            WHERE schema_name = 'test_privilege_check'
        """)

        count = target_cursor.fetchone()[0]
        assert count == 1, "User should be able to create schemas"
    finally:
        # Clean up explicitly; autocommit means there is no transaction to undo
        target_cursor.execute("DROP SCHEMA IF EXISTS test_privilege_check CASCADE")

## Test Suite 6: Clean State Validation

In [None]:
%%ipytest -vv

def test_expected_default_schemas_only(target_cursor):
    """Report unexpected schemas and require that ``public`` exists.

    Schemas outside the known set are only printed as a warning; the sole
    hard requirement is the presence of the ``public`` schema.
    """
    # 'public' is the default; bronze/silver/gold/setup may exist if
    # create_schemas.sql was already run.
    known_schemas = {'public', 'bronze', 'silver', 'gold', 'setup'}

    target_cursor.execute("""
        SELECT schema_name
        FROM information_schema.schemata
        WHERE schema_name NOT IN ('pg_catalog', 'information_schema', 'pg_toast')
        ORDER BY schema_name
    """)
    found = [name for (name,) in target_cursor.fetchall()]

    extras = set(found).difference(known_schemas)
    if extras:
        print(f"⚠️  Unexpected schemas found: {extras}")
        print(f"   All schemas: {found}")

    # At minimum, 'public' should exist
    assert 'public' in found, "Default 'public' schema should exist"

def test_database_size_reasonable(target_cursor):
    """Check that the current database reports a size below 20MB.

    Prints the human-readable size before asserting on the byte count.
    """
    size_limit = 20 * 1024 * 1024  # 20MB ceiling for a clean database

    target_cursor.execute("""
        SELECT pg_size_pretty(pg_database_size(current_database())) AS size,
               pg_database_size(current_database()) AS size_bytes
    """)
    row = target_cursor.fetchone()
    size_pretty, size_bytes = row

    print(f"Database size: {size_pretty}")
    assert size_bytes < size_limit, (
        f"Database seems too large for a clean database: {size_pretty}"
    )

## Test Suite 7: Connection and Session Settings

In [None]:
%%ipytest -vv

def test_timezone_setting(target_cursor):
    """Check that the session reports a non-empty ``timezone`` value."""
    target_cursor.execute("SHOW timezone")
    (tz_value,) = target_cursor.fetchone()
    assert tz_value is not None, "Timezone should be set"
    assert len(tz_value) > 0, "Timezone value should not be empty"

def test_datestyle_setting(target_cursor):
    """Check that the session's ``datestyle`` value mentions ISO."""
    target_cursor.execute("SHOW datestyle")
    (style_value,) = target_cursor.fetchone()
    # 'ISO' must appear somewhere in the value for consistent date formatting
    assert 'ISO' in style_value, f"DateStyle should include ISO, got '{style_value}'"

def test_can_create_table(target_cursor):
    """Verify basic DDL operations work.

    Creates a throwaway table, confirms it is listed in
    ``information_schema.tables`` for the current schema, and drops it
    again. The lookup is limited to ``current_schema()`` so a same-named
    table in another schema cannot inflate the count, and the drop sits in a
    ``finally`` so the table is removed even when the assertion fails.
    """
    # Create a test table
    target_cursor.execute("""
        CREATE TABLE IF NOT EXISTS test_table_creation (
            id SERIAL PRIMARY KEY,
            name TEXT NOT NULL
        )
    """)

    try:
        # Verify it exists in the schema where unqualified names are created
        target_cursor.execute("""
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_name = 'test_table_creation'
              AND table_schema = current_schema()
        """)

        count = target_cursor.fetchone()[0]
        assert count == 1, "Should be able to create tables"
    finally:
        # Clean up
        target_cursor.execute("DROP TABLE IF EXISTS test_table_creation")

def test_can_insert_and_query_utf8_data(target_cursor):
    """Verify UTF-8 data can be inserted and queried back unchanged.

    Inserts strings from several scripts into a temporary table, reads them
    back, and checks both the row count and the exact contents. The
    comparison ignores row order, so it does not depend on the database
    collation. The temporary table is dropped afterwards because the cursor's
    session is shared by the other tests in the module.
    """
    test_strings = [
        'Hello World',
        'Café',
        '日本語',  # Japanese
        '🎯📊',    # Emojis
        'Ελληνικά'  # Greek
    ]

    # Create temp table
    target_cursor.execute("""
        CREATE TEMP TABLE test_utf8 (data TEXT)
    """)

    try:
        for test_str in test_strings:
            target_cursor.execute(
                "INSERT INTO test_utf8 (data) VALUES (%s)",
                (test_str,)
            )

        # Query back and verify
        target_cursor.execute("SELECT data FROM test_utf8 ORDER BY data")
        results = [row[0] for row in target_cursor.fetchall()]

        # All strings should be retrievable...
        assert len(results) == len(test_strings), "All UTF-8 strings should be retrievable"
        # ...and identical to what was inserted, not merely present
        assert sorted(results) == sorted(test_strings), \
            f"UTF-8 strings should round-trip unchanged, got {results}"
    finally:
        # pg_temp qualifies the name so only the temporary table is dropped
        target_cursor.execute("DROP TABLE IF EXISTS pg_temp.test_utf8")

## Summary: Run All Tests

In [None]:
# Run all tests in this notebook
ipytest.run('-vv')

## Manual Inspection: Database Properties

In [None]:
# Connect to postgres database to query catalog
conn_postgres = psycopg2.connect(database='postgres', **DB_CONFIG)

try:
    # Get comprehensive database information. The database name is passed as
    # a bound parameter, matching the test functions, instead of being
    # formatted into the SQL text.
    df_db_info = pd.read_sql("""
        SELECT
            d.datname                                  AS database_name,
            pg_catalog.pg_get_userbyid(d.datdba)       AS owner,
            pg_catalog.pg_encoding_to_char(d.encoding) AS encoding,
            d.datcollate                               AS collation,
            d.datctype                                 AS ctype,
            d.datallowconn                             AS allows_connections,
            d.datconnlimit                             AS connection_limit,
            d.datistemplate                            AS is_template,
            pg_size_pretty(pg_database_size(d.datname)) AS size
        FROM pg_catalog.pg_database d
        WHERE d.datname = %s
    """, conn_postgres, params=(TARGET_DB,))

    print("\n🗄️  Database Configuration:")
    display(df_db_info.T)  # Transpose for better readability
finally:
    # Close the connection even if the query or display step raises
    conn_postgres.close()

# Connect to target database for additional info
conn_target = psycopg2.connect(database=TARGET_DB, **DB_CONFIG)

try:
    # Get session settings. lc_collate / lc_ctype are read from pg_database
    # for the current database because the server variables of the same name
    # were removed in PostgreSQL 16 and current_setting() would fail there.
    df_settings = pd.read_sql("""
        SELECT
            'server_encoding' AS setting,
            current_setting('server_encoding') AS value
        UNION ALL
        SELECT 'client_encoding', current_setting('client_encoding')
        UNION ALL
        SELECT 'lc_collate', d.datcollate::text
        FROM pg_catalog.pg_database d
        WHERE d.datname = current_database()
        UNION ALL
        SELECT 'lc_ctype', d.datctype::text
        FROM pg_catalog.pg_database d
        WHERE d.datname = current_database()
        UNION ALL
        SELECT 'timezone', current_setting('timezone')
        UNION ALL
        SELECT 'datestyle', current_setting('datestyle')
        ORDER BY setting
    """, conn_target)

    print("\n⚙️  Session Settings:")
    display(df_settings)

    # Get list of schemas
    df_schemas = pd.read_sql("""
        SELECT
            schema_name,
            pg_catalog.pg_get_userbyid(schema_owner::regrole::oid) AS owner
        FROM information_schema.schemata
        WHERE schema_name NOT IN ('pg_catalog', 'information_schema', 'pg_toast')
        ORDER BY schema_name
    """, conn_target)

    print("\n📁 Schemas:")
    display(df_schemas)

    # Get database statistics
    df_stats = pd.read_sql("""
        SELECT
            current_database() AS database,
            pg_size_pretty(pg_database_size(current_database())) AS total_size,
            (
                SELECT COUNT(*)
                FROM information_schema.tables
                WHERE table_schema NOT IN ('pg_catalog', 'information_schema')
            ) AS user_tables,
            (
                SELECT COUNT(*)
                FROM information_schema.schemata
                WHERE schema_name NOT IN ('pg_catalog', 'information_schema', 'pg_toast')
            ) AS user_schemas
    """, conn_target)

    print("\n📊 Database Statistics:")
    display(df_stats)
finally:
    # Close the connection even if one of the queries or displays raises
    conn_target.close()

print("\n✅ Inspection complete")