# Bronze layer smoke tests

This notebook starts a local Spark session configured to use the Iceberg warehouse from `.env` and runs a few smoke tests on the Bronze table `hadoop_catalog.aq.raw_open_meteo_hourly`.

Checks performed:
- Table exists
- Row count is > 0
- `ts` column has a sensible min and max
- Display a small sample of rows

In [4]:
# Load environment and required modules
from dotenv import load_dotenv
import os
from pyspark.sql import SparkSession, functions as F
import pyspark.sql.types as T

# Prefer a .env one directory up; otherwise fall back to a .env in the
# current working directory.
env_file = os.path.join('..', '.env') if os.path.exists('../.env') else '.env'
load_dotenv(dotenv_path=env_file)

# Echo the resolved warehouse location so a misconfigured environment is obvious.
WAREHOUSE_URI = os.getenv('WAREHOUSE_URI')
print('WAREHOUSE_URI=', WAREHOUSE_URI)

WAREHOUSE_URI= hdfs://khoa-master:9000/warehouse/iceberg


In [6]:
# Start Spark session for the notebook (self-contained)
# This cell is intentionally self-contained so you can run it before/after other cells.
from pyspark.sql import SparkSession
from pyspark import SparkContext
import os

# Stop any existing Spark session/context before building a new one, because
# build() below uses getOrCreate(), which can hand back a session that is
# already running.
# Cleanup is best-effort: a failure is reported but never aborts the cell.
# Only Exception is caught so KeyboardInterrupt / SystemExit still propagate.
try:
    if 'spark' in locals():
        spark.stop()
except Exception as stop_err:
    print("Warning: could not stop existing SparkSession:", stop_err)

try:
    sc = SparkContext._active_spark_context
    if sc:
        sc.stop()
except Exception as stop_err:
    print("Warning: could not stop active SparkContext:", stop_err)

# Read WAREHOUSE_URI from environment, fallback to the same default used elsewhere
WAREHOUSE_URI = os.getenv("WAREHOUSE_URI", "hdfs://khoa-master:9000/warehouse/iceberg")

# Minimal build() here mirrors the function defined later in the notebook
def build(app_name: str) -> SparkSession:
    """Return a local SparkSession wired to the Iceberg `hadoop_catalog`.

    The session runs with master ``local[*]`` and registers an Iceberg
    SparkCatalog of type ``hadoop`` whose warehouse is the module-level
    ``WAREHOUSE_URI``. It is obtained through ``getOrCreate()``.

    Args:
        app_name: Application name given to the Spark builder.

    Returns:
        The SparkSession produced by the configured builder.
    """
    # Insertion order is preserved, so options are applied in the listed order.
    session_options = {
        "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        "spark.sql.catalog.hadoop_catalog": "org.apache.iceberg.spark.SparkCatalog",
        "spark.sql.catalog.hadoop_catalog.type": "hadoop",
        "spark.sql.catalog.hadoop_catalog.warehouse": WAREHOUSE_URI,
        "spark.sql.catalogImplementation": "in-memory",
        "spark.sql.execution.arrow.pyspark.enabled": "false",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.adaptive.enabled": "false",
    }

    # Run locally with all available cores
    builder = SparkSession.builder.appName(app_name).master("local[*]")
    for option_key, option_value in session_options.items():
        builder = builder.config(option_key, option_value)
    return builder.getOrCreate()

# Create the session used by every later cell. On failure, print the
# troubleshooting hints and re-raise: continuing would leave `spark` undefined
# (or bound to the session stopped above) and make later cells fail with
# misleading errors.
try:
    spark = build("test_data_smoke_tests")
except Exception as e:
    print("Failed to create SparkSession:", e)
    print("If running remotely, make sure this kernel has access to Spark and the Iceberg jars/configs.")
    raise
else:
    print("Spark session created successfully!")
    print("Spark master:", spark.sparkContext.master)
    print("Spark UI:", spark.sparkContext.uiWebUrl)
    print("Spark version:", spark.version)

Spark session created successfully!
Spark master: local[*]
Spark UI: http://khoa-master:4040
Spark version: 4.0.0


In [7]:
# Test 1: Check if the bronze table exists
table_name = "hadoop_catalog.aq.raw_open_meteo_hourly"

try:
    # First ask the catalog API whether the table is known.
    table_exists = spark.catalog.tableExists(table_name)
    print(f"Table '{table_name}' exists: {table_exists}")

    if not table_exists:
        # Second opinion: a successful DESCRIBE also counts as "exists".
        try:
            spark.sql(f"DESCRIBE TABLE {table_name}")
        except Exception as describe_error:
            print(f"Table '{table_name}' not accessible: {describe_error}")
        else:
            table_exists = True
            print(f"Table '{table_name}' found via SQL DESCRIBE")

except Exception as lookup_error:
    # Any failure of the lookup itself is treated as "table missing".
    print(f"Error checking table existence: {lookup_error}")
    table_exists = False

# Hard gate: nothing below makes sense without the table.
assert table_exists, f"Bronze table '{table_name}' does not exist"
print("✓ Table existence check passed")

Table 'hadoop_catalog.aq.raw_open_meteo_hourly' exists: True
✓ Table existence check passed


In [8]:
# Test 2: Count rows in the bronze table
# On failure `count` falls back to 0 so the final assertion cell still fails
# the run. The success marker is printed only when the query succeeded, so the
# output never shows a check mark after an error.
try:
    count = spark.sql(f"SELECT COUNT(*) as row_count FROM {table_name}").collect()[0].row_count
except Exception as e:
    print(f"Error counting rows: {e}")
    count = 0
    print("✗ Row count check failed")
else:
    print(f"Total rows in '{table_name}': {count}")
    print(f"Row count: {count}")
    print("✓ Row count check completed")

[Stage 0:>                                                          (0 + 1) / 1]

Total rows in 'hadoop_catalog.aq.raw_open_meteo_hourly': 45288
Row count: 45288
✓ Row count check completed


                                                                                

In [9]:
# Test 3: Check timestamp column (ts) has sensible min and max values
# On failure `min_max` is filled with None/0 placeholders so the final
# assertion cell still fails the run. The success marker is printed only when
# the query succeeded.
try:
    min_max_query = f"""
    SELECT 
        MIN(ts) as min_ts,
        MAX(ts) as max_ts,
        COUNT(DISTINCT ts) as unique_timestamps
    FROM {table_name}
    WHERE ts IS NOT NULL
    """

    # The aggregate query yields exactly one row.
    result = spark.sql(min_max_query).collect()[0]
except Exception as e:
    print(f"Error checking timestamps: {e}")
    min_max = {'min_ts': None, 'max_ts': None, 'unique_timestamps': 0}
    print("✗ Timestamp range check failed")
else:
    min_max = {
        'min_ts': result.min_ts,
        'max_ts': result.max_ts,
        'unique_timestamps': result.unique_timestamps
    }

    print("Timestamp range:")
    print(f"  Min: {min_max['min_ts']}")
    print(f"  Max: {min_max['max_ts']}")
    print(f"  Unique timestamps: {min_max['unique_timestamps']}")
    print("✓ Timestamp range check completed")



Timestamp range:
  Min: 2024-01-01 00:00:00
  Max: 2025-09-20 23:00:00
  Unique timestamps: 15096
✓ Timestamp range check completed


                                                                                

In [10]:
# Test 4: Display sample rows from the bronze table
# Display only: a failure is reported but does not abort the notebook.
# The success marker is printed only when the sample was actually shown.
try:
    print(f"Sample rows from '{table_name}':")
    sample_df = spark.sql(f"SELECT * FROM {table_name} LIMIT 5")
    sample_df.show(truncate=False)

    # Also show table schema
    print(f"\nTable schema for '{table_name}':")
    sample_df.printSchema()
except Exception as e:
    print(f"Error displaying sample data: {e}")
    print("✗ Sample data display failed")
else:
    print("✓ Sample data display completed")

Sample rows from 'hadoop_catalog.aq.raw_open_meteo_hourly':
+-----------+---------+----------+-------------------+---------------------+------------------+------------------+----+------------------+-----+------------------+---------------+------------------+------------------+----------+------------------------------------+-------------------------+
|location_id|latitude |longitude |ts                 |aerosol_optical_depth|pm2_5             |pm10              |dust|nitrogen_dioxide  |ozone|sulphur_dioxide   |carbon_monoxide|uv_index          |uv_index_clear_sky|source    |run_id                              |ingested_at              |
+-----------+---------+----------+-------------------+---------------------+------------------+------------------+----+------------------+-----+------------------+---------------+------------------+------------------+----------+------------------------------------+-------------------------+
|Hà Nội     |21.028511|105.804817|2025-04-22 00:00:00|0.62000000

In [11]:
# Final gate: any unmet expectation raises AssertionError and fails the notebook.
# `count` and `min_max` come from the row-count and timestamp cells above.
assert count > 0, 'Bronze table is empty (expected > 0 rows)'
assert not (min_max['min_ts'] is None or min_max['max_ts'] is None), 'ts column has null min/max'
print('All checks passed')

All checks passed
