# Silver Layer Validation
Queries to sanity-check Bronze → Silver transformations. Adjust the parameters below, re-run, and review counts, null rates, and detailed windows.

In [None]:
import os
import sys
from pyspark.sql import functions as F
from datetime import datetime, timezone, timedelta

# Resolve the parent of the current working directory; this only points at the
# repository root when the notebook is run from a direct subfolder of it.
repo_root = os.path.abspath('..')
src_path = os.path.join(repo_root, 'src')
# Put <repo_root>/src at the front of sys.path (once) so the import below can be
# resolved -- presumably that is where the aq_lakehouse package lives; verify.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

from aq_lakehouse.spark_session import build

# NOTE(review): `build` is a project helper; it is assumed to return a
# SparkSession and to take the application name as its argument -- confirm.
spark = build('silver_validation_notebook')
# Pin the SQL session time zone to UTC for this notebook's queries.
spark.conf.set('spark.sql.session.timeZone', 'UTC')


In [None]:
# Configure the baseline window.
# Both bounds are timezone-aware UTC datetimes and are applied inclusively by load_range.
START_TS = datetime(2024, 1, 1, 0, 0, tzinfo=timezone.utc)
END_TS = datetime(2024, 9, 1, 23, 0, tzinfo=timezone.utc)
TARGET_LOCATION = "Hà Nội"  # compared against the location_id column; set to None to inspect all locations

In [None]:
def load_range(table: str, ts_column: str, start_ts=START_TS, end_ts=END_TS, location_id=TARGET_LOCATION):
    """Return the rows of ``table`` inside a timestamp window, or ``None``.

    Args:
        table: Catalog name of the table to read.
        ts_column: Name of the timestamp column the window is applied to.
        start_ts: Inclusive lower bound; defaults to ``START_TS``.
        end_ts: Inclusive upper bound; defaults to ``END_TS``.
        location_id: Value matched against the ``location_id`` column. Any
            falsy value (``None``, ``""``) disables the location filter.

    Returns:
        The filtered DataFrame, or ``None`` (after printing the reason) when
        the table is missing from the catalog or cannot be read.

    Note:
        The defaults are evaluated once, when this ``def`` executes. After
        editing ``START_TS`` / ``END_TS`` / ``TARGET_LOCATION`` this
        definition must be re-run as well for the new values to take effect.
    """
    try:
        # The catalog lookup may raise too, so it is guarded together with the
        # read instead of letting the exception escape a validation helper.
        if not spark.catalog.tableExists(table):
            print(f"{table}: table does not exist in catalog")
            return None
        df = spark.table(table)
    except Exception as exc:
        print(f"{table}: unable to read table -> {exc}")
        return None

    # Both bounds are inclusive.
    df = df.where((F.col(ts_column) >= F.lit(start_ts)) & (F.col(ts_column) <= F.lit(end_ts)))
    if location_id:
        df = df.where(F.col("location_id") == location_id)
    return df

def _safe_has_rows(df):
    try:
        return df.limit(1).count() > 0
    except Exception as exc:
        print(f"Failed to check for rows -> {exc}")
        return False

def counts_by_day_location(table: str, ts_column: str):
    """Print row counts per ``location_id`` and calendar date for ``table``.

    The table is read through ``load_range`` with its default window and
    location. The date is derived from ``ts_column`` into a ``date_utc``
    column. Nothing is returned; problems are printed rather than raised.
    """
    window_df = load_range(table, ts_column)
    if window_df is None:
        # load_range already printed why the table is unavailable.
        return
    if not _safe_has_rows(window_df):
        print(f"{table}: no rows in selected window or unable to read data")
        return

    group_keys = ("location_id", "date_utc")
    try:
        dated = window_df.withColumn("date_utc", F.to_date(F.col(ts_column)))
        daily_counts = dated.groupBy(*group_keys).count().orderBy(*group_keys)
        daily_counts.show(truncate=False)
    except Exception as exc:
        print(f"{table}: failed to compute counts -> {exc}")

def null_rates(table: str, ts_column: str):
    """Print, for every column of ``table``, the fraction of rows that are null.

    The table is read through ``load_range`` with its default window and
    location. Output is shown vertically, one line per column. Nothing is
    returned; display failures are printed rather than raised.
    """
    frame = load_range(table, ts_column)
    if frame is None:
        return
    if not _safe_has_rows(frame):
        print("No rows -> no null stats")
        return

    # Averaging a 1.0/0.0 null indicator gives the null fraction of a column.
    null_fractions = []
    for name in frame.columns:
        null_flag = F.when(F.col(name).isNull(), 1.0).otherwise(0.0)
        null_fractions.append(F.avg(null_flag).alias(name))

    try:
        frame.select(null_fractions).show(vertical=True, truncate=False)
    except Exception as exc:
        print(f"Failed to show null rates -> {exc}")

def sample_window(table: str, ts_column: str, hours: int = 24):
    """Print the first ``hours`` hours of rows of ``table``, ordered by time.

    The table is read through ``load_range`` with its default window and
    location. The sample starts at the earliest ``ts_column`` value found and
    ends ``hours - 1`` hours later, both bounds inclusive.

    Args:
        table: Catalog name of the table to sample.
        ts_column: Name of the timestamp column used for ordering and filtering.
        hours: Length of the sample window in hours.

    Returns:
        None. Rows are printed; problems are printed rather than raised.
    """
    df = load_range(table, ts_column)
    if df is None:
        return
    if not _safe_has_rows(df):
        print(f"{table}: no rows available")
        return
    try:
        start = df.orderBy(ts_column).select(ts_column).first()[0]
        # Inclusive upper bound, hence "hours - 1".
        end = start + timedelta(hours=hours - 1)
        # show() prints only 20 rows by default, which cut off the default
        # 24-hour window. Ask for at least one row per hour instead.
        # NOTE(review): assumes one row per hour for a single location; with
        # the location filter disabled the window can hold more rows than this.
        max_rows = max(int(hours), 20)
        (
            df.where((F.col(ts_column) >= F.lit(start)) & (F.col(ts_column) <= F.lit(end)))
              .orderBy(ts_column)
              .show(n=max_rows, truncate=False)
        )
    except Exception as exc:
        print(f"{table}: failed to show sample window -> {exc}")


## Counts by day & location

In [None]:
# Daily row counts for the raw source table and each Silver table,
# each paired with the name of its timestamp column.
for _table_name, _ts_col in (
    ("hadoop_catalog.aq.raw_open_meteo_hourly", "ts"),
    ("hadoop_catalog.aq.silver.air_quality_hourly_clean", "ts_utc"),
    ("hadoop_catalog.aq.silver.aq_components_hourly", "ts_utc"),
    ("hadoop_catalog.aq.silver.aq_index_hourly", "ts_utc"),
):
    counts_by_day_location(_table_name, _ts_col)

## Null ratios

In [None]:
# null_rates loads the window itself, so it takes the table name and the
# timestamp column. Passing it a DataFrame from load_range raised a TypeError
# for the missing ts_column argument.
null_rates("hadoop_catalog.aq.silver.air_quality_hourly_clean", "ts_utc")
null_rates("hadoop_catalog.aq.silver.aq_components_hourly", "ts_utc")
null_rates("hadoop_catalog.aq.silver.aq_index_hourly", "ts_utc")

## Sample windows

In [None]:
# Print a 24-hour sample from the start of the window for each Silver table.
for _silver_table in (
    "hadoop_catalog.aq.silver.air_quality_hourly_clean",
    "hadoop_catalog.aq.silver.aq_components_hourly",
    "hadoop_catalog.aq.silver.aq_index_hourly",
):
    sample_window(_silver_table, "ts_utc", hours=24)

## Quick SQL checks
Quickly build SQL queries to compare row counts and null rates between Bronze and Silver over the same window. Edit them as needed before running.