# Silver Layer Validation
Queries to sanity-check Bronze → Silver transformations. Adjust the parameters below, re-run, and review counts, null rates, and detailed windows.

In [None]:
from pyspark.sql import functions as F
from datetime import datetime, timezone, timedelta
from aq_lakehouse.spark_session import build

# Build the Spark session via the project helper; the argument is presumably the
# application name — confirm in aq_lakehouse.spark_session.build.
spark = build("silver_validation_notebook")
# Set the SQL session time zone to UTC; the window bounds configured in this
# notebook are UTC-aware datetimes.
spark.conf.set("spark.sql.session.timeZone", "UTC")

In [None]:
# Baseline validation window, expressed as UTC-aware datetimes.
# load_range() compares with >= / <=, so both bounds are inclusive.
START_TS = datetime(year=2024, month=1, day=1, tzinfo=timezone.utc)
END_TS = datetime(year=2024, month=1, day=7, hour=23, tzinfo=timezone.utc)
# Location to inspect; set to None to inspect all locations.
TARGET_LOCATION = "Hà Nội"

In [None]:
def load_range(table: str, ts_column: str, start_ts=START_TS, end_ts=END_TS, location_id=TARGET_LOCATION):
    """Return the rows of ``table`` whose ``ts_column`` lies in [start_ts, end_ts].

    Args:
        table: Name passed to ``spark.table``.
        ts_column: Timestamp column the window filter is applied to.
        start_ts: Inclusive lower bound of the window.
        end_ts: Inclusive upper bound of the window.
        location_id: When truthy, additionally keep only rows whose
            ``location_id`` column equals this value; a falsy value
            (e.g. ``None``) leaves all locations in place.

    Returns:
        The filtered DataFrame (lazy; nothing is executed here).
    """
    ts = F.col(ts_column)
    in_window = (ts >= F.lit(start_ts)) & (ts <= F.lit(end_ts))
    selected = spark.table(table).where(in_window)
    if not location_id:
        return selected
    return selected.where(F.col("location_id") == location_id)

def counts_by_day_location(table: str, ts_column: str):
    """Print row counts per location and calendar day for the default window.

    The table is loaded through ``load_range`` with its default window and
    location. If nothing matches, a one-line notice is printed instead of a
    result table. Returns ``None`` in both cases.
    """
    windowed = load_range(table, ts_column)
    if windowed.rdd.isEmpty():
        print(f"{table}: no rows in selected window")
        return

    # Bucket each row by the date part of its timestamp, then count per bucket.
    with_day = windowed.withColumn("date_utc", F.to_date(F.col(ts_column)))
    per_day = with_day.groupBy("location_id", "date_utc").count()
    per_day.orderBy("location_id", "date_utc").show(truncate=False)

def null_rates(df):
    """Print, for every column of ``df``, the fraction of rows that are null.

    An empty DataFrame only produces a notice, since a ratio over zero rows is
    meaningless. The result is shown vertically (one column name per line).
    """
    if df.count() == 0:
        print("No rows -> no null stats")
        return

    def _null_share(name):
        # Averaging a 1.0/0.0 indicator yields the null fraction of the column.
        indicator = F.when(F.col(name).isNull(), 1.0).otherwise(0.0)
        return F.avg(indicator).alias(name)

    shares = [_null_share(name) for name in df.columns]
    df.select(shares).show(vertical=True, truncate=False)

def sample_window(table: str, ts_column: str, hours: int = 24):
    """Show the rows of the first ``hours``-long stretch inside the default window.

    The stretch starts at the earliest ``ts_column`` value returned by
    ``load_range`` and ends ``hours - 1`` hours later; both ends are inclusive.
    Prints a notice and returns when no rows are available.
    """
    df = load_range(table, ts_column)
    earliest = df.orderBy(ts_column).select(ts_column).limit(1).collect()
    if len(earliest) == 0:
        print(f"{table}: no rows available")
        return

    window_start = earliest[0][0]
    # Inclusive upper bound, hence "hours - 1" after the first timestamp.
    window_end = window_start + timedelta(hours=hours - 1)

    ts = F.col(ts_column)
    in_sample = (ts >= F.lit(window_start)) & (ts <= F.lit(window_end))
    df.where(in_sample).orderBy(ts_column).show(truncate=False)


## Counts by day & location

In [None]:
# Daily counts for the Bronze table and each Silver table, in that order.
for checked_table, checked_ts in (
    ("hadoop_catalog.aq.raw_open_meteo_hourly", "ts"),
    ("hadoop_catalog.aq.silver.air_quality_hourly_clean", "ts_utc"),
    ("hadoop_catalog.aq.silver.aq_components_hourly", "ts_utc"),
    ("hadoop_catalog.aq.silver.aq_index_hourly", "ts_utc"),
):
    counts_by_day_location(checked_table, checked_ts)

## Null ratios

In [None]:
# Null ratios for each Silver table over the default window.
for silver_table in (
    "hadoop_catalog.aq.silver.air_quality_hourly_clean",
    "hadoop_catalog.aq.silver.aq_components_hourly",
    "hadoop_catalog.aq.silver.aq_index_hourly",
):
    null_rates(load_range(silver_table, "ts_utc"))

## Sample windows

In [None]:
# Show the first 24 hours of each Silver table inside the default window.
for sampled_table in (
    "hadoop_catalog.aq.silver.air_quality_hourly_clean",
    "hadoop_catalog.aq.silver.aq_components_hourly",
    "hadoop_catalog.aq.silver.aq_index_hourly",
):
    sample_window(sampled_table, "ts_utc", hours=24)

## Quick SQL checks
Quickly generate SQL queries to cross-check row counts and null rates between Bronze and Silver over the same window. Edit them as needed before running.

In [None]:
from textwrap import dedent

# Render both window bounds as 'YYYY-MM-DD HH:MM:SS' text, the form embedded
# in the TIMESTAMP literals of the generated SQL.
START_STR, END_STR = (
    bound.strftime("%Y-%m-%d %H:%M:%S") for bound in (START_TS, END_TS)
)


# Sentinel meaning "use the notebook-level TARGET_LOCATION"; needed because
# None is itself a valid value ("all locations").
_USE_TARGET_LOCATION = object()


def daily_counts_sql(
    table: str,
    ts_col: str,
    *,
    start_str: "str | None" = None,
    end_str: "str | None" = None,
    location_id: object = _USE_TARGET_LOCATION,
) -> str:
    """Build a SQL query counting rows per ``location_id`` and day.

    Args:
        table: Table name, interpolated verbatim into the FROM clause.
        ts_col: Timestamp column used for the window filter and for the
            ``DATE(...)`` bucket (aliased ``date_utc``).
        start_str: Lower bound of the BETWEEN filter, formatted as
            'YYYY-MM-DD HH:MM:SS'. Defaults to the notebook's ``START_STR``.
        end_str: Upper bound of the BETWEEN filter, same format. Defaults to
            the notebook's ``END_STR``.
        location_id: Location to filter on; a falsy value (``None`` or ``""``)
            omits the filter. Defaults to the notebook's ``TARGET_LOCATION``.

    Returns:
        The query text, one clause per line.

    Note:
        Values are inserted by plain string interpolation without escaping, so
        a location containing a single quote would produce invalid SQL. This
        is acceptable for the notebook's own constants, not for untrusted
        input.
    """
    # Resolve defaults at call time so edits to the config cell are picked up
    # without redefining this function.
    if start_str is None:
        start_str = START_STR
    if end_str is None:
        end_str = END_STR
    if location_id is _USE_TARGET_LOCATION:
        location_id = TARGET_LOCATION

    clauses = [
        f"SELECT location_id, DATE({ts_col}) AS date_utc, COUNT(*) AS rows",
        f"FROM {table}",
        f"WHERE {ts_col} BETWEEN TIMESTAMP '{start_str}'",
        f"                   AND TIMESTAMP '{end_str}'",
    ]
    if location_id:
        clauses.append(f"  AND location_id = '{location_id}'")
    # The GROUP BY clause must interpolate ts_col as well; a plain (non-f)
    # string here would leave a literal "{ts_col}" in the query.
    clauses.append(f"GROUP BY location_id, DATE({ts_col})")
    clauses.append("ORDER BY location_id, date_utc")
    return "\n".join(clauses)

# For the Bronze table and each Silver table: print the generated query under
# a "-- <table>" header, then run it and show the result.
for table, ts_col in [
    ("hadoop_catalog.aq.raw_open_meteo_hourly", "ts"),
    ("hadoop_catalog.aq.silver.air_quality_hourly_clean", "ts_utc"),
    ("hadoop_catalog.aq.silver.aq_components_hourly", "ts_utc"),
    ("hadoop_catalog.aq.silver.aq_index_hourly", "ts_utc"),
]:
    sql = daily_counts_sql(table, ts_col)
    # Newlines are written as escapes: a quoted string literal cannot span
    # physical lines.
    print(f"\n-- {table}\n{sql}\n")
    spark.sql(sql).show(truncate=False)
