In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# 1. Load the four Bronze pollutant tables that will be merged below.
pm25_df = spark.read.table("dbo.Bronze_PM25")
pm10_df = spark.read.table("dbo.Bronze_PM10")
o3_df = spark.read.table("dbo.Bronze_O3")
no2_df = spark.read.table("dbo.Bronze_NO2")

# DEBUG: report the row count of every source table.
# DataFrame.count() is an action that launches a Spark job on every call, so
# each table is counted exactly once and the result is reused for the total.
# This also guarantees the expected total matches the per-table figures printed.
print("üîç VERIFYING INDIVIDUAL TABLE COUNTS:")
source_counts = {}
for source_label, source_df in (
    ("PM25", pm25_df),
    ("PM10", pm10_df),
    ("O3", o3_df),
    ("NO2", no2_df),
):
    source_counts[source_label] = source_df.count()
    print(f"  {source_label}: {source_counts[source_label]} rows")

# Number of rows the combined table should contain; checked again after the
# table has been written and read back.
total_expected = sum(source_counts.values())
print(f"  EXPECTED TOTAL: {total_expected} rows")

# 2. Stack the tables one at a time and report the running row count after
#    each step, so a step at which rows go missing is easy to spot.
#    NOTE: unionAll pairs columns by position (not by name) and keeps
#    duplicate rows.
print("\nüîç COMBINING STEP BY STEP:")
step1 = pm25_df.unionAll(pm10_df)
step2 = step1.unionAll(o3_df)
combined = step2.unionAll(no2_df)

# The unions above are lazy; each count() below is what actually runs them.
for step_label, step_df in (
    ("PM25 + PM10", step1),
    ("+ O3", step2),
    ("+ NO2", combined),
):
    print(f"  {step_label} = {step_df.count()} rows")

# 3. Check whether the combined data contains fully identical rows
print("\nüîç CHECKING FOR DUPLICATES:")
# The DataFrame union used above keeps every row (SQL UNION ALL semantics), so
# duplicates can only be detected by comparing against an explicit distinct().
distinct_combined = combined.distinct()

# Each count() is a separate Spark job (and distinct() needs a shuffle), so
# both figures are computed once and reused in the comparison and the message.
distinct_count = distinct_combined.count()
print(f"  Distinct rows in combined: {distinct_count}")

combined_count = combined.count()
if distinct_count < combined_count:
    print(f"  ‚ö†Ô∏è Found {combined_count - distinct_count} duplicate rows!")

# 4. Confirm that rows from every source made it into the combined data
print("\nüîç CHECKING PARAMETER DISTRIBUTION:")
has_param_column = "param_name" in combined.columns
if not has_param_column:
    # No parameter label to group by: fall back to eyeballing raw values.
    print("  Sample values from combined table:")
    combined.select("value").show(20)
else:
    # Row count per parameter, pulled to the driver for printing.
    param_counts = combined.groupBy("param_name").count().collect()
    print("  Parameter distribution:")
    for param_row in param_counts:
        param_label = param_row["param_name"]
        param_rows = param_row["count"]
        print(f"    {param_label}: {param_rows} rows")

# 5. Persist the combined data to the Lakehouse as a Delta table,
#    replacing any existing table of the same name.
table_name = "Bronze_Open_Air"
print(f"\nüíæ SAVING TO: dbo.{table_name}")

# Redistribute into 4 partitions before writing; the original author added
# this to sidestep suspected Spark optimization issues.
repartitioned = combined.repartition(4)
delta_writer = repartitioned.write.mode("overwrite").format("delta")
delta_writer.saveAsTable(f"dbo.{table_name}")

print(f"‚úÖ Saved as table: dbo.{table_name}")

# 6. Verify the write by reading the table back and comparing its row count
#    with the total computed from the source tables.
print("\nüîç VERIFYING SAVED TABLE:")
saved_df = spark.read.table(f"dbo.{table_name}")

# count() launches a Spark job on every call; run it once and reuse the value
# so the report, the comparison and the message all use the same figure.
saved_count = saved_df.count()
print(f"  Verified: {saved_count} rows in saved table")

# Double-check with SQL (independent of the DataFrame read above)
spark.sql(f"SELECT COUNT(*) as count FROM dbo.{table_name}").show()

print(f"\nüìä FINAL RESULT: {saved_count} rows (Expected: {total_expected})")
if saved_count == total_expected:
    # Report the real total instead of a hard-coded figure, so the message
    # stays correct when the source tables change size.
    print(f"üéâ SUCCESS! All {total_expected} rows saved!")
else:
    print(f"‚ö†Ô∏è  ISSUE: Missing {total_expected - saved_count} rows")

# Preview the first rows of the saved table.
saved_df.show(10)

StatementMeta(, 5de841aa-3bcf-415a-9ecd-dc523e8f7bb9, 3, Finished, Available, Finished)

üîç VERIFYING INDIVIDUAL TABLE COUNTS:
  PM25: 500 rows
  PM10: 500 rows
  O3: 500 rows
  NO2: 500 rows
  EXPECTED TOTAL: 2000 rows

üîç COMBINING STEP BY STEP:
  PM25 + PM10 = 1000 rows
  + O3 = 1500 rows
  + NO2 = 2000 rows

üîç CHECKING FOR DUPLICATES:
  Distinct rows in combined: 2000

üîç CHECKING PARAMETER DISTRIBUTION:
  Parameter distribution:
    PM2.5: 500 rows
    PM10: 500 rows
    O3: 500 rows
    NO2: 500 rows

üíæ SAVING TO: dbo.Bronze_Open_Air
‚úÖ Saved as table: dbo.Bronze_Open_Air

üîç VERIFYING SAVED TABLE:
  Verified: 2000 rows in saved table
+-----+
|count|
+-----+
| 2000|
+-----+


üìä FINAL RESULT: 2000 rows (Expected: 2000)
üéâ SUCCESS! All 2000 rows saved!
+-------------------+--------------+----------------+----------------+-----+-----+--------+----------+--------------------+------------+--------+-------------+-----------------+--------------------+---------+-----------+
|       utc_datetime|local_datetime|measurement_date|measurement_hour|value| unit