In [0]:
from pyspark.sql.functions import explode, col, arrays_zip, current_timestamp

# Directory that holds the raw JSON files; used for both schema inference and
# the streaming read so the two can never drift apart.
LANDING_PATH = "/Volumes/weather_demo/landing_zone/landing"

# A streaming file source needs an explicit schema, so infer it once with a
# batch read of the files currently in the landing directory. The batch read
# must use the same "multiline" setting as the stream: without it, a JSON
# document that spans several lines is parsed line by line and the inferred
# schema degrades to a _corrupt_record column, which breaks the `hourly.*`
# expansion performed on raw_stream afterwards.
landing_schema = (
    spark.read
    .option("multiline", "true")
    .json(LANDING_PATH)
    .schema
)

# Stream the same directory, applying the inferred schema.
raw_stream = (
    spark.readStream
    .option("multiline", "true")
    .schema(landing_schema)
    .json(LANDING_PATH)
)

# Discover the field names nested under 'hourly' by expanding the struct and
# reading back the resulting column names.
hourly_expanded = raw_stream.select("hourly.*")
hourly_cols = hourly_expanded.columns

# Zip every array found under 'hourly' element-wise, then explode the zipped
# array so each element becomes its own row (as the struct column
# 'hourly_data') next to the latitude, longitude, city and fetch_time columns.
hourly_arrays = [col(f"hourly.{name}") for name in hourly_cols]
zipped_hours = arrays_zip(*hourly_arrays)
hourly_struct = raw_stream.select(
    "latitude",
    "longitude",
    "city",
    "fetch_time",
    explode(zipped_hours).alias("hourly_data"),
)

# Promote each field of the 'hourly_data' struct to a top-level column under
# its own name, keep the four pass-through columns, and stamp every row with
# the current timestamp as 'ingestion_time'.
passthrough_cols = ["latitude", "longitude", "city", "fetch_time"]
hourly_fields = [col(f"hourly_data.{name}").alias(name) for name in hourly_cols]
flattened = hourly_struct.select(*passthrough_cols, *hourly_fields)
bronze_stream = flattened.withColumn("ingestion_time", current_timestamp())

# Append the flattened rows to the 'bronze_weather' Delta table.
# - The checkpoint location records stream progress between runs.
# - trigger(availableNow=True) processes the data available at start and then
#   stops, so this query is finite.
# - mergeSchema is enabled so the write can add columns to the table schema.
# - toTable() is the documented PySpark API for starting a stream into a table.
bronze_query = (
    bronze_stream.writeStream
    .format("delta")
    .option("checkpointLocation", "dbfs:/Volumes/weather_demo/checkpoints/bronze_weather/")
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .toTable("bronze_weather")
)

# Starting the query returns immediately. Block until the available-now run has
# finished so that later cells querying bronze_weather see the completed load
# instead of racing the stream (or finding the table not yet created).
bronze_query.awaitTermination()


In [0]:
%sql
-- Show every column of every row currently stored in bronze_weather.
SELECT
  *
FROM
  bronze_weather