# Gold Layer: Streaming Ingestion
## Use Case: User Activity per Event Type per Day
### Aggregate:
Count of events (plus an approximate count of unique users) per event_type, grouped by day.

### Inputs/Outputs:
Input Silver Table: /mnt/s3mock/silver/realtime/events
Output Gold Table: /mnt/s3mock/gold/realtime/aggregates

### 1. Init Spark and S3 paths

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct, col, count, to_date

# Configure the session builder with the Delta Lake SQL extension and catalog,
# then obtain the session via getOrCreate().
_session_builder = SparkSession.builder.appName("GoldAggregation")
_session_builder = _session_builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
_session_builder = _session_builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
spark = _session_builder.getOrCreate()

# Storage locations used by the streaming job below.
silver_path = "/mnt/s3mock/silver/realtime/events"  # Delta table read as the stream source
gold_path = "/mnt/s3mock/gold/realtime/aggregates"  # Delta table the aggregates are written to
checkpoint_path = "/mnt/s3mock/checkpoints/gold_aggregates"  # streaming checkpoint location


### 2. Core logic

In [None]:
# Open the Silver Delta table as a streaming source.
silver_df = spark.readStream.format("delta").load(silver_path)

# Roll events up to one row per (event_date, event_type).
agg_df = (
    silver_df
    .withColumn("event_date", to_date(col("event_timestamp")))
    .groupBy("event_date", "event_type")
    .agg(
        # Approximate distinct count of user_id values in the group.
        approx_count_distinct("user_id").alias("unique_users"),
        # Exact number of rows (events) in the group. The previous
        # approx_count_distinct("timestamp") counted distinct timestamps,
        # which undercounts when events share a timestamp, and it referenced
        # a "timestamp" column rather than the "event_timestamp" column
        # used for event_date above.
        count("*").alias("event_count"),
    )
)

# Write the aggregates to the Gold Delta table. The aggregation has no
# watermark, so "complete" output mode is used: the full result table is
# emitted on every trigger.
query = (
    agg_df.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", checkpoint_path)
    .start(gold_path)
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()