# Modul 5 - Part 1: Structured Streaming

**TODO:** Inhaltsverzeichnis ergänzen

## 5.1. Setup und Dataset laden

In [0]:
# Source tables are addressed by their three-part table identifiers
# (catalog.schema.table) and loaded with spark.read.table.

# Taxi trip data (an earlier version used the file path
# "/FileStore/tables/yellow_tripdata_2025_01-1.parquet" instead).
DATA_PATH = "workspace.default.yellow_tripdata_2025_01"
df_taxi = spark.read.table(DATA_PATH)

# Lookup table loaded alongside the trip data.
LOOKUP_PATH = "workspace.default.df_lookup"
df_lookup = spark.read.table(LOOKUP_PATH)

## 5.2. Streaming Quelle erzeugen

In [0]:
# Optional cleanup of objects from earlier runs (currently disabled):
#spark.sql("DROP TABLE IF EXISTS workspace.output.taxi_full");
#spark.sql("DROP TABLE IF EXISTS workspace.output.yellow_tripdata_2025_01");

#spark.sql("DROP SCHEMA IF EXISTS workspace.output");

# Schemas for the simulated streaming source table and the streaming result table.
# IF NOT EXISTS keeps this cell safe to re-run.
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.streaming_input");
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.streaming_output");
#spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.streaming_checkpoint");

# The streaming checkpoint is stored under
# /Volumes/workspace/streaming_output/checkpoint, i.e. in a volume named
# "checkpoint" inside the streaming_output schema. Create that volume here so
# the notebook also runs on a workspace where it has not been set up by hand.
spark.sql("CREATE VOLUME IF NOT EXISTS workspace.streaming_output.checkpoint");

In [0]:
import time

# Fully qualified names of the streaming source table and the result table.
input_path = "workspace.streaming_input.inputtable"
output_path = "workspace.streaming_output.outputtable"
#streaming_checkpoint = "workspace.streaming_checkpoint.checkpointtable"

# Requires df_taxi from the setup cell.
# NOTE: despite its name, rows_per_batch is the NUMBER of splits (100), not the
# number of rows in each split; every split gets the same weight 1/100.
rows_per_batch = 100
gewichte = 1/rows_per_batch

# Randomly partition the taxi data into 100 roughly equal mini-batches.
# The fixed seed makes the partitioning repeatable.
batches = df_taxi.randomSplit([gewichte for _ in range(rows_per_batch)], seed=42)

# Write the mini-batches one after another into the Delta source table to
# simulate data arriving over time.
for i, batch in enumerate(batches):
    # The first batch replaces whatever a previous run left in the table;
    # all following batches are appended to it.
    if i == 0:
        write_mode = "overwrite"
    else:
        write_mode = "append"
    delta_writer = batch.write.mode(write_mode).format("delta")
    delta_writer.saveAsTable(input_path)
    print(f"Wrote batch {i+1}")
    # Pause between writes so the batches arrive spread out in time.
    time.sleep(5)



## 5.3. Stream einlesen

In [0]:
# Open the table named by input_path as a streaming source
# (readStream instead of read yields a streaming DataFrame).
streaming_df = spark.readStream.table(input_path)


## 5.4. Stream verarbeiten durch Logik

In [0]:
from pyspark.sql.functions import avg, count

# Group the incoming trips by passenger count ...
trips_by_passenger_count = streaming_df.groupBy("passenger_count")

# ... and compute, per group, the number of trips and the average fare.
agg_df = trips_by_passenger_count.agg(
    count("*").alias("trip_count"),
    avg("fare_amount").alias("avg_fare"),
)


## 5.5. Stream in Zieltabelle schreiben

In [0]:
# Remove the checkpoint directory (and everything below it) left by a previous
# run. The same path is used as checkpointLocation by the streaming write below.
checkpoint_dir = "/Volumes/workspace/streaming_output/checkpoint"
dbutils.fs.rm(checkpoint_dir, recurse=True)

In [0]:
# Directory in which the streaming query keeps its checkpoint (progress) data.
volume = "/Volumes/workspace/streaming_output/checkpoint"

# Start the streaming write of the aggregated result into the output table.
query = (
    agg_df.writeStream
         .outputMode("complete")              # rewrite the full aggregate on every trigger
         .option("checkpointLocation", volume)
         .format("delta")
         .trigger(availableNow=True)          # process all data currently available, then stop
         .toTable(output_path)                # documented PySpark API for writing a stream to a table
)

# toTable() returns as soon as the query has been started. Block until the
# availableNow run has finished so that the following cell reads the complete
# result instead of racing against the still-running stream.
query.awaitTermination()


In [0]:
%sql
-- Show the aggregated result that the streaming query wrote to the output table.
SELECT *
FROM workspace.streaming_output.outputtable

## 5.6. Delta Live Tables