In [0]:
# Landing directory that the streaming reader watches for CSV files.
input_path = "/Volumes/workspace/ecommerce/ecommerce_data/stream_input/"

# Start from an empty directory: remove whatever is there, then recreate it.
dbutils.fs.rm(input_path, True)
dbutils.fs.mkdirs(input_path)

# Two small sample CSV payloads, each carrying its own header row.
dummy_data_1 = "user_id,event_type,price\n1,view,0\n2,purchase,500\n"
dummy_data_2 = "user_id,event_type,price\n3,cart,100\n4,purchase,1200\n"

# Write each payload to its own file (third argument True = overwrite).
for csv_name, csv_body in (("file1.csv", dummy_data_1), ("file2.csv", dummy_data_2)):
    dbutils.fs.put(f"{input_path}{csv_name}", csv_body, True)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import current_timestamp

# Explicit schema for the incoming CSV events; every column is nullable.
stream_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("event_type", StringType(), True)
    .add("price", IntegerType(), True)
)

# Streaming CSV reader: files have a header row, and at most one file is
# consumed per trigger.
csv_stream_reader = (
    spark.readStream
    .schema(stream_schema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)
)

# Read from the input directory and add an ingest_timestamp column
# populated from current_timestamp().
stream_df = csv_stream_reader.csv(input_path).withColumn(
    "ingest_timestamp", current_timestamp()
)

print("ReadStream configured securely")

In [0]:
# Checkpoint location for the query and the table it writes into.
checkpoint_path = "/Volumes/workspace/ecommerce/ecommerce_data/checkpoints/ecommerce_stream_chk"
output_table_name = "workspace.ecommerce.streaming_output"

# Clean slate for each run: remove the previous checkpoint and target table.
dbutils.fs.rm(checkpoint_path, True)
spark.sql(f"DROP TABLE IF EXISTS {output_table_name}")

# Configure the Delta sink first, then launch the query as a separate step.
delta_sink = (
    stream_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
)

# toTable() starts the streaming query and returns its handle.
streaming_query = delta_sink.toTable(output_table_name)

In [0]:
import time

# Block until the availableNow query has finished processing the input files.
# A fixed sleep is a race: the output table may not exist yet, or may be only
# partially written, when the SELECT below runs. awaitTermination() also
# raises if the query terminated with an error, instead of hiding the failure.
streaming_query.awaitTermination()

display(spark.sql(f"SELECT * FROM {output_table_name} ORDER BY ingest_timestamp DESC"))

In [0]:
# Explicit cleanup of the streaming query handle created by toTable().
# NOTE(review): the query was started with trigger(availableNow=True), so it
# has presumably already finished by this point — confirm stop() is still wanted.
streaming_query.stop()