In [1]:
# This notebook is the streaming version of the pipeline: it reads JSON files
# with Spark Structured Streaming and appends them to a Delta table.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    FloatType,
    IntegerType,
    TimestampType,
)
from pathlib import Path
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable

# Maven coordinates of the Delta Lake connector requested for this session.
delta_package = "io.delta:delta-spark_2.13:4.0.0"

# Start from a named builder, then apply each Delta-related setting in turn:
# the connector package, Delta's SQL extension, and the Delta catalog.
# NOTE(review): configure_spark_with_delta_pip (called below) may set
# "spark.jars.packages" itself — confirm whether the explicit entry is needed.
builder = SparkSession.builder.appName("SmartTech_Streaming")
for _conf_key, _conf_value in (
    ("spark.jars.packages", delta_package),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    (
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    ),
):
    builder = builder.config(_conf_key, _conf_value)

# Let the delta helper finish configuring the builder, then create (or reuse)
# the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Quick sanity output: Spark version in use and a confirmation message.
print(f"Spark version : {spark.version}")
print("Connecteur Delta chargé avec succès.")


:: loading settings :: url = jar:file:/Users/jean-thomasmiquelot/kDrive/PROGRAMMATION/simplon/Simplon_projets/esther_spark_streaming/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/jean-thomasmiquelot/.ivy2.5.2/cache
The jars for the packages stored in: /Users/jean-thomasmiquelot/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6dcfe60c-adee-430e-9ca0-396a9e9c8da7;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 95ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------

Spark version : 4.0.1
Connecteur Delta chargé avec succès.


In [2]:
# Schema applied to the streaming DataFrame. Every field is nullable, and the
# extra '_corrupt_record' string column is there to hold records in case of
# parsing errors.
schema_sensor = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("timestamp", TimestampType()),
            ("device_id", StringType()),
            ("building", StringType()),
            ("floor", IntegerType()),
            ("type", StringType()),
            ("value", FloatType()),
            ("unit", StringType()),
            ("_corrupt_record", StringType()),
        )
    ]
)

In [3]:
# Locations used by the pipeline, built as Path objects relative to the parent
# of the current working directory:
#   - INPUT_PATH: directory the JSON sensor files are read from
#   - BRONZE_PATH: directory of the bronze Delta table
#   - CHECKPOINT_PATH: directory used as the streaming checkpoint location
INPUT_PATH = Path.cwd().parent.joinpath("data", "sensor_data")
BRONZE_PATH = Path.cwd().parent.joinpath("data", "out", "delta_bronze")
CHECKPOINT_PATH = Path.cwd().parent.joinpath("data", "out", "checkpoint_bronze")


In [4]:
# Declare the streaming source (lazy: this only defines the DataFrame).
# JSON files under INPUT_PATH are read with the explicit sensor schema;
# 'maxFilesPerTrigger' is set to 1 and corrupt records are routed to the
# '_corrupt_record' column.
df_stream = spark.readStream.load(
    path=str(INPUT_PATH),
    format="json",
    schema=schema_sensor,
    maxFilesPerTrigger=1,
    columnNameOfCorruptRecord="_corrupt_record",
)

In [5]:
# Keep only complete rows (lazy): a row is dropped as soon as any of the seven
# sensor fields is NULL. The '_corrupt_record' column is not part of the test.
df_stream_clean = df_stream.where(
    ~(
        col("timestamp").isNull()
        | col("device_id").isNull()
        | col("building").isNull()
        | col("floor").isNull()
        | col("type").isNull()
        | col("value").isNull()
        | col("unit").isNull()
    )
)


In [6]:
# Write the cleaned stream to the 'bronze_sensor_data' Delta table in append
# mode, storing the data under BRONZE_PATH and the streaming checkpoint under
# CHECKPOINT_PATH, with a processing-time trigger of one second.
# The returned handle is kept in `query`.
query = (
    df_stream_clean.writeStream.trigger(processingTime="1 second")
    .outputMode("append")
    .format("delta")
    .options(
        checkpointLocation=str(CHECKPOINT_PATH),
        path=str(BRONZE_PATH),
    )
    .toTable("bronze_sensor_data")
)


25/12/17 14:07:27 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [7]:
# Handle on the bronze Delta table, looked up by its storage path.
bronze_delta = DeltaTable.forPath(sparkSession=spark, path=str(BRONZE_PATH))


In [8]:
# Show up to 50 entries of the bronze table's Delta history (version,
# timestamp, operation and its metrics) without truncating the columns.
# NOTE(review): the streaming write appears to run asynchronously, so right
# after start-up the history may list only the CREATE TABLE entry — confirm.
delta_table = DeltaTable.forPath(spark, str(BRONZE_PATH))
(
    delta_table.history()
    .select(["version", "timestamp", "operation", "operationMetrics"])
    .show(n=50, truncate=False)
)

25/12/17 14:07:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------------+------------+----------------+
|version|timestamp              |operation   |operationMetrics|
+-------+-----------------------+------------+----------------+
|0      |2025-12-17 14:07:26.082|CREATE TABLE|{}              |
+-------+-----------------------+------------+----------------+



                                                                                

In [None]:
# Preview the first rows of the bronze table.
# NOTE(review): the result can be empty if this runs before the streaming
# query has committed any data — confirm against the query's progress.
spark.sql(sqlQuery="SELECT *  FROM bronze_sensor_data LIMIT 20").show(
    n=20, truncate=True
)

                                                                                

+---------+---------+--------+-----+----+-----+----+---------------+
|timestamp|device_id|building|floor|type|value|unit|_corrupt_record|
+---------+---------+--------+-----+----+-----+----+---------------+
+---------+---------+--------+-----+----+-----+----+---------------+



25/12/17 14:07:34 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000} milliseconds, but spent 6671 milliseconds
25/12/17 14:07:37 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000} milliseconds, but spent 3291 milliseconds
25/12/17 14:07:39 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000} milliseconds, but spent 1985 milliseconds
25/12/17 14:07:41 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000} milliseconds, but spent 1827 milliseconds
25/12/17 14:07:42 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000} milliseconds, but spent 1321 milliseconds
25/12/17 14:07:43 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000} milliseconds, but spent 1378 milliseconds
25/12/17 14:07:46 WARN ProcessingTimeExecutor: Current batch is falling behind. The trig