# DE1 — Final Project Notebook
> Authors: Couzinet Lorenzo & Rabahi Enzo

**Academic year:** 2025–2026  
**Program:** Data & Applications - Engineering - (FD)   
**Course:** Data Engineering I  

---

This notebook is the primary executable artifact. Fill in the config, run the baseline pipeline, then the optimized pipeline, and record the evidence.

Dataset used: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents

## 0. Load config

In [None]:
import yaml, pathlib, datetime
from pyspark.sql import SparkSession, functions as F, types as T
import os

# Force Spark onto the local loopback address (localhost) to avoid network errors.
os.environ["SPARK_LOCAL_IP"] = "127.0.0.1"
# NOTE(review): presumably a macOS fork-safety workaround for the Python
# workers — confirm it is still needed on the target machine.
os.environ["OBJC_DISABLE_INITIALIZE_FORK_SAFETY"] = "YES"

# Load the project configuration. The encoding is explicit so that non-ASCII
# values (e.g. accented paths) do not depend on the platform default.
with open("de1_project_config.yml", encoding="utf-8") as f:
    CFG = yaml.safe_load(f)

# spark.driver.host is the address the driver advertises; spark.driver.bindAddress
# is the address its listening sockets bind to. bindAddress is set explicitly
# instead of relying on its fallback to spark.driver.host.
spark = (
    SparkSession.builder
    .appName("DE1-Project-Lakehouse")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

# Create the output directories if they do not exist yet.
for path in [CFG['output_path'], CFG['proof_path']]:
    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

print(f"Config loaded. Output base: {CFG['output_path']}")

26/01/04 11:48:54 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:59)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.command

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.net.BindException: Can't assign requested address: Service 'sparkDriver' failed after 16 retries (on a random free port)! Consider explicitly setting the appropriate binding address for the service 'sparkDriver' (for example spark.driver.bindAddress for SparkDriver) to the correct binding address.
	at java.base/sun.nio.ch.Net.bind0(Native Method)
	at java.base/sun.nio.ch.Net.bind(Net.java:555)
	at java.base/sun.nio.ch.ServerSocketChannelImpl.netBind(ServerSocketChannelImpl.java:337)
	at java.base/sun.nio.ch.ServerSocketChannelImpl.bind(ServerSocketChannelImpl.java:294)
	at io.netty.channel.socket.nio.NioServerSocketChannel.doBind(NioServerSocketChannel.java:141)
	at io.netty.channel.AbstractChannel$AbstractUnsafe.bind(AbstractChannel.java:561)
	at io.netty.channel.DefaultChannelPipeline$HeadContext.bind(DefaultChannelPipeline.java:1281)
	at io.netty.channel.AbstractChannelHandlerContext.invokeBind(AbstractChannelHandlerContext.java:600)
	at io.netty.channel.AbstractChannelHandlerContext.bind(AbstractChannelHandlerContext.java:579)
	at io.netty.channel.DefaultChannelPipeline.bind(DefaultChannelPipeline.java:922)
	at io.netty.channel.AbstractChannel.bind(AbstractChannel.java:259)
	at io.netty.bootstrap.AbstractBootstrap$2.run(AbstractBootstrap.java:380)
	at io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:173)
	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:166)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:569)
	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
	at java.base/java.lang.Thread.run(Thread.java:833)


## 1. Bronze — landing raw data

In [None]:
# Resolve the input glob and the output locations from the project config.
raw_glob = CFG["paths"]["raw_csv_glob"]
bronze = CFG["paths"]["bronze"]
proof = CFG["paths"]["proof"]

# Read the raw CSV files, using their first line as column names.
df_raw = spark.read.option("header", "true").csv(raw_glob)

# Keep the raw data as a CSV copy. The header is written explicitly: Spark
# omits it by default, which would drop the column names from the bronze copy
# even though the source files were read with a header.
df_raw.write.mode("overwrite").option("header", "true").csv(bronze)
print("Bronze written:", bronze)

# Add audit columns: ingestion timestamp and source file of each row.
df_bronze_enhanced = (
    df_raw
    .withColumn("_ingested_at", F.current_timestamp())
    .withColumn("_source_file", F.input_file_name())
)

# Verification only: the enriched DataFrame is previewed here, it is not
# written to disk by this cell.
print("Bronze enrichi généré. Aperçu des métadonnées :")
df_bronze_enhanced.select("_ingested_at", "_source_file").show(1, truncate=False)


KeyError: 'paths'

## 2. Silver — cleaning and typing

In [None]:
silver = CFG["paths"]["silver"]

# Example typing; adapt to dataset
# NOTE(review): "metric" and "date" look like template placeholder columns —
# confirm they exist in the ingested dataset before running this cell.
from pyspark.sql import functions as F, types as T

# Cast the measure to a double and parse the date column, one step at a time.
typed_metric = df_raw.withColumn("metric", F.col("metric").cast("double"))
typed_rows = typed_metric.withColumn("date", F.to_date(F.col("date")))

# Drop rows where either typed column is null.
df_silver = typed_rows.na.drop(subset=["metric", "date"])

# Persist the cleaned table as Parquet, replacing any previous run.
df_silver.write.parquet(silver, mode="overwrite")
print("Silver written:", silver)


## 3. Gold — analytics tables

In [None]:
gold = CFG["paths"]["gold"]

# Normalise the configured partition columns to a list. A scalar YAML value
# (e.g. `partition_by: date`) would otherwise be splatted character by
# character into partitionBy, and a null value would fail on unpacking.
partition_by = CFG["layout"]["partition_by"]
if isinstance(partition_by, str):
    partition_by = [partition_by]
else:
    partition_by = list(partition_by or [])

# Example gold Q1: sum of "metric" per "date".
gold_q1 = df_silver.groupBy("date").agg(F.sum("metric").alias("sum_metric"))

# Partition the output only when partition columns are configured.
gold_q1_writer = gold_q1.write.mode("overwrite")
if partition_by:
    gold_q1_writer = gold_q1_writer.partitionBy(*partition_by)
gold_q1_writer.parquet(f"{gold}/q1_daily")

print("Gold written:", gold)


## 4. Baseline plans and metrics

In [None]:
import os, datetime as _dt, pathlib

# Make sure the evidence directory exists before writing into it.
proof_dir = pathlib.Path(proof)
proof_dir.mkdir(parents=True, exist_ok=True)

# Example baseline plan: capture the executed physical plan of gold_q1 as text.
plan = gold_q1._jdf.queryExecution().executedPlan().toString()

# The report is a timestamp line followed by the plan, written in one call.
baseline_report = f"{_dt.datetime.now()}\n{plan}"
with open(f"{proof}/baseline_q1_plan.txt", "w") as f:
    f.write(baseline_report)
print("Saved baseline plan. Record Spark UI metrics now.")


## 5. Optimization — layout and joins

In [None]:
# Example: narrow projection and pre‑aggregation before write
df_silver_min = df_silver.select("date","metric")
gold_q1_opt = (df_silver_min.groupBy("date").agg(F.sum("metric").alias("sum_metric")))
gold_q1_opt.write.mode("overwrite").partitionBy(*partition_by).parquet(f"{gold}/q1_daily_opt")

plan_opt = gold_q1_opt._jdf.queryExecution().executedPlan().toString()
with open(f"{proof}/optimized_q1_plan.txt","w") as f:
    f.write(str(_dt.datetime.now())+"\n")
    f.write(plan_opt)
print("Saved optimized plan. Record Spark UI metrics now.")


## 6. Cleanup

In [7]:
# Shut down the Spark session now that the pipeline has finished.
stop_message = "Spark session stopped."
spark.stop()
print(stop_message)


Spark session stopped.
