# DE1 — Final Project Notebook
> Author : Badr TAJINI - Data Engineering I - ESIEE 2025-2026
---

This notebook is the primary executable artifact of the project. Fill in the config, run the baseline pipeline, then the optimized pipeline, and record the evidence (saved plans and Spark UI metrics).

## 0. Load config

In [1]:
import yaml
from pyspark.sql import SparkSession, functions as F

# The YAML config is expected in the same directory as the notebook
# (the relative path below is resolved against the kernel's working directory).
# The encoding is stated explicitly so the file is read the same way regardless
# of the platform's default locale encoding.
with open("de1_project_config.yml", encoding="utf-8") as f:
    CFG = yaml.safe_load(f)

# Fail early with a clear message: later cells index CFG["paths"] and
# CFG["queries"] directly, and an empty YAML file loads as None.
if not isinstance(CFG, dict) or "paths" not in CFG or "queries" not in CFG:
    raise ValueError(
        "de1_project_config.yml must define top-level 'paths' and 'queries' mappings"
    )

# Reuse an existing Spark session if one is already running, otherwise start one.
spark = SparkSession.builder.appName("de1-project").getOrCreate()

# Display the loaded configuration as the cell output.
CFG


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/04 14:54:58 WARN Utils: Your hostname, Rana, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/01/04 14:54:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/04 14:54:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


{'paths': {'raw_tsv_glob': 'data/raw/*.tsv',
  'outputs_root': 'outputs/project',
  'bronze': 'outputs/project/bronze',
  'silver': 'outputs/project/silver',
  'gold': 'outputs/project/gold',
  'proof': 'proof',
  'metrics_log': 'project_metrics_log.csv'},
 'layout': {'target_file_size_mb': 128, 'num_partitions': 200},
 'queries': {'q1': {'name': 'Top pages by total clicks',
   'sql': 'SELECT\n  source_page AS page,\n  SUM(click_count) AS total_clicks\nFROM silver\nGROUP BY source_page\nORDER BY total_clicks DESC\nLIMIT 100\n'},
  'q2': {'name': 'Top targets per source',
   'sql': 'SELECT\n  source_page,\n  target_page,\n  SUM(click_count) AS total_clicks\nFROM silver\nGROUP BY source_page, target_page\nORDER BY total_clicks DESC\nLIMIT 100\n'},
  'q3': {'name': 'Top link types',
   'sql': 'SELECT\n  link_type,\n  COUNT(*) AS edges,\n  SUM(click_count) AS total_clicks\nFROM silver\nGROUP BY link_type\nORDER BY total_clicks DESC\nLIMIT 100\n'}}}

In [2]:
# Lower Spark's log verbosity so only errors reach the notebook output.
spark.sparkContext.setLogLevel("ERROR")   # or "WARN"


## 1. Bronze — landing raw data

In [3]:
import os
from pathlib import Path

# Resolve the locations this stage works with from the loaded config.
paths_cfg = CFG["paths"]
raw_glob = paths_cfg["raw_tsv_glob"]
bronze = paths_cfg["bronze"]
proof = paths_cfg["proof"]

# Read every file matching the glob as header-less, tab-separated text.
# No schema is supplied, so columns get Spark's positional names (_c0, _c1, ...).
# NOTE(review): the reader's default quote handling is left in place — confirm
# the raw fields never contain double quotes, or set the "quote" option.
df_raw = spark.read.options(header="false", sep="\t").csv(raw_glob)

# Land the rows as CSV under the bronze path, replacing any previous run.
# No options are set on the writer, so Spark's CSV writer defaults apply.
bronze_writer = df_raw.write.mode("overwrite")
bronze_writer.csv(bronze)
print("Bronze written to:", bronze)



                                                                                

Bronze written to: outputs/project/bronze


## 2. Silver — cleaning and typing

In [4]:
# 2. Silver — cleaning and typing (adapted to this dataset)

silver = CFG["paths"]["silver"]

from pyspark.sql import functions as F

# Name the four positional columns, type the click counter, and keep only rows
# with a usable non-negative count and non-empty page names. Exact duplicate
# rows are removed at the end.
df_silver = (
    df_raw
    .withColumnRenamed("_c0", "source_page")
    .withColumnRenamed("_c1", "target_page")
    .withColumnRenamed("_c2", "link_type")
    .withColumnRenamed("_c3", "click_count")
    .withColumn("click_count", F.col("click_count").cast("int"))
    # NOTE(review): presumably relies on unparsable values casting to NULL
    # (non-ANSI behaviour) — confirm against the Spark version/config in use.
    .filter(F.col("click_count").isNotNull())
    .filter(F.col("click_count") >= 0)
    .filter(F.length(F.col("source_page")) > 0)
    .filter(F.length(F.col("target_page")) > 0)
    .dropDuplicates()
)

# df_silver is lazy and not cached, so calling count() and then write() would
# run the whole pipeline (CSV scan + dropDuplicates shuffle) twice. Write once,
# then count the rows that were actually persisted.
df_silver.write.mode("overwrite").parquet(silver)
silver_count = spark.read.parquet(silver).count()

print(f"Silver written: {silver}, rows: {silver_count:,}")







                                                                                

Silver written: outputs/project/silver, rows: 6,072,131


## 3. Gold — analytics tables

In [5]:
# 3. Gold — analytics tables

gold = CFG["paths"]["gold"]
queries = CFG["queries"]

import pathlib
pathlib.Path(gold).mkdir(parents=True, exist_ok=True)

# Expose the silver DataFrame to Spark SQL under the table name the configured
# queries select from. The view is backed by df_silver's lazy lineage (nothing
# is cached), so each query below re-derives silver from the raw input.
df_silver.createOrReplaceTempView("silver")


def build_gold_table(query_key, table_dir):
    """Run one configured query, persist it as Parquet, and report its size.

    Args:
        query_key: Key under CFG["queries"] (e.g. "q1") whose "sql" is executed.
        table_dir: Sub-directory of the gold path that receives the Parquet output.

    Returns:
        (df, row_count): the lazy query DataFrame and the number of rows written.
    """
    df = spark.sql(queries[query_key]["sql"])
    out_path = f"{gold}/{table_dir}"
    # Write first and count the persisted output, so the query itself is
    # executed once instead of once for count() and again for write().
    df.write.mode("overwrite").parquet(out_path)
    row_count = spark.read.parquet(out_path).count()
    print(f"{query_key.upper()} written, rows: {row_count:,}")
    return df, row_count


# Output directory names are kept as-is so existing consumers of the gold
# folder keep working, even where the name differs from the query's content.

# Q1 — total clicks per source_page (config name: "Top pages by total clicks")
df_q1, q1_count = build_gold_table("q1", "q1_top_pages")

# Q2 — total clicks per (source_page, target_page) pair
df_q2, q2_count = build_gold_table("q2", "q2_top_transitions")

# Q3 — edge count and total clicks per link_type
df_q3, q3_count = build_gold_table("q3", "q3_high_traffic")

print("Gold written:", gold)


                                                                                

Q1 written, rows: 100


[Stage 27:>                                                       (0 + 16) / 17]



[Stage 34:>                                                       (0 + 16) / 17]







                                                                                

Q2 written, rows: 100


                                                                                

Q3 written, rows: 3
Gold written: outputs/project/gold


In [6]:
# Run Q1 on its own so its execution shows up as a separate step
# (presumably to read its metrics in the Spark UI — confirm).
print("RUN Q1")
# Rebinds df_q1; the baseline-plan cell later dumps the plan of this DataFrame.
df_q1 = spark.sql(queries["q1"]["sql"])
# count() is the action that triggers execution; its value is the cell output.
df_q1.count()


RUN Q1


                                                                                

100

In [7]:
# Run Q2 on its own so its execution shows up as a separate step
# (presumably to read its metrics in the Spark UI — confirm).
print("RUN Q2")
# Rebinds df_q2; the baseline-plan cell later dumps the plan of this DataFrame.
df_q2 = spark.sql(queries["q2"]["sql"])
# count() is the action that triggers execution; its value is the cell output.
df_q2.count()


RUN Q2


[Stage 60:>                                                       (0 + 16) / 17]



                                                                                

100

In [8]:
# Run Q3 on its own so its execution shows up as a separate step
# (presumably to read its metrics in the Spark UI — confirm).
print("RUN Q3")
# Rebinds df_q3; the baseline-plan cell later dumps the plan of this DataFrame.
df_q3 = spark.sql(queries["q3"]["sql"])
# count() is the action that triggers execution; its value is the cell output.
df_q3.count()

RUN Q3


                                                                                

3

## 4. Baseline plans and metrics

In [9]:

# 4. Baseline plans and metrics (save plans to proof/)
import datetime
import pathlib

# Evidence directory taken from the config; create it (and any missing
# parents) up front so plan files can be written into it.
proof = CFG["paths"]["proof"]
proof_dir = pathlib.Path(proof)
proof_dir.mkdir(parents=True, exist_ok=True)

def save_plan(df, name, prefix="baseline", out_dir=None):
    """Write a timestamped dump of a DataFrame's physical plan to a text file.

    The file is ``<out_dir>/<prefix>_<name>_plan.txt``. Its first line is the
    current local timestamp; the rest is the plan text.

    Args:
        df: Spark DataFrame whose executed (physical) plan is dumped.
        name: Short query label used in the file name, e.g. "q1".
        prefix: Run-variant label used in the file name. Defaults to
            "baseline", which reproduces the original file names.
        out_dir: Directory to write into. Defaults to the notebook-level
            ``proof`` directory when omitted.
    """
    target_dir = proof if out_dir is None else out_dir
    # Create the directory here so the helper also works when called from a
    # cell that did not prepare it.
    pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)
    # _jdf is the handle on the underlying JVM Dataset; going through it
    # returns the plan as a string instead of printing it like df.explain().
    plan = df._jdf.queryExecution().executedPlan().toString()
    out = f"{target_dir}/{prefix}_{name}_plan.txt"
    with open(out, "w", encoding="utf-8") as f:
        f.write(str(datetime.datetime.now()) + "\n")
        f.write(plan)
    print("Saved:", out)

# Dump the plan of whichever DataFrames are currently bound to
# df_q1 / df_q2 / df_q3 (each built from the configured SQL in an earlier cell),
# in query order.
for label, frame in (("q1", df_q1), ("q2", df_q2), ("q3", df_q3)):
    save_plan(frame, label)

print("Saved baseline plans. Now open Spark UI ")


Saved: proof/baseline_q1_plan.txt
Saved: proof/baseline_q2_plan.txt
Saved: proof/baseline_q3_plan.txt
Saved baseline plans. Now open Spark UI 


## 5. Optimization — layout and joins

In [11]:


import datetime, pathlib
from pyspark.sql import functions as F

silver = CFG["paths"]["silver"]
proof  = CFG["paths"]["proof"]
gold   = CFG["paths"]["gold"]
queries = CFG["queries"]
# `or {}` also covers a "layout:" key that is present but empty in the YAML.
layout = CFG.get("layout") or {}

pathlib.Path(proof).mkdir(parents=True, exist_ok=True)
pathlib.Path(gold).mkdir(parents=True, exist_ok=True)

# Start from the persisted silver Parquet.
df_silver_reloaded = spark.read.parquet(silver)

num_partitions = int(layout.get("num_partitions", 200))

# Re-layout: redistribute into a fixed number of partitions and order the rows
# of each partition by descending click_count before writing.
df_silver_opt = (df_silver_reloaded
    .repartition(num_partitions)
    .sortWithinPartitions(F.desc("click_count"))
)

silver_opt = f"{silver}_optimized"
df_silver_opt.write.mode("overwrite").parquet(silver_opt)
print(f"Optimized silver written: {silver_opt}")

# Query the files that were just written. Registering df_silver_opt itself
# would make every query redo the repartition + sort on top of the original
# silver files, and the optimized Parquet would never be read.
# NOTE: this replaces the "silver" view used by the baseline cells for the
# rest of the session.
df_silver_opt_persisted = spark.read.parquet(silver_opt)
df_silver_opt_persisted.createOrReplaceTempView("silver")

optimization_note = (
    f"Optimization: repartition({num_partitions}) + sortWithinPartitions(click_count desc)\n"
)


def save_optimized_plan(query_key):
    """Build one configured query on the optimized view and dump its plan.

    Writes ``<proof>/optimized_<query_key>_plan.txt`` containing a timestamp
    line, a line describing the layout optimization, and the physical plan.

    Args:
        query_key: Key under CFG["queries"] (e.g. "q1").

    Returns:
        (df, plan): the lazy query DataFrame and its plan text.
    """
    df = spark.sql(queries[query_key]["sql"])
    plan = df._jdf.queryExecution().executedPlan().toString()
    # UTF-8 is set explicitly, matching how the baseline plans are written.
    with open(f"{proof}/optimized_{query_key}_plan.txt", "w", encoding="utf-8") as f:
        f.write(str(datetime.datetime.now()) + "\n")
        f.write(optimization_note)
        f.write(plan)
    return df, plan


df_q1_opt, plan_q1_opt = save_optimized_plan("q1")
df_q2_opt, plan_q2_opt = save_optimized_plan("q2")
df_q3_opt, plan_q3_opt = save_optimized_plan("q3")

print("Saved optimized plans. Now run Q1-Q3 (optimized) and record Spark UI metrics.")


                                                                                

Optimized silver written: outputs/project/silver_optimized
Saved optimized plans. Now run Q1-Q3 (optimized) and record Spark UI metrics.


In [12]:
# Execute each query against the current "silver" view and report its row
# count. The counts are printed explicitly: a notebook cell only displays its
# final expression, so bare count() calls for the earlier queries would be
# silently dropped.
for query_key in ("q1", "q2", "q3"):
    print(f"RUN {query_key.upper()} OPT")
    # count() is the action that triggers execution of the query.
    opt_row_count = spark.sql(queries[query_key]["sql"]).count()
    print(f"{query_key.upper()} OPT rows: {opt_row_count:,}")


RUN Q1 OPT


                                                                                

RUN Q2 OPT


                                                                                

RUN Q3 OPT


                                                                                

3

## 6. Cleanup

In [13]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()
print("Spark session stopped.")


Spark session stopped.
