# Atelier Intégration des Données - OpenFoodFacts

Ce notebook implémente la chaîne ETL complète (Bronze -> Silver -> Gold) avec **PySpark**.
Il charge les données depuis un fichier JSONL, les nettoie, et les charge dans un Datamart MySQL.

In [None]:
# 1. Imports & Configuration
import sys
import os
import logging
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, ArrayType, MapType

# Logger configuration: timestamped, INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger("OFF_Notebook")

# Spark session.
# The builder chain is wrapped in parentheses rather than relying on backslash
# continuations: a backslash followed by trailing whitespace is a SyntaxError,
# and that kind of invisible character is easy to reintroduce.
spark = (
    SparkSession.builder
    .appName("OFF_Workshop_Notebook")
    # Pin the SQL session time zone to UTC.
    .config("spark.sql.session.timeZone", "UTC")
    # MySQL JDBC driver jar, given as a relative path.
    .config("spark.driver.extraClassPath", "mysql-connector-j-8.0.33.jar")
    .getOrCreate()
)

logger.info("Spark Session Created")

In [None]:
# 2. Configuration & parameters
# MySQL connection settings. Each value can be overridden through an
# environment variable so that real credentials do not have to be written
# into the notebook; the fallbacks are the original placeholder values.
db_props = {
    "user": os.environ.get("OFF_DB_USER", "root"),
    "password": os.environ.get("OFF_DB_PASSWORD", "password"),  # placeholder: change me
    "driver": "com.mysql.cj.jdbc.Driver"
}
jdbc_url = os.environ.get("OFF_JDBC_URL", "jdbc:mysql://localhost:3306/off_datamart")

# Data paths, resolved relative to the current working directory.
# We go up one level because the notebook lives in the 'projet' folder.
base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
raw_input_path = os.path.join(base_dir, "tests", "sample_data.jsonl")
bronze_path = os.path.join(base_dir, "data", "bronze")
silver_path = os.path.join(base_dir, "data", "silver")

logger.info(f"Input: {raw_input_path}")

## 3. Bronze Layer (Ingestion)

In [None]:
def get_bronze_schema():
    """Build the explicit schema applied when reading the raw JSONL file.

    Returns:
        StructType: the top-level product fields plus a nested
        ``nutriments`` struct whose fields are all doubles. Every field
        is declared nullable.
    """
    # Names of the nutriment fields kept from the raw records, in schema order.
    nutriment_names = (
        "energy-kcal_100g",
        "sugars_100g",
        "fat_100g",
        "saturated-fat_100g",
        "salt_100g",
        "sodium_100g",
        "proteins_100g",
        "fiber_100g",
    )
    nutriments_schema = StructType(
        [StructField(field_name, DoubleType(), True) for field_name in nutriment_names]
    )

    # (name, type) pairs for the top-level fields, in schema order.
    top_level_fields = (
        ("code", StringType()),
        ("product_name", StringType()),
        ("brands", StringType()),
        ("categories_tags", ArrayType(StringType())),
        ("countries_tags", ArrayType(StringType())),
        ("nutriscore_grade", StringType()),
        ("nutriments", nutriments_schema),
        ("last_modified_t", LongType()),
    )
    return StructType(
        [StructField(field_name, field_type, True) for field_name, field_type in top_level_fields]
    )

# Ingestion: read the raw JSONL with the explicit Bronze schema, then
# persist the result as Parquet, replacing any previous Bronze output.
logger.info("Reading Raw Data...")
bronze_reader = spark.read.schema(get_bronze_schema())
df_raw = bronze_reader.json(raw_input_path)
df_raw.write.parquet(bronze_path, mode="overwrite")
logger.info("Bronze Layer Written.")

## 4. Silver Layer (Conformation)

In [None]:
# Silver transformations
logger.info("Processing Silver Layer...")
df_bronze = spark.read.parquet(bronze_path)

# 1. Deduplication: per product code, keep the row with the highest last_modified_t.
window_spec = Window.partitionBy("code").orderBy(F.col("last_modified_t").desc())
df_dedup = (
    df_bronze.withColumn("rn", F.row_number().over(window_spec))
    .filter(F.col("rn") == 1)
    .drop("rn")
)

# 2. Normalization (tags & units)
# Salt falls back to sodium * 2.5 when the salt value is missing.
salt_100g = F.coalesce(F.col("nutriments.salt_100g"), F.col("nutriments.sodium_100g") * 2.5)

df_silver = (
    df_dedup
    # Strip a leading two-character prefix followed by ':' (such as "en:") from every tag.
    .withColumn("countries_normalized", F.expr("transform(countries_tags, x -> regexp_replace(x, '^..:', ''))"))
    .withColumn("categories_normalized", F.expr("transform(categories_tags, x -> regexp_replace(x, '^..:', ''))"))
    # withColumn("nutriments.salt_100g", ...) does not update the nested field:
    # it adds a new top-level column whose name contains a dot. withField
    # (Spark >= 3.1) replaces the field inside the struct instead.
    .withColumn("nutriments", F.col("nutriments").withField("salt_100g", salt_100g))
)

df_silver.write.mode("overwrite").parquet(silver_path)
logger.info("Silver Layer Written.")

## 5. Gold Layer (Datamart Loading)

In [None]:
# Load the simple dimensions
logger.info("Loading Dimensions...")
df_s = spark.read.parquet(silver_path)

# Brand: distinct non-null brand values, exposed under the name brand_name.
brands = (
    df_s.filter(F.col("brands").isNotNull())
    .select(F.col("brands").alias("brand_name"))
    .distinct()
)

try:
    existing_brands = spark.read.jdbc(url=jdbc_url, table="dim_brand", properties=db_props)
    # The anti-join keeps only brand names that are absent from dim_brand.
    new_brands = brands.join(existing_brands, on="brand_name", how="left_anti")
    has_new_brands = new_brands.count() > 0
    if has_new_brands:
        new_brands.write.jdbc(url=jdbc_url, table="dim_brand", mode="append", properties=db_props)
    logger.info("Dimensions Loaded.")
except Exception as e:
    # Best-effort load: a database failure is logged and the notebook continues.
    logger.warning(f"DB Error (Check connection): {e}")

In [None]:
# Product loading (SCD2)
logger.info("Loading Products SCD2...")
# Hash of the tracked attributes, used for change detection.
track_cols = ["product_name", "brands", "categories_tags", "countries_tags", "nutriscore_grade"]
concat_expr = F.concat_ws("||", *[F.coalesce(F.col(c).cast("string"), F.lit("")) for c in track_cols])
# Rows without a code can never match an existing dimension row in the join
# below, so they would be appended again on every run; drop them up front.
df_input = (
    df_s.filter(F.col("code").isNotNull())
    .withColumn("row_hash", F.md5(concat_expr))
)

try:
    # Only the rows currently flagged as active are compared against the input.
    dbtable = "(SELECT product_sk, code, row_hash as current_hash FROM dim_product WHERE is_current = 1) as t"
    df_active = spark.read.jdbc(jdbc_url, dbtable, properties=db_props)
    df_joined = df_input.join(df_active, "code", "left")
    # Null-safe comparison: a plain != evaluates to NULL when current_hash is
    # NULL, which would silently exclude an existing row whose stored hash is NULL.
    df_new_or_changed = df_joined.filter(
        F.col("product_sk").isNull()
        | ~F.col("row_hash").eqNullSafe(F.col("current_hash"))
    )

    if df_new_or_changed.count() > 0:
        df_final = df_new_or_changed.select(
            F.col("code"),
            F.col("product_name"),
            F.lit(1).alias("is_current"),
            F.current_timestamp().alias("effective_from"),
            F.col("row_hash")
        )
        # NOTE(review): this only appends the new version; nothing in this cell
        # sets is_current = 0 on the superseded row. Confirm that closing the
        # previous version is handled elsewhere (e.g. on the database side),
        # otherwise a changed product ends up with several current rows.
        df_final.write.jdbc(jdbc_url, "dim_product", mode="append", properties=db_props)
        logger.info("Products Updated.")
    else:
        logger.info("No product changes.")
except Exception as e:
    # Best-effort load: a database failure is logged and the notebook continues.
    logger.warning(f"Product Load Error: {e}")

## 6. Qualité & Reporting

In [None]:
# Quality report: percentage of rows that carry a non-null nutriscore_grade.
total = df_s.count()
with_score = df_s.where(F.col("nutriscore_grade").isNotNull()).count()

# Guard against division by zero when the Silver dataset is empty.
if total > 0:
    completeness = (with_score / total) * 100
else:
    completeness = 0

report_lines = [
    "=== RAPPORT DE QUALITÉ ===",
    f"Produits traités : {total}",
    f"Complétude Nutri-Score : {completeness:.2f}%",
    "=========================",
]
print("\n".join(report_lines))