# 📊 04 - Analyse de Données avec Spark
Ce notebook a pour objectif de fournir des données d'analyse à partir des données déjà transformées, nettoyées et enrichies.


In [1]:
# 0. Stop any Spark session left over from a previous run of this notebook.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` is not defined yet, so there is nothing to stop.
    pass
except Exception as stop_error:
    # Best-effort cleanup: report the problem but still build a new session below.
    # (Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.)
    print(f"Could not stop the previous Spark session: {stop_error}")

# Re-create the SparkSession: local master, local filesystem, fixed driver binding.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
        # Named after this notebook (04 - analysis); the previous value
        # "03_Enrichment" was a leftover from the enrichment notebook.
        .appName("04_Analysis")
        .master("local[*]")
        .config("spark.hadoop.fs.defaultFS", "file:///")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "0.0.0.0")
        .getOrCreate()
)


In [2]:
from pyspark.sql.functions import col, lower, trim, count
import os

# 1. Load the enriched dataset: semicolon-separated CSV with a header row.
# os.path.abspath resolves the relative location against the current working directory.
path = os.path.abspath("../data/step3_enriched_csv")
df_enriched = spark.read.csv(path, sep=";", header=True)

In [3]:
# Most frequent brands (top 30).
# Missing/empty brands are discarded before aggregating; because the predicate
# only looks at the grouping key, this yields the same groups as filtering the
# aggregated rows, without counting and sorting groups that are thrown away.
top_brands = (
    df_enriched
    .filter(col("brands").isNotNull() & (col("brands") != ""))
    .groupBy("brands")
    .count()
    # The secondary key makes the 30-row cut reproducible when counts tie.
    .orderBy(col("count").desc(), col("brands").asc())
    .limit(30)
)

# Save for the visualisation step: single part file with a header row.
# No separator option is set here, so Spark's default applies.
top_brands.coalesce(1) \
    .write.option("header", "true") \
    .mode("overwrite") \
    .csv("../data/viz/top_brands")

In [4]:
# Most frequent countries (top 20).
# Missing/empty countries are discarded before aggregating; because the predicate
# only looks at the grouping key, this yields the same groups as filtering the
# aggregated rows, without counting and sorting groups that are thrown away.
top_countries = (
    df_enriched
    .filter(col("country").isNotNull() & (col("country") != ""))
    .groupBy("country")
    .count()
    # The secondary key makes the 20-row cut reproducible when counts tie.
    .orderBy(col("count").desc(), col("country").asc())
    .limit(20)
)

# Save for the visualisation step: single part file with a header row.
# No separator option is set here, so Spark's default applies.
top_countries.coalesce(1) \
    .write.option("header", "true") \
    .mode("overwrite") \
    .csv("../data/viz/top_countries")

In [5]:
# 📊 Average score_env_composite_flexible per country (ordered by how often each country appears)
from pyspark.sql import functions as F

# Row count per country
country_freq = df_enriched.groupBy("country").count()

# Average environmental score per country
score_by_country = (
    df_enriched
    .groupBy("country")
    .agg(F.avg("score_env_composite_flexible").alias("avg_score_env_composite_flexible"))
)

# Attach the frequency to each average, most frequent countries first.
# NOTE(review): this is a default (inner) join, which never matches null keys,
# so a null `country` group would be dropped here — confirm that is intended.
result_country = (
    score_by_country
    .join(country_freq, "country")
    .sort(F.col("count").desc())
)

# Export as a single semicolon-separated part file with a header row.
(
    result_country
    .coalesce(1)
    .write
    .csv("../data/viz/score_env_by_country", mode="overwrite", sep=";", header=True)
)


In [6]:
# Average score_composite per country (ordered by how often each country appears)

# Average composite score per country
score_composite_by_country = (
    df_enriched
    .groupBy("country")
    .agg(F.avg("score_composite").alias("avg_score_composite"))
)

# `country_freq` (row count per country) and `F` come from an earlier cell.
# NOTE(review): this is a default (inner) join, which never matches null keys,
# so a null `country` group would be dropped here — confirm that is intended.
result_composite_country = (
    score_composite_by_country
    .join(country_freq, "country")
    .sort(F.col("count").desc())
)

# Export as a single semicolon-separated part file with a header row.
(
    result_composite_country
    .coalesce(1)
    .write
    .csv("../data/viz/score_composite_by_country", mode="overwrite", sep=";", header=True)
)


In [7]:
# 🏷️ Average score_env_composite_flexible per brand (ordered by frequency)

# Row count per brand
brand_freq = df_enriched.groupBy("brands").count()

# Average environmental score per brand (`F` is imported in an earlier cell)
score_by_brand = (
    df_enriched
    .groupBy("brands")
    .agg(F.avg("score_env_composite_flexible").alias("avg_score_env_composite_flexible"))
)

# Attach the frequency to each average, most frequent brands first.
# NOTE(review): this is a default (inner) join, which never matches null keys,
# so a null `brands` group would be dropped here — confirm that is intended.
result_brand = (
    score_by_brand
    .join(brand_freq, "brands")
    .sort(F.col("count").desc())
)

# Export as a single semicolon-separated part file with a header row.
(
    result_brand
    .coalesce(1)
    .write
    .csv("../data/viz/score_env_by_brand", mode="overwrite", sep=";", header=True)
)


In [8]:
# 🏷️ Average score_composite per brand (ordered by frequency)

# Average composite score per brand
score_composite_by_brand = (
    df_enriched
    .groupBy("brands")
    .agg(F.avg("score_composite").alias("avg_score_composite"))
)

# `brand_freq` (row count per brand) and `F` come from an earlier cell.
# NOTE(review): this is a default (inner) join, which never matches null keys,
# so a null `brands` group would be dropped here — confirm that is intended.
result_composite_brand = (
    score_composite_by_brand
    .join(brand_freq, "brands")
    .sort(F.col("count").desc())
)

# Export as a single semicolon-separated part file with a header row.
(
    result_composite_brand
    .coalesce(1)
    .write
    .csv("../data/viz/score_composite_by_brand", mode="overwrite", sep=";", header=True)
)


In [None]:
from pyspark.sql.functions import col

# Step 1: keep the two grouping columns, cast the three measures to double,
# and expose the result to Spark SQL as the temporary view "df_base".
# The hyphenated source column is renamed to energy_kcal_100g, the name the query uses.
df_base = df_enriched.select(
    "country",
    "brands",
    *[
        col(source_name).cast("double").alias(target_name)
        for source_name, target_name in (
            ("score_env_composite_flexible", "score_env_composite_flexible"),
            ("score_composite", "score_composite"),
            ("energy-kcal_100g", "energy_kcal_100g"),
        )
    ]
)

df_base.createOrReplaceTempView("df_base")

# Step 2: SQL query using CUBE over (country, brands); GROUPING / GROUPING_ID
# columns are selected so each output row records its grouping level.
query = """
SELECT
    country,
    brands,
    AVG(score_env_composite_flexible) AS avg_env_score,
    AVG(score_composite) AS avg_composite_score,
    AVG(energy_kcal_100g) AS avg_kcal,
    GROUPING(country) AS grouping_country,
    GROUPING(brands) AS grouping_brands,
    GROUPING_ID(country, brands) AS grouping_id
FROM df_base
GROUP BY CUBE(country, brands)
ORDER BY grouping_country, grouping_brands
"""

df_grouped = spark.sql(query)

# Step 3: preview the result without truncating cell values
df_grouped.show(truncate=False)

# Export as a single semicolon-separated part file with a header row.
(
    df_grouped
    .coalesce(1)
    .write
    .csv("../data/viz/cube_agg_country_brand_scores", mode="overwrite", sep=";", header=True)
)


+-------------+-----------------------------+--------------------+-------------------+------------------+----------------+---------------+-----------+
|country      |brands                       |avg_env_score       |avg_composite_score|avg_kcal          |grouping_country|grouping_brands|grouping_id|
+-------------+-----------------------------+--------------------+-------------------+------------------+----------------+---------------+-----------+
|france       |inkospor                     |2.3                 |1.8                |302.6779661016949 |0               |0              |0          |
|unitedstates |arrowhead                    |null                |2.771428571428572  |0.0               |0               |0              |0          |
|italy        |king island                  |3.5                 |null               |14.0              |0               |0              |0          |
|france       |verrière traiteur            |-0.45454545454545453|1.7684210526315787 |166.95  