# 📊 04 - Analyse de Données avec Spark
Ce notebook a pour objectif de fournir des données d'analyse à partir des données déjà transformées, nettoyées et enrichies.


In [1]:


# 0. Stop any Spark session left over from a previous run of this cell.
try:
    spark.stop()
except NameError:
    # First run in this kernel: `spark` does not exist yet, nothing to stop.
    pass
except Exception as exc:
    # Best effort: a stale or broken session must not prevent re-creation,
    # but report the failure instead of hiding it. Unlike a bare `except:`,
    # this lets KeyboardInterrupt / SystemExit propagate.
    print(f"Warning: could not stop the existing Spark session: {exc}")

# 1. Re-create the SparkSession, requesting 16 GB of memory.
from pyspark.sql import SparkSession

# NOTE(review): driver memory is presumably fixed when the JVM is launched;
# if a session already ran in this kernel, the 16g setting may only take
# effect after a kernel restart -- confirm.
spark = (
    SparkSession.builder
    .appName("04_analyse")
    .master("local[*]")  # use every available core
    .config("spark.hadoop.fs.defaultFS", "file:///")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.memory", "16g")         # 16 GB for the Spark driver
    .config("spark.executor.memory", "16g")       # 16 GB for executors (optional in local mode)
    .config("spark.sql.shuffle.partitions", "8")  # fewer shuffle partitions to limit thread/memory use
    .getOrCreate()
)


In [2]:
from pyspark.sql.functions import col, lower, trim, count
import os

# 1. Load the step-3 enriched dataset: a ";"-separated CSV with a header row,
#    located relative to the current working directory.
enriched_dir = os.path.join(os.getcwd(), "../data/step3_enriched_csv")
path = os.path.abspath(enriched_dir)
csv_reader = spark.read.options(header="true", sep=";")
df_enriched = csv_reader.csv(path)

In [3]:
# Most frequent brands: count rows per brand, sort by descending count,
# drop the null / empty-string brand groups and keep the first 30.
brand_counts = df_enriched.groupBy("brands").count()
has_brand = col("brands").isNotNull() & (col("brands") != "")
top_brands = (
    brand_counts
    .orderBy(col("count").desc())
    .filter(has_brand)
    .limit(30)
)

# Save for the visualisation step: header row, one partition, overwriting
# any previous output.
# NOTE(review): no "sep" option is set here, whereas the score exports in the
# later cells use ";" -- confirm the consumer expects the default separator.
brands_writer = top_brands.coalesce(1).write.option("header", "true")
brands_writer.mode("overwrite").csv("../data/viz/top_brands")

In [4]:
# Most frequent countries: count rows per country, sort by descending count,
# drop the null / empty-string country groups and keep the first 20.
country_counts = df_enriched.groupBy("country").count()
has_country = col("country").isNotNull() & (col("country") != "")
top_countries = (
    country_counts
    .orderBy(col("count").desc())
    .filter(has_country)
    .limit(20)
)

# Export with a header row from a single partition, overwriting any
# previous output.
# NOTE(review): no "sep" option is set here, whereas the score exports in the
# later cells use ";" -- confirm the consumer expects the default separator.
countries_writer = top_countries.coalesce(1).write.option("header", "true")
countries_writer.mode("overwrite").csv("../data/viz/top_countries")

In [5]:
# Average score_env_composite_flexible (environmental impact) per country,
# ordered by how often each country appears.
from pyspark.sql import functions as F

# Number of rows per country (reused by the next cell as well).
country_freq = df_enriched.groupBy("country").count()

# Mean environmental score per country.
score_by_country = df_enriched.groupBy("country").agg(
    F.avg("score_env_composite_flexible").alias("avg_score_env_composite_flexible")
)

# Attach the frequency to each average, most frequent country first.
# NOTE(review): the default inner join presumably drops the null-country
# group, since SQL equality never matches nulls -- confirm this is intended.
result_country = (
    score_by_country
    .join(country_freq, on="country")
    .orderBy(F.col("count").desc())
)

# Export as one ";"-separated CSV with a header, replacing previous output.
(
    result_country.coalesce(1)
    .write
    .options(header="true", sep=";")
    .mode("overwrite")
    .csv("../data/viz/score_env_by_country")
)


In [6]:
# Average score_composite (healthy-food score) per country, ordered by how
# often each country appears. Relies on `F` and `country_freq` defined in the
# previous cell.

# Mean composite score per country.
score_composite_by_country = df_enriched.groupBy("country").agg(
    F.avg("score_composite").alias("avg_score_composite")
)

# Attach the per-country frequency and sort by it, largest first.
result_composite_country = (
    score_composite_by_country
    .join(country_freq, on="country")
    .orderBy(F.col("count").desc())
)

# Export as one ";"-separated CSV with a header, replacing previous output.
(
    result_composite_country.coalesce(1)
    .write
    .options(header="true", sep=";")
    .mode("overwrite")
    .csv("../data/viz/score_composite_by_country")
)


In [7]:
# Average score_env_composite_flexible per brand, ordered by how often each
# brand appears. Relies on `F` imported in an earlier cell.

# Number of rows per brand (reused by the next cell as well).
brand_freq = df_enriched.groupBy("brands").count()

# Mean environmental score per brand.
score_by_brand = df_enriched.groupBy("brands").agg(
    F.avg("score_env_composite_flexible").alias("avg_score_env_composite_flexible")
)

# Attach the frequency to each average, most frequent brand first.
result_brand = (
    score_by_brand
    .join(brand_freq, on="brands")
    .orderBy(F.col("count").desc())
)

# Export as one ";"-separated CSV with a header, replacing previous output.
(
    result_brand.coalesce(1)
    .write
    .options(header="true", sep=";")
    .mode("overwrite")
    .csv("../data/viz/score_env_by_brand")
)


In [8]:
# Average score_composite per brand, ordered by how often each brand appears.
# Relies on `F` and `brand_freq` defined in earlier cells.

# Mean composite score per brand.
score_composite_by_brand = df_enriched.groupBy("brands").agg(
    F.avg("score_composite").alias("avg_score_composite")
)

# Attach the per-brand frequency and sort by it, largest first.
result_composite_brand = (
    score_composite_by_brand
    .join(brand_freq, on="brands")
    .orderBy(F.col("count").desc())
)

# Export as one ";"-separated CSV with a header, replacing previous output.
(
    result_composite_brand.coalesce(1)
    .write
    .options(header="true", sep=";")
    .mode("overwrite")
    .csv("../data/viz/score_composite_by_brand")
)


In [9]:
from pyspark.sql.functions import col, avg , trim, lower

# Step 1: base view for the averages. The two scores and the energy column
# are cast to double; the energy column is also renamed so that it has no
# hyphen in the SQL query below.
score_columns = ("score_env_composite_flexible", "score_composite")
df_base = df_enriched.select(
    col("country"),
    col("brands"),
    *[col(name).cast("double").alias(name) for name in score_columns],
    col("energy-kcal_100g").cast("double").alias("energy_kcal_100g"),
)
df_base.createOrReplaceTempView("df_base")

# Step 2: CUBE of the averages over (country, brands). The GROUPING /
# GROUPING_ID columns are selected alongside the averages so subtotal rows
# can be told apart from regular groups.
query_cube = """
SELECT
    country,
    brands,
    AVG(score_env_composite_flexible) AS avg_env_score,
    AVG(score_composite) AS avg_composite_score,
    AVG(energy_kcal_100g) AS avg_kcal,
    GROUPING(country) AS grouping_country,
    GROUPING(brands) AS grouping_brands,
    GROUPING_ID(country, brands) AS grouping_id
FROM df_base
GROUP BY CUBE(country, brands)
"""
df_grouped = spark.sql(query_cube)

# Step 3: nutritional proportions. Each "is_<trait>" flag is cast to int so
# that its average per (country, brands) group is the share of flagged rows.
# NOTE(review): df_enriched is read from CSV without a schema, so these flags
# arrive as strings; cast("int") presumably yields null for textual
# "true"/"false" values -- confirm the flags are stored as 0/1.
traits = (
    "vegan",
    "vegetarian",
    "sans_sucre",
    "protein_plus",
    "light",
    "ultra_transformed",
)
df_flags = df_enriched.select(
    col("country"), col("brands"),
    *[col(f"is_{trait}").cast("int") for trait in traits],
)

df_props = df_flags.groupBy("country", "brands").agg(
    *[avg(f"is_{trait}").alias(f"prop_{trait}") for trait in traits]
)

# Step 4: "majority" booleans, true when a proportion reaches the threshold.
threshold = 0.5
for trait in traits:
    df_props = df_props.withColumn(
        f"is_majority_{trait}", col(f"prop_{trait}") >= threshold
    )

# Step 5: left join of the CUBE rows with the per-(country, brands) proportions.
# NOTE(review): CUBE subtotal rows carry null keys, which presumably never
# match in this equi-join, leaving their prop_* / is_majority_* columns null
# -- confirm this is intended.
join_keys = ["country", "brands"]
df_final = df_grouped.join(df_props, on=join_keys, how="left")

# Step 6: export as one ";"-separated CSV with a header, replacing any
# previous output.
(
    df_final.coalesce(1)
    .write
    .options(header="true", sep=";")
    .mode("overwrite")
    .csv("../data/viz/cube_agg_country_brand_scores_enriched")
)