# 🧹 02 - Nettoyage des données OpenFoodFacts
Ce notebook a pour objectif de charger les données brutes transformées en CSV et de nettoyer les colonnes inutiles, mal remplies ou trop bruitées.


In [1]:
# 0. Stoppe toute session existante
try:
    spark.stop()
except:
    pass

# 1. Recréation SparkSession en local, FS local et driver binding fixe
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
        .appName("NettoyageData")
        .master("local[*]")
        .config("spark.hadoop.fs.defaultFS", "file:///")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "0.0.0.0")
    .getOrCreate()
)


In [2]:
# 📥 Chargement des données depuis le fichier CSV 
input_path = "../data/step1_raw_csv"

df_cleaned  = (spark.read
    .option("header", "true")       # utilise la première ligne comme noms de colonnes
    .option("inferSchema", "true")  # déduit automatiquement les types
    .option("sep", ";")             # <— ici, on spécifie le point-virgule comme séparateur
    .csv(input_path)
)

# Vérifie que les colonnes sont bien nommées
df_cleaned.printSchema()

print(f"✅ Données chargées : {df_cleaned.count():,} lignes, {len(df_cleaned.columns)} colonnes")


root
 |-- code: double (nullable = true)
 |-- url: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- created_t: integer (nullable = true)
 |-- created_datetime: timestamp (nullable = true)
 |-- last_modified_t: integer (nullable = true)
 |-- last_modified_datetime: timestamp (nullable = true)
 |-- last_modified_by: string (nullable = true)
 |-- last_updated_t: integer (nullable = true)
 |-- last_updated_datetime: timestamp (nullable = true)
 |-- product_name: string (nullable = true)
 |-- abbreviated_product_name: string (nullable = true)
 |-- generic_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- packaging: string (nullable = true)
 |-- packaging_tags: string (nullable = true)
 |-- packaging_en: string (nullable = true)
 |-- packaging_text: string (nullable = true)
 |-- brands: string (nullable = true)
 |-- brands_tags: string (nullable = true)
 |-- brands_en: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- catego

In [3]:
# 🧾 Aperçu des colonnes initiales
for i, col_name in enumerate(df_cleaned.columns, start=1):
    print(f"{i:02d}. {col_name}")

01. code
02. url
03. creator
04. created_t
05. created_datetime
06. last_modified_t
07. last_modified_datetime
08. last_modified_by
09. last_updated_t
10. last_updated_datetime
11. product_name
12. abbreviated_product_name
13. generic_name
14. quantity
15. packaging
16. packaging_tags
17. packaging_en
18. packaging_text
19. brands
20. brands_tags
21. brands_en
22. categories
23. categories_tags
24. categories_en
25. origins
26. origins_tags
27. origins_en
28. manufacturing_places
29. manufacturing_places_tags
30. labels
31. labels_tags
32. labels_en
33. emb_codes
34. emb_codes_tags
35. first_packaging_code_geo
36. cities
37. cities_tags
38. purchase_places
39. stores
40. countries
41. countries_tags
42. countries_en
43. ingredients_text
44. ingredients_tags
45. ingredients_analysis_tags
46. allergens
47. allergens_en
48. traces
49. traces_tags
50. traces_en
51. serving_size
52. serving_quantity
53. no_nutrition_data
54. additives_n
55. additives
56. additives_tags
57. additives_en
58. 

In [4]:
# 🧼 Suppression des colonnes inutiles ou trop bruitées
colonnes_a_supprimer = [
    # 🔧 Métadonnées techniques et timestamps
    "code", "url", "creator", "created_t", "created_datetime",
    "last_modified_t", "last_modified_datetime", "last_modified_by",
    "last_updated_t", "last_updated_datetime", "owner",

    # 🗂️ Champs redondants multilingues
    "brands_tags", "brands_en", "categories_tags", "categories_en",
    "labels_tags", "labels_en", "origins_tags", "origins_en",
    "packaging_tags", "packaging_en", "packaging_text",
    "states", "states_tags", "states_en", "countries_tags", "countries_en",
    "food_groups_tags", "food_groups_en", "main_category_en",
    
    # 🧭 Infos géographiques ultra-locales ou rarement remplies
    "cities", "cities_tags", "emb_codes", "emb_codes_tags", "first_packaging_code_geo",
    "manufacturing_places_tags",

    # 🔍 Qualité de données / scan peu utile pour l’analyse de contenu
    "data_quality_errors_tags",
    # 🔍 Autres
    "pnns_groups_1","pnns_groups_2","unique_scans_n",

    # 📸 Données image
    "image_url", "image_small_url", "image_ingredients_url", "image_ingredients_small_url",
    "image_nutrition_url", "image_nutrition_small_url", "last_image_t", "last_image_datetime"
]


df_colonnes_cleaned = df_cleaned.drop(*[c for c in colonnes_a_supprimer if c in df_cleaned.columns])
print(f"✅ Colonnes après nettoyage : {len(df_colonnes_cleaned.columns)}")

# 📝 Aperçu des colonnes restantes
for i, col_name in enumerate(df_colonnes_cleaned.columns, start=1):
    print(f"{i:02d}. {col_name}")

✅ Colonnes après nettoyage : 161
01. product_name
02. abbreviated_product_name
03. generic_name
04. quantity
05. packaging
06. brands
07. categories
08. origins
09. manufacturing_places
10. labels
11. purchase_places
12. stores
13. countries
14. ingredients_text
15. ingredients_tags
16. ingredients_analysis_tags
17. allergens
18. allergens_en
19. traces
20. traces_tags
21. traces_en
22. serving_size
23. serving_quantity
24. no_nutrition_data
25. additives_n
26. additives
27. additives_tags
28. additives_en
29. nutriscore_score
30. nutriscore_grade
31. nova_group
32. food_groups
33. brand_owner
34. environmental_score_score
35. environmental_score_grade
36. nutrient_levels_tags
37. product_quantity
38. popularity_tags
39. completeness
40. main_category
41. energy-kj_100g
42. energy-kcal_100g
43. energy_100g
44. energy-from-fat_100g
45. fat_100g
46. saturated-fat_100g
47. butyric-acid_100g
48. caproic-acid_100g
49. caprylic-acid_100g
50. capric-acid_100g
51. lauric-acid_100g
52. myristic

In [5]:
from pyspark.sql.functions import col, when, lit, trim  
from functools import reduce

# Construire l'expression de comptage de nulls
null_expr = reduce(
    lambda acc, c: acc + when(col(c).isNull(), 1).otherwise(0),
    df_colonnes_cleaned.columns,
    lit(0)
)

# 4. Filtrer : max nulls ET colonnes clés non nulles et non vides

seuil_nulls = 135  # environ 25 % d'informations valides


df_filtered = df_colonnes_cleaned.withColumn("null_count", null_expr) \
    .filter(
        (col("null_count") <= seuil_nulls) &
        (col("brands").isNotNull()) & (trim(col("brands")) != "") &
        (col("countries").isNotNull()) & (trim(col("countries")) != "") &
        (col("brand_owner").isNotNull()) & (trim(col("brand_owner")) != "")
    ) \
    .drop("null_count")

print(f"✅ Lignes conservées avec au moins {164 - seuil_nulls} colonnes renseignées et les marques/pays: {df_filtered.count():,}")


✅ Lignes conservées avec au moins 29 colonnes renseignées et les marques/pays: 180,041


In [6]:
# 💾 Sauvegarde des données nettoyées pour l'étape suivante (CSV)
import os, shutil

# 1. Prépare le dossier de sortie
folder = os.path.abspath(os.path.join(os.getcwd(), "../data"))
os.makedirs(folder, exist_ok=True)

# 2. Chemin vers le dossier CSV
csv_target = os.path.join(folder, "step2_cleaned_csv")

# 3. Supprime l'ancienne sortie si elle existe
if os.path.exists(csv_target):
    shutil.rmtree(csv_target)
    print("🗑️ Ancienne sortie supprimée :", csv_target)

# 4. Écriture au format CSV avec header
df_filtered \
    .coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv(csv_target)

print("✅ Données nettoyées écrites en CSV dans :", csv_target)


🗑️ Ancienne sortie supprimée : /home/jovyan/work/data/step2_cleaned_csv
✅ Données nettoyées écrites en CSV dans : /home/jovyan/work/data/step2_cleaned_csv
