# 🥣 01 - Ingestion des données OpenFoodFacts
Ce notebook a pour objectif de lire les données OpenFoodFacts brutes au format `.csv.gz`, de les analyser rapidement et de les convertir en format `Parquet` pour les étapes suivantes du pipeline.

In [1]:
# ⚙️ Installer pyspark si besoin (ex: sur Google Colab)
try:
    import pyspark
except ImportError:
    %pip install pyspark

In [2]:
# 📦 Imports principaux
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os

In [3]:
# 🚀 Création de la SparkSession
spark = SparkSession.builder \
    .appName("OpenFoodFacts Ingestion") \
    .getOrCreate()

In [4]:
# 📥 Lecture du fichier .csv.gz (format TSV)
input_path = "../data/en.openfoodfacts.org.products.csv.gz"

df_raw = spark.read.option("header", True) \
                   .option("sep", "\t") \
                   .option("inferSchema", True) \
                   .csv(input_path)

df_raw.cache()
df_raw.printSchema()


root
 |-- code: double (nullable = true)
 |-- url: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- created_t: integer (nullable = true)
 |-- created_datetime: timestamp (nullable = true)
 |-- last_modified_t: integer (nullable = true)
 |-- last_modified_datetime: timestamp (nullable = true)
 |-- last_modified_by: string (nullable = true)
 |-- last_updated_t: integer (nullable = true)
 |-- last_updated_datetime: timestamp (nullable = true)
 |-- product_name: string (nullable = true)
 |-- abbreviated_product_name: string (nullable = true)
 |-- generic_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- packaging: string (nullable = true)
 |-- packaging_tags: string (nullable = true)
 |-- packaging_en: string (nullable = true)
 |-- packaging_text: string (nullable = true)
 |-- brands: string (nullable = true)
 |-- brands_tags: string (nullable = true)
 |-- brands_en: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- catego

In [5]:
# 🔢 Dimensions du DataFrame
n_rows = df_raw.count()
n_cols = len(df_raw.columns)
print(f"Nombre de lignes: {n_rows:,}")
print(f"Nombre de colonnes: {n_cols}")

Nombre de lignes: 3,904,751
Nombre de colonnes: 209


In [6]:
# 📋 Affichage des noms de colonnes
print("\n🧾 Liste des colonnes :")
for i, col_name in enumerate(df_raw.columns, start=1):
    print(f"{i:02d}. {col_name}")



🧾 Liste des colonnes :
01. code
02. url
03. creator
04. created_t
05. created_datetime
06. last_modified_t
07. last_modified_datetime
08. last_modified_by
09. last_updated_t
10. last_updated_datetime
11. product_name
12. abbreviated_product_name
13. generic_name
14. quantity
15. packaging
16. packaging_tags
17. packaging_en
18. packaging_text
19. brands
20. brands_tags
21. brands_en
22. categories
23. categories_tags
24. categories_en
25. origins
26. origins_tags
27. origins_en
28. manufacturing_places
29. manufacturing_places_tags
30. labels
31. labels_tags
32. labels_en
33. emb_codes
34. emb_codes_tags
35. first_packaging_code_geo
36. cities
37. cities_tags
38. purchase_places
39. stores
40. countries
41. countries_tags
42. countries_en
43. ingredients_text
44. ingredients_tags
45. ingredients_analysis_tags
46. allergens
47. allergens_en
48. traces
49. traces_tags
50. traces_en
51. serving_size
52. serving_quantity
53. no_nutrition_data
54. additives_n
55. additives
56. additives_ta

In [7]:
# 💾 Sauvegarde des données ingérées au format CSV
import os

# Chemin absolu vers le dossier de sortie CSV
output_dir = os.path.abspath(os.path.join(os.getcwd(), "../data/step1_raw_csv"))
os.makedirs(output_dir, exist_ok=True)

# Écriture au format CSV (avec header)
df_raw.write \
.option("header", "true") \
.option("sep", ";")\
.mode("overwrite") \
.csv(output_dir)

print(f"✅ Ingestion CSV terminée dans : {output_dir}")


✅ Ingestion CSV terminée dans : /home/jovyan/work/data/step1_raw_csv
