# Extraction des données ( via le package installé via "pip install pyspark")

# **Prérequis :
### Installer pip install pyspark via la gestion des dépendances de kaggle
### Attendre la fin du chargement du csv
### ???
### Profit !

### Objectif :
# Trier les données afin de récupérer celles qui nous interessent.

In [1]:
!pip install pyspark
import time  
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when
 start_time = time.time()
print("Starting the script...")

# Initialize a SparkSession with reduced logs
spark = SparkSession.builder \
    .appName("OpenFoodFacts Exploration") \
    .config("spark.sql.shuffle.partitions", "8") \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")  # Réduit les logs

print("PySpark initialisé avec succès !")

try:
   # Reducing logs
    file_path_csv = "data/en.openfoodfacts.org.products.csv"
    file_path_parquet = "en.openfoodfacts.org.products.parquet"

    # Load the CSV as a Spark DataFrame
    df = spark.read.csv(file_path_csv, header=True, inferSchema=True, sep="\t")
    print("Fichier CSV chargé.")

    # Save the DataFrame in Parquet format
    df.write.parquet(file_path_parquet, mode="overwrite")
    print("Données sauvegardées au format Parquet.")

    # Load the Parquet file for future analysis
    df_parquet = spark.read.parquet(file_path_parquet)
    print("Fichier Parquet chargé.")

    # Check the schema to validate the conversion
    #df_parquet.printSchema()

finally:
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Execution time: {elapsed_time:.2f} seconds")
    


Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840629 sha256=088da72cb3d04398537023cd5d4a0c237393041219f62404037665761b0f350b
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3
Starting the script...


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/02 13:17:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


PySpark initialisé avec succès !


                                                                                

Fichier CSV chargé.


                                                                                

Données sauvegardées au format Parquet.
Fichier Parquet chargé.
Execution time: 272.73 seconds


# Preliminary Analysis
### 1. Highlight the number of columns, rows, and the list of column names



In [2]:
start_time = time.time()

# printSchema
df_parquet.printSchema()
print(" ")
df_parquet.show(20, truncate=False)
# Print number of columns
print(f"Number of columns: {len(df_parquet.columns)}")
print(" ")

# Print number of rows
print(f"Number of rows: {df_parquet.count()}")
print(" ")

# Calculate and display elapsed time
elapsed_time = time.time() - start_time
print("Loading completed.")
print(f"Total execution time: {elapsed_time:.2f} seconds")


root
 |-- code: double (nullable = true)
 |-- url: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- created_t: integer (nullable = true)
 |-- created_datetime: timestamp (nullable = true)
 |-- last_modified_t: integer (nullable = true)
 |-- last_modified_datetime: timestamp (nullable = true)
 |-- last_modified_by: string (nullable = true)
 |-- last_updated_t: integer (nullable = true)
 |-- last_updated_datetime: timestamp (nullable = true)
 |-- product_name: string (nullable = true)
 |-- abbreviated_product_name: string (nullable = true)
 |-- generic_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- packaging: string (nullable = true)
 |-- packaging_tags: string (nullable = true)
 |-- packaging_en: string (nullable = true)
 |-- packaging_text: string (nullable = true)
 |-- brands: string (nullable = true)
 |-- brands_tags: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- categories_tags: string (nullable = true)
 |-- 

                                                                                

+-----------------+----------------------------------------------------------------------------------------------------------------------+--------------------------+----------+-------------------+---------------+----------------------+----------------+--------------+---------------------+---------------------------------------------------------+------------------------+--------------------------------------------------------------------------------------------------------+---------+-----------------------------------------------------------------+--------------------------------------------------------------------+--------------------------------------------------+--------------+------------------------------+---------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------



Number of rows: 3513425
 
Loading completed.
Total execution time: 3.22 seconds


                                                                                

### .2 Handling Missing Values


In [3]:
from pyspark.sql.functions import col, count, when

start_time = time.time()

# Calculate missing data percentage for each column
total_rows = df_parquet.count() 
missing_data = (
    df_parquet.select([
        (count(when(col(c).isNull() | (col(c) == ""), c)) / total_rows).alias(c)
        for c in df_parquet.columns
    ])
)

# Transform columns into rows (melt operation)
missing_data_melted = missing_data.selectExpr(
    "stack({0}, {1}) as (Column, MissingPercentage)".format(
        len(df_parquet.columns),
        ", ".join([f"'{col}', `{col}`" for col in df_parquet.columns])
    )
).filter(col("MissingPercentage").isNotNull()).orderBy(col("MissingPercentage").desc())

# Identify columns with 100% missing data
columns_to_drop = (
    missing_data_melted.filter(col("MissingPercentage") == 1.0)
    .select("Column")
    .rdd.flatMap(lambda x: x)
    .collect()
)

# Drop columns with 100% missing values
df_cleaned = df_parquet.drop(*columns_to_drop)

# Display the top 10 columns with the highest missing percentages
print("Top 10 columns with the highest missing percentages:")
missing_data_melted.show(10, truncate=False)

# Print dropped columns
print(f"Columns dropped due to 100% missing values: {columns_to_drop}")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution completed in {elapsed_time:.2f} seconds")


                                                                                

Top 10 columns with the highest missing percentages:




+-----------------------+------------------+
|Column                 |MissingPercentage |
+-----------------------+------------------+
|cities                 |1.0               |
|allergens_en           |1.0               |
|additives              |0.9999991461323352|
|nutrition-score-uk_100g|0.9999991461323352|
|elaidic-acid_100g      |0.9999980076421155|
|glycemic-index_100g    |0.9999977230195607|
|chlorophyl_100g        |0.9999974383970057|
|erucic-acid_100g       |0.9999968691518959|
|water-hardness_100g    |0.999996584529341 |
|caproic-acid_100g      |0.9999954460391214|
+-----------------------+------------------+
only showing top 10 rows

Columns dropped due to 100% missing values: ['cities', 'allergens_en']
Execution completed in 116.24 seconds


                                                                                

### 3. Handling Duplicates

In [4]:
from pyspark.sql.functions import col, count

start_time = time.time()

# Analyzing Duplicates in 'code', 'product_name', and 'brands'
duplicates = (
    df_parquet.groupBy("code", "product_name", "brands")
    .count()
    .filter(col("count") > 1)
)

# Affiche le nombre de doublons identifiés
print(f"There are {duplicates.count()} duplicate rows based on 'code', 'product_name', and 'brands'.")
duplicates.show(truncate=False)

# Remove duplicates where 'code', 'product_name', and 'brands' are the same
df_cleaned = df_parquet.dropDuplicates(["code", "product_name", "brands"])

print(f"Number of rows after removing duplicates: {df_cleaned.count()}")

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution completed in {elapsed_time:.2f} seconds")


                                                                                

There are 1968 duplicate rows based on 'code', 'product_name', and 'brands'.


                                                                                

+-----------------+------------------------------------------------------------+-----------------------------+-----+
|code             |product_name                                                |brands                       |count|
+-----------------+------------------------------------------------------------+-----------------------------+-----+
|3.245413820389E12|Sauce hollandaise                                           |Carrefour                    |2    |
|3.24541503968E12 |Ile de Beauté Rosé                                          |Réserve de Padulone,Carrefour|2    |
|3.245415141093E12|320G Beignet Chocolat X 4 Blis                              |Carrefour                    |2    |
|3.245677723693E12|Espresso Auchan Bio                                         |Auchan                       |2    |
|3.250390058991E12|Printiligne                                                 |Paturages                    |2    |
|3.250390109808E12|Chips nature                                 



Number of rows after removing duplicates: 3511425
Execution completed in 21.10 seconds


                                                                                

### 4. Handle outliers

In [5]:
from pyspark.sql.functions import regexp_extract

df_parquet = df_parquet.withColumn(
    "quantity_numeric",
    regexp_extract(col("quantity"), r"(\d+)", 1).cast("double")
)

numeric_columns = [
    field.name for field in df_parquet.schema.fields 
    if str(field.dataType) in ["IntegerType", "DoubleType", "FloatType"]
]
print(f"Numeric columns detected: {numeric_columns}")

if not numeric_columns:
    print("No numeric columns found. Please check your data.")
else:
    # Boucle sur les colonnes numériques pour détecter les outliers
    for column in numeric_columns:
        try:
            quantiles = df_parquet.approxQuantile(column, [0.25, 0.75], 0.1)
            if len(quantiles) < 2:
                print(f"Column '{column}' has insufficient data. Skipping...")
                continue
            
            q1, q3 = quantiles
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            print(f"Column: {column}")
            print(f"Q1: {q1}, Q3: {q3}, IQR: {iqr}")
            print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
            
            outliers = df_parquet.filter((col(column) < lower_bound) | (col(column) > upper_bound))
            print(f"Outliers detected in '{column}': {outliers.count()}")

        except Exception as e:
            print(f"Error processing column '{column}': {e}")


Numeric columns detected: []
No numeric columns found. Please check your data.


# Data cleaning


In [6]:
df_parquet.describe()


In [None]:
selected_column = [
    'code',
    'product_name',
    'brands',
    'categories',
    "main_category",
    'quantity',
    'packaging',
    'countries',
    'ingredients_text',
    'allergens',
    'serving_size',
    'energy-kcal_100g',
    "energy-from-fat_100g",
    'fat_100g',
    'saturated-fat_100g',
    "proteins_100g",
    'sugars_100g',
    'salt_100g',
    'nutriscore_score',
    'nutriscore_grade',
    "food_groups_en",
]

df_transformed = df_parquet.select(selected_column)
df_transformed.show(5, truncate=False)

In [None]:
# convertir les donnés string en integer
column_to_convert = ["quantity", "nutriscore_score", "energy-kcal_100g", "energy-from-fat_100g",
                     "fat_100g", "saturated-fat_100g", "proteins_100g", "sugars_100g", "salt_100g"]
# apply the conversion
for column in column_to_convert:
    df_transformed = df_transformed.withColumn(column, col(column).cast("double"))
df_transformed.printSchema()

In [None]:
# convert code in string
df_transformed = df_transformed.withColumn("code", col("code").cast("string"))

In [None]:
df_transformed.show(5, truncate=False)

# Transformation des données Transform :
Ajouter des colonnes calculées, par exemple : Indice de qualité nutritionnelle 
Calculer un score basé sur les nutriments (e.g., sodium, sugar, fiber). 
Extraire la catégorie principale d'un produit (e.g., "boissons", "snacks"). 
Regrouper les données par catégories (categories) pour analyser les tendances (e.g., moyenne des calories par catégorie).

--> Quel calcules effectuer ?  
--> Quel catégories créer ?


In [7]:
print("Transformation")

Transformation


# Analyse exploratoire :
Utiliser des fonctions de calcul sur fenêtre pour : 
Trouver les produits les plus caloriques par catégorie. 
Identifier les tendances de production par brands (marques). 
Générer des statistiques descriptives (e.g., médiane, moyenne des nutriments par catégorie

In [8]:
print("Exploration")

Exploration


# Sauvegarde des données Save :
Partitionner les données par catégories (categories) et années (year). 
Sauvegarder les résultats transformés en format Parquet avec compression Snappy. 
Sauvegarder les résultats transformés dans les bases de données: postgresql/sqlserver/mysql/Snowflake/BigQuery

In [9]:
print("Sauvegarde des données (load)")

Sauvegarde des données (load)




# Présentation des résultats :
Visualiser les résultats sous forme de graphiques ou tableaux 
(les étudiants peuvent utiliser un outil comme Jupyter Notebook en local ou Google Colab 

In [10]:
print("Présentation des données")

Présentation des données
