# Nettoyage des données

In [70]:
#fonction qui import toutes les librairies nécessaires 

def import_librairies():
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    import plotly.express as px

In [71]:
#fonction qui importe un fichier csv et renvoie un dataframe

def importer_csv(nom_fichier, chemin):
    chemin_complet = chemin  + nom_fichier 
    dataframe = pd.read_csv(chemin_complet, sep='\t')
    return dataframe

In [72]:
#fonction qui supprime les colonnes vides et des lignes doublons

def supprimer_col_vides_doublons(df):
    df.dropna(axis=1, how='all', inplace=True)
    df.drop_duplicates()
    
    return df

In [73]:
def suppression_col_correlees(df, seuil_correlation):
    #création d'un dataframe quantitatif
    df_quantitatif = pd.DataFrame()

    for colonne in df.columns:
        if pd.api.types.is_numeric_dtype(df[colonne]):
            df_quantitatif[colonne] = df[colonne]
            
    # Calculer la matrice de corrélation
    matrice_correlation = df_quantitatif.corr()

    # Identifier les paires de colonnes corrélées
    colonnes_correlees = set()
    for i in range(len(matrice_correlation.columns)):
        for j in range(i):
            if abs(matrice_correlation.iloc[i, j]) >= seuil_correlation:
                colonne_i = matrice_correlation.columns[i]
                colonne_j = matrice_correlation.columns[j]
                colonnes_correlees.add((colonne_i, colonne_j))

     # Supprimer la première colonne de chaque paire corrélée
    for colonne1, colonne2 in colonnes_correlees:
        df.drop(columns=colonne1, inplace=True)
    
    return df

In [74]:
#fonction qui ne garde que les colonnes pertinentes

def selection_colonnes(df):
    #creation de la liste des variables, on y insérant des variables utiles vis à vis de l'analyse
    variables = ['product_name', 'brands','pnns_group_1', 'pnns_group_2', 'main_category_fr', 'nutrition_grade_fr']
    
    #liste de toutes les colonnes en rapport avec les valeurs nutritionnelles
    val_nutri = [col for col in df.columns if col.endswith('_100g')]
    
    #suppression de l'empreinte carbone qui n'affecte pas la valeur nutritionelle, ainsi que le nutriscore UK
    try:
        val_nutri.remove('carbon-footprint_100g')
    except KeyError:
        print('colonne inexistente')
    try:
        val_nutri.remove('nutrition-score-uk_100g')
    except KeyError:
        print('colonne inexistente')
        
    variables = variables + val_nutri
    
    #ne garder que ces variables dans le dataframe
    df = df[variables]
    
    #liste des colonnes avec +60% de valeurs nulles
    colonnes_a_supprimer = df.columns[round(df.isna().mean()*100,1)> 60]
    
    #suppression de ces colonnes vides ou presque
    df = df.drop(columns=colonnes_a_supprimer)
    
    #suppression d'une colonne si +0.85 de corrélation avec une autre
    df = suppression_col_correlees(df, 0.85)
    
    return df



In [76]:
def remplacer_na_100g_mediane(df, liste_colonnes):
    
    myDict =  {i: df[liste_colonnes][df['main_category_fr'] == i].mean() for i in df['main_category_fr'].unique()}
    
    for i in df4['main_category_fr'].unique():
        df.loc[df['main_category_fr']== i, liste_colonnes] = 
        df[df['main_category_fr']== i ][liste_colonnes].fillna(df[df['main_category_fr'] == i][liste_colonnes].median()) 
    
    return df

In [77]:
def gerer_valeurs_manquantes(df):
    
    #suppression des lignes où le nom du produit n'est pas renseigné
    df = df.dropna(subset=['product_name'])
    
    col_100g = [col for col in df.columns if col.endswith('_100g')]
    
    #remplacer des Nan par le mode de la colonne
    df = remplacer_na_mode(df, numeriques_moins_50)
    
    #remplacer des Nan par la médiane de la colonne
    df = remplacer_na_mediane(df, numeriques_plus_50)
    
    return df

In [78]:
def valeurs_aberrantes(df):
    df = df[(df['nutrition-score-fr_100g'] >= -15) & (df['nutrition-score-fr_100g'] <= 40)]
    
    df = df[(df['energy_100g'] < 4000)]
    
    # Sélectionner les colonnes finissant par "100g" sauf "energy_100g"
    colonnes_a_selectionner = [col for col in df.columns if col.endswith('100g') and col != 'energy_100g' 
                               and col != 'nutrition-score-fr_100g']
    
    df_val_nutri = df[colonnes_a_selectionner]
    
    mask = ((df_val_nutri > 100) | (df_val_nutri < 0)).any(axis=1)
    
    df.drop(index=df[mask].index, inplace=True)
    return df

In [82]:
#défnition variables

chemin = ''
fichier = 'fr.openfoodfacts.org.products.csv'
nom_csv_export = 'sante_publique.csv'

In [83]:
def main():
    import_librairies()
    df = importer_csv(fichier, chemin)
    df = supprimer_col_vides_doublons(df)
    df = selection_colonnes(df)
    df = gerer_valeurs_manquantes(df)
    df = valeurs_aberrantes(df)
    print(df.shape)
    
    # Exporter le DataFrame dans un fichier CSV
    df.to_csv(chemin + nom_csv_export, index=False)

## Main

In [84]:
main()


Columns (0,3,5,19,20,24,25,26,27,28,35,36,37,38,39,48) have mixed types. Specify dtype option on import or set low_memory=False.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(302648, 16)


# Analyse

In [17]:
import_librairies()

In [18]:
data = pd.read_csv('sante_publique.csv', sep=",")

In [19]:
data.head()

Unnamed: 0,product_name,energy_100g,fat_100g,saturated-fat_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,vitamin-a_100g,vitamin-c_100g,calcium_100g,iron_100g,nutrition-score-fr_100g
0,Farine de blé noir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035,0.00101,0.0
1,Banana Chips Sweetened (Whole),2243.0,28.57,28.57,0.0,0.018,64.29,14.29,3.6,3.57,0.0,0.0,0.0214,0.0,0.00129,14.0
2,Peanuts,1941.0,17.86,0.0,0.0,0.0,60.71,17.86,7.1,17.86,0.635,0.0,0.0,0.071,0.00129,0.0
3,Organic Salted Nut Mix,2540.0,57.14,5.36,0.0,0.0,17.86,3.57,7.1,17.86,1.22428,0.0,0.0,0.143,0.00514,12.0
4,Organic Polenta,1552.0,1.43,0.0,0.0,0.0,77.14,0.0,5.7,8.57,0.0,0.0,0.0,0.035,0.00101,0.0


In [20]:
data.shape

(302648, 16)

In [None]:
myDict =  {i: df4[col_100g][df4['main_category_fr'] == i].mean() for i in df4['main_category_fr'].unique()}

In [None]:
for i in df4['main_category_fr'].unique():
    
    df4.loc[df4['main_category_fr']== i, col_100g] = df4[df4['main_category_fr']== i ][col_100g].fillna(df4[df4['main_category_fr'] == i][col_100g].mean()) 