# Nettoyage des données

## 0. Set-up

### Mise en place de l'environnement

In [29]:
import pandas as pd
import numpy as np

import sqlalchemy
import mysql.connector

import sys
sys.path.insert(0, "/home/apprenant/simplon_project/food_facts")


#On peut ensuite importer le dossier comme un module

from src.d00_utils.mysql_utils import mysql_connect, save_to_mysql
connect = mysql_connect()

In [30]:
food_info = pd.read_csv("/home/apprenant/simplon_project/food_facts/data/foodfacts.tsv", sep= '\t' , low_memory= False)

In [5]:
# on limite les données à 10000 lignes elle sont choisit aléatoirement 
df = food_info.sample(10000) 


In [31]:
df.columns.values

array(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'product_name',
       'generic_name', 'quantity', 'packaging', 'packaging_tags',
       'brands', 'brands_tags', 'categories', 'categories_tags',
       'categories_en', 'origins', 'origins_tags', 'manufacturing_places',
       'manufacturing_places_tags', 'labels', 'labels_tags', 'labels_en',
       'emb_codes', 'emb_codes_tags', 'first_packaging_code_geo',
       'cities', 'cities_tags', 'purchase_places', 'stores', 'countries',
       'countries_tags', 'countries_en', 'ingredients_text', 'allergens',
       'allergens_en', 'traces', 'traces_tags', 'traces_en',
       'serving_size', 'no_nutriments', 'additives_n', 'additives',
       'additives_tags', 'additives_en', 'ingredients_from_palm_oil_n',
       'ingredients_from_palm_oil', 'ingredients_from_palm_oil_tags',
       'ingredients_that_may_be_from_palm_oil_n',
       'ingredients_that_may_be_from_palm_oil',
   

## 1. Sélection des colonnes

J'affiche un échantillno de mes données pour analyser la pertinence des différentes colonnes

In [32]:
print(df.head())
print(df.columns)

            code                                                url  \
0  0000000003087  http://world-en.openfoodfacts.org/product/0000...   
1  0000000004530  http://world-en.openfoodfacts.org/product/0000...   
2  0000000004559  http://world-en.openfoodfacts.org/product/0000...   
3  0000000016087  http://world-en.openfoodfacts.org/product/0000...   
4  0000000016094  http://world-en.openfoodfacts.org/product/0000...   

                      creator   created_t      created_datetime  \
0  openfoodfacts-contributors  1474103866  2016-09-17T09:17:46Z   
1             usda-ndb-import  1489069957  2017-03-09T14:32:37Z   
2             usda-ndb-import  1489069957  2017-03-09T14:32:37Z   
3             usda-ndb-import  1489055731  2017-03-09T10:35:31Z   
4             usda-ndb-import  1489055653  2017-03-09T10:34:13Z   

  last_modified_t last_modified_datetime                    product_name  \
0      1474103893   2016-09-17T09:18:13Z              Farine de blé noir   
1      1489069957 

Je choisis uniquement les colonnes qui pourraient m'être utiles pour mon étude


## 2. Valeurs manquantes

In [33]:
df = df[['product_name','countries','ingredients_text','additives_n','ingredients_from_palm_oil_n','ingredients_that_may_be_from_palm_oil_n','nutrition_grade_fr','states','energy_100g','fat_100g','sugars_100g','fiber_100g','proteins_100g','salt_100g','nutrition-score-fr_100g']]

In [34]:
print(df.isnull().sum()) 
print(df.shape)

product_name                                17512
countries                                     275
ingredients_text                            72134
additives_n                                 72160
ingredients_from_palm_oil_n                 72160
ingredients_that_may_be_from_palm_oil_n     72160
nutrition_grade_fr                         101171
states                                         52
energy_100g                                 60660
fat_100g                                    76530
sugars_100g                                 76841
fiber_100g                                 135344
proteins_100g                               61866
salt_100g                                   66288
nutrition-score-fr_100g                    101171
dtype: int64
(356027, 15)


il me manque entre 1700 et 3900 données dans chaque colonnes sauf pour les colonnes contries , states et product_name ou il y a peu ou tres peu de valeurs manquantes

### Traitement de product_name

on supprime les lignes ou la valeur de la colonne product_name est nulle car le but de ses données nettoyer est de les utiliser pour une application donc il faut que le nom du produit apparait

In [36]:
df.drop(columns ="product_name", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Traitement de ingredients_text

In [38]:
print(df.ingredients_text.unique())
print(df.ingredients_text.value_counts())

[nan
 'Bananas, vegetable oil (coconut oil, corn oil and/or palm oil) sugar, natural banana flavor.'
 'Peanuts, wheat flour, sugar, rice flour, tapioca starch, salt, leavening (ammonium bicarbonate, baking soda), soy sauce (water, soybeans, wheat, salt), potato starch.'
 ...
 "thé vert, arôme naturel bergamote avec autres arômes naturels, écorces d'orange douce 3%"
 "viande d'oie, graisse de canard, épices, sel, poivre."
 'Citric acid, maltodextrin, instant tea, aspartame**, caramel color, natural and artificial flavors, contains less than 2% of: magnesium oxide acesulfame potassium, calcium silicate, yellow 5, red 40.']
Carbonated water, natural flavor.                                                                                                                                                                                                                                                                                                                                                  

on ne remplacera pas les valeurs manquantes , on ne peut pas donner la liste des elements du produit

In [40]:
print(df.proteins_100g.unique())
print(df.proteins_100g.value_counts())

[  nan  3.57 17.86 ... 41.2  70.9  21.22]
0.000     56679
7.140      5709
0.500      4795
25.000     4216
10.000     4151
          ...  
41.380        1
0.752         1
0.225         1
20.490        1
49.700        1
Name: proteins_100g, Length: 2633, dtype: int64


on remplace les valeurs 'nan' par zéro dans les colonnes séléctionnées 

In [41]:
list = ['ingredients_from_palm_oil_n','ingredients_that_may_be_from_palm_oil_n','fat_100g','sugars_100g','fiber_100g','proteins_100g','salt_100g','nutrition-score-fr_100g']
def nan_to_value(df, list , v) : 
    """function to replace nan values by another one in several columns"""     
    d = {value:v for value in list}
    df.fillna(d , inplace = True)
    return df
nan_to_value(df, list , 0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,countries,ingredients_text,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,nutrition_grade_fr,states,energy_100g,fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,nutrition-score-fr_100g
0,en:FR,,,0.0,0.0,,"en:to-be-completed, en:nutrition-facts-to-be-c...",,0.00,0.00,0.0,0.00,0.00000,0.0
1,US,"Bananas, vegetable oil (coconut oil, corn oil ...",0.0,0.0,0.0,d,"en:to-be-completed, en:nutrition-facts-complet...",2243.0,28.57,14.29,3.6,3.57,0.00000,14.0
2,US,"Peanuts, wheat flour, sugar, rice flour, tapio...",0.0,0.0,0.0,b,"en:to-be-completed, en:nutrition-facts-complet...",1941.0,17.86,17.86,7.1,17.86,0.63500,0.0
3,US,"Organic hazelnuts, organic cashews, organic wa...",0.0,0.0,0.0,d,"en:to-be-completed, en:nutrition-facts-complet...",2540.0,57.14,3.57,7.1,17.86,1.22428,12.0
4,US,Organic polenta,0.0,0.0,0.0,,"en:to-be-completed, en:nutrition-facts-complet...",1552.0,1.43,0.00,5.7,8.57,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356022,US,"Organic peppermint, organic lemon grass, organ...",0.0,0.0,0.0,b,"en:to-be-completed, en:nutrition-facts-complet...",0.0,0.00,0.00,0.0,0.00,0.00000,0.0
356023,China,,,0.0,0.0,,"en:to-be-completed, en:nutrition-facts-to-be-c...",,0.00,0.00,0.0,0.00,0.00000,0.0
356024,France,,,0.0,0.0,,"en:to-be-completed, en:nutrition-facts-to-be-c...",,0.00,0.00,0.0,0.00,0.00000,0.0
356025,en:FR,,,0.0,0.0,,"en:to-be-completed, en:nutrition-facts-to-be-c...",,0.00,0.00,0.0,0.00,0.00000,0.0


### Conclusion sur les valeurs manquantes 

J'ai gardé une grande partie des données il reste encore des valeurs manquantes mais celle ci peuvent etre remplit , on s'occupera de ca un peu plus tard

## 3. Traitement des dates

In [42]:
print(df.dtypes)


countries                                   object
ingredients_text                            object
additives_n                                float64
additives                                   object
ingredients_from_palm_oil_n                float64
ingredients_that_may_be_from_palm_oil_n    float64
nutrition_grade_fr                          object
states                                      object
energy_100g                                float64
fat_100g                                   float64
sugars_100g                                float64
fiber_100g                                 float64
proteins_100g                              float64
salt_100g                                  float64
nutrition-score-fr_100g                    float64
dtype: object


Aucune de mes colonnes ne correspond à une date, je n'ai donc pas à faire de traitement

## 4. Traitement des doublons 

In [44]:
print(df.duplicated().value_counts())

False    288429
True      67598
dtype: int64


on peut voir qu'il y a des doublons on va les supprimes

In [46]:
df = df.drop_duplicates(keep= 'first')

on relance la ligne pour verifier que les doublons sont supprimés

In [47]:
print(df.duplicated().value_counts())

False    288429
dtype: int64


Les doulons on bien été supprimés

## 5. Consistance des données 

on verifie que les valeurs soit corects 

In [56]:
def inconsistant_value(dataframe , column):
    """function to check if values in data are inconsistant """
    df = dataframe[column]
    for index,value in enumerate(df) :
        if value < 0 or value > 100 :
            df.drop(index)

def nutri_score(dataframe, column) : 
    """function to check if value in column nutrition_score are isconsistant """
    df = dataframe[column]
    for value in df :
        if value < -15 or value > 40 :
            print(value)

def delete_incosistant_value(dataframe , column):
    """ """
    df = dataframe[column]
    for value in df : 
        if value < 0 or value > 100 :
            df.drop()

verifions les valeurs de chaque colonnes 

In [49]:
inconsistant_value(df,'ingredients_from_palm_oil_n')

pas de valeur incoherante dans la colonne ingredients_from_palm_oil_n

In [50]:
inconsistant_value(df,'ingredients_that_may_be_from_palm_oil_n')

pas de valeur incoherante dans la colonne ingredients_that_may_be_from_palm_oil_n

In [59]:
#delete_incosistant_value(food_info,'fat_100g' )
inconsistant_value(df, 'fat_100g')

15666666000.0
200.0
101.0
105.0


pas de valeur incoherante dans la colonne fat_100g

In [52]:
inconsistant_value(df, 'sugars_100g')

-1.2
-0.8
134.0
-3.57
110.71
-6.67
-6.25
166.67
-17.86
-0.1
145.0
104.0
105.0
103.5
103.5
103.5
103.5
103.0
3520.0
100.8


pas de valeur incoherante dans la colonne sugar_100g

In [53]:
inconsistant_value(df, 'fiber_100g')

-6.7
166.7
400.0
250.0
669.0
990.0
666.0
105.0
84818150000.0
175.0
786.0
178.0
999.0
5380.0


dans la colonne fiber_100g on a une valeur incoherente donc on va supprimer la ligne

In [54]:
inconsistant_value(df, 'proteins_100g')

-3.57
-500.0
-800.0
15666666000.0
150.0
305.0
1476.0
430.0


pas de valeur incoherante dans la colonne sugar_100g

In [55]:
nutri_score(df, 'nutrition-score-fr_100g')

pas de valeur incohérente dans la colonne nutrition_score-fr_100g

on sauvegarde les données sur la base de données

In [61]:
save_to_mysql(db_connect=connect,df_to_save=food_info,df_name='df_clean')

MySQLInterfaceError: MySQL server has gone away

In [62]:
df.to_csv('/home/apprenant/simplon_project/food_facts/data/df_clean.csv' , index = False , sep = ',' , encoding = 'utf-8', line_terminator = '\n' )