Pour un meilleur confort de lecture, ouvrez ce notebook avec un outil qui gère les niveaux de titres Markdown.  
Par exemple : l'extension 'Table of Contents' du module 'Nbextensions' de Jupyter.

# Mise en place du notebook

## Import des modules

In [170]:
import numpy as np
import pandas as pd
from os.path import exists

# API openfoodfacts
try:
    import openfoodfacts
except:
    !pip install openfoodfacts
    import openfoodfacts

## Gestion des paramètres d'affichage de Jupyter

In [171]:
# Display limits for pandas dataframes
pd.options.display.max_rows = 100
pd.options.display.max_columns = 200

# Line width used when printing numpy arrays
# (230 on the author's desktop setup, 165 on the laptop setup)
np.set_printoptions(linewidth=165)

# Widen the notebook container so that large outputs use the whole screen width.
# HTML is taken from the public IPython.display module.
from IPython.display import HTML
HTML("<style>.container { width:97% }</style>")

# Prise en main de l'API openfoodfacts

## Tests de l'API

Usage guide :
https://github.com/openfoodfacts/openfoodfacts-python/blob/master/docs/Usage.md

Les facettes sont retournées sous forme de listes de dictionnaires.

Les variables sont décrites à l'adresse suivante :
https://world.openfoodfacts.org/data/data-fields.txt

### Requête d'une facette : retourne les modalités d'une variable

In [172]:
# Fetch the modalities of the "additives" facet, print how many were
# returned, then preview the first five entries.
additives = openfoodfacts.facets.get_additives()
print(len(additives))
additives[:5]

564


[{'id': 'en:e330',
  'known': 1,
  'name': 'E330 - Citric acid',
  'products': 131642,
  'sameAs': ['https://www.wikidata.org/wiki/Q159683'],
  'url': 'https://world.openfoodfacts.org/additive/e330-citric-acid'},
 {'id': 'en:e322',
  'known': 1,
  'name': 'E322 - Lecithins',
  'products': 89093,
  'sameAs': ['https://www.wikidata.org/wiki/Q241595'],
  'url': 'https://world.openfoodfacts.org/additive/e322-lecithins'},
 {'id': 'en:e322i',
  'known': 1,
  'name': 'E322i - Lecithin',
  'products': 81397,
  'sameAs': ['https://www.wikidata.org/wiki/Q241595'],
  'url': 'https://world.openfoodfacts.org/additive/e322i-lecithin'},
 {'id': 'en:e500',
  'known': 1,
  'name': 'E500 - Sodium carbonates',
  'products': 55971,
  'sameAs': ['https://www.wikidata.org/wiki/Q190227'],
  'url': 'https://world.openfoodfacts.org/additive/e500-sodium-carbonates'},
 {'id': 'en:e415',
  'known': 1,
  'name': 'E415 - Xanthan gum',
  'products': 49923,
  'sameAs': ['https://www.wikidata.org/wiki/Q410768'],
  'ur

In [173]:
# Modalities of the "countries" facet; print how many were returned.
countries = openfoodfacts.facets.get_countries()
print(len(countries))

642


In [174]:
# openfoodfacts.facets.get_ingredients()   # left disabled: the API call raised an error

In [175]:
# Modalities of the "languages" facet; print how many were returned.
languages = openfoodfacts.facets.get_languages()
print(len(languages))

138


In [176]:
# Modalities of the "purchase places" facet; print how many were returned.
places = openfoodfacts.facets.get_purchase_places()
print(len(places))

4806


In [177]:
# Modalities of the "stores" facet; print how many were returned.
stores = openfoodfacts.facets.get_stores()
print(len(stores))

6495


In [178]:
# Modalities of the "traces" facet; print how many were returned.
traces = openfoodfacts.facets.get_traces()
print(len(traces))

7772


### Requête de produits répondant à une seule modalité de facette

In [179]:
# Products matching the 'egg' modality of the "trace" facet: print how many
# were returned, then preview the first five records.
products = openfoodfacts.products.get_by_trace('egg')
print(len(products))
products[:5]

24


[{'_id': '3579260039398',
  '_keywords': ['charcuterie',
   'pur',
   'auvernou',
   'made',
   'stick',
   'porcine',
   'in',
   'saucisson',
   'sec',
   'nature',
   'viande',
   'pork',
   'francaise',
   'dot',
   'france',
   'pure',
   'porc',
   'mini',
   'green'],
  'added_countries_tags': [],
  'additives_debug_tags': [],
  'additives_n': 4,
  'additives_old_n': 3,
  'additives_old_tags': ['en:e252', 'en:e250', 'en:e460'],
  'additives_original_tags': ['en:e252', 'en:e250', 'en:e422', 'en:e460'],
  'additives_prev_original_tags': ['en:e252', 'en:e250', 'en:e422', 'en:e460'],
  'additives_tags': ['en:e250', 'en:e252', 'en:e422', 'en:e460'],
  'additives_tags_n': None,
  'allergens': 'en:milk',
  'allergens_from_ingredients': 'lactose',
  'allergens_from_user': '(en) en:milk',
  'allergens_hierarchy': ['en:milk'],
  'allergens_lc': 'en',
  'allergens_tags': ['en:milk'],
  'amino_acids_prev_tags': [],
  'amino_acids_tags': [],
  'brands': 'Auvernou',
  'brands_tags': ['auverno

In [180]:
# Products matching the 'France' modality of the "country" facet: print how
# many were returned, then preview the first five records.
products = openfoodfacts.products.get_by_country('France')
print(len(products))
products[:5]

24


[{'_id': '5470188071102',
  '_keywords': ['alvalle', 'gaspacho'],
  'added_countries_tags': [],
  'allergens': '',
  'allergens_from_ingredients': '',
  'allergens_from_user': '(fr) ',
  'allergens_hierarchy': [],
  'allergens_tags': [],
  'brands': 'Alvalle',
  'brands_tags': ['alvalle'],
  'categories_properties': {},
  'categories_properties_tags': ['all-products',
   'categories-unknown',
   'agribalyse-food-code-unknown',
   'agribalyse-proxy-food-code-unknown',
   'ciqual-food-code-unknown',
   'agribalyse-unknown'],
  'checkers_tags': [],
  'code': '5470188071102',
  'codes_tags': ['code-13',
   '5470188071xxx',
   '547018807xxxx',
   '54701880xxxxx',
   '5470188xxxxxx',
   '547018xxxxxxx',
   '54701xxxxxxxx',
   '5470xxxxxxxxx',
   '547xxxxxxxxxx',
   '54xxxxxxxxxxx',
   '5xxxxxxxxxxxx'],
  'complete': 0,
  'completeness': 0.2,
  'correctors_tags': [],
  'countries': 'en:france',
  'countries_hierarchy': ['en:france'],
  'countries_tags': ['en:france'],
  'created_t': 162245421

In [181]:
# Query the products that list a given ingredient and print their names.
ingredient = 'phosphate de calcium'
products_ingredient = openfoodfacts.products.get_by_ingredient(ingredient)
for product in products_ingredient:
    product_name = product['product_name']
    print(product_name)

nature's bounty women's multivitamin
Velveeta
Mini Fruits
A  Toddler Tout-petits
Almond Dream Almond Milk



### Requête de produits répondant à plusieurs modalités

get_by_facets() : résultat très décevant. La requête ne retourne que 24 produits.

In [182]:
# Products containing egg traces.
# The combined query (egg traces AND country = France) is kept for reference:
# facets = {'trace': 'egg', 'country': 'france'}
facets = {'trace': 'egg'}
products = openfoodfacts.products.get_by_facets(facets)
print(len(products))
# products   # products is a list of dictionaries

24


In [183]:
# Show the full record of the second entry of `products`.
products[1]

{'_id': '00726511300821001676',
 '_keywords': ['punto',
  'comida',
  'verde',
  'de',
  'bonarea',
  'crema',
  'espinaca',
  'preparada',
  'origen',
  'la',
  'vegetal'],
 'added_countries_tags': [],
 'additives_n': 0,
 'additives_old_n': 0,
 'additives_old_tags': [],
 'additives_original_tags': [],
 'additives_tags': [],
 'allergens': 'en:milk,en:nuts,es:Nata',
 'allergens_from_ingredients': 'nata, leche, piñones',
 'allergens_from_user': '(es) Leche,Frutos de cáscara,Nata',
 'allergens_hierarchy': ['en:milk', 'en:nuts', 'es:Nata', 'es:nata'],
 'allergens_lc': 'es',
 'allergens_tags': ['en:milk', 'en:nuts', 'es:nata', 'es:nata'],
 'amino_acids_tags': [],
 'brands': 'Bonarea',
 'brands_tags': ['bonarea'],
 'categories': 'Comidas preparadas de origen vegetal',
 'categories_hierarchy': ['en:plant-based-foods-and-beverages',
  'en:plant-based-foods',
  'en:meals',
  'en:plant-based-meals'],
 'categories_lc': 'es',
 'categories_properties': {},
 'categories_properties_tags': ['all-produ

In [184]:
# Print the name of every product in `products`.
for product in products:
    print(product['product_name'])

Mini sticks nature auvernou
Espinacas a la crema
Nos œufs bio
Haché de Jambon - à l'Emmental
Filet de saumon & pâtes au basilic, sauce citron
Le Moelleux
Le Kebab de Poulet et sa Semoule à l'Orientale
Le Moelleux
Haché de Jambon - à Pôeler
Le surimi râpé
Penne complètes & légumes du soleil
Le Gratin Butternut Pâtes & Poulet et sa sauce crème & emmental
Le Couscous à la Marocaine et ses légumes
Le Délice à la chair de crabe
Le Moelleux
Petites Gambas & tagliatelles au basilic
Sopa De Pollo Y Fideos 70 GR
Homesoy soya milk
Nissin Lámen Galinha Caipira
Panés au colin d'Alaska
Tavoletta cioccolato rosa e melograno
Tavoletta CIOCCOLATO FONDENTE 72%
Cioccolato Fondente con frutti di bosco
Boudins noirs aux Pommes


In [185]:
# Keys of the first product record.
products[0].keys()



In [186]:
# Check the container type of `products`.
type(products)

list

### Requête d'un produit en particulier via le barcode

In [187]:
# Look up a single product by its barcode, print its name and the top-level
# keys of the response, then display the whole response.
barcode = '3036810201280'
product = openfoodfacts.products.get_product(barcode)
product_name = product['product']['product_name']
print(product_name)
top_level_keys = list(product.keys())
print("keys du dictionnaire :", top_level_keys)
product

Dijon Originale
keys du dictionnaire : ['code', 'product', 'status', 'status_verbose']


{'code': '3036810201280',
 'product': {'_id': '3036810201280',
  '_keywords': ['lebensmittel',
   'gewurzmittel',
   'dijon-senfe',
   'unspecified',
   'dijon-senf',
   'saucen',
   'originale',
   'dijon',
   'senfe',
   'maille'],
  'added_countries_tags': [],
  'additives_debug_tags': [],
  'additives_n': 2,
  'additives_old_n': 1,
  'additives_old_tags': ['en:e330'],
  'additives_original_tags': ['en:e330', 'en:e224'],
  'additives_prev_original_tags': ['en:e330', 'en:e224'],
  'additives_tags': ['en:e224', 'en:e330'],
  'allergens': 'en:mustard,de:Kaliummetabisulfit',
  'allergens_from_ingredients': '',
  'allergens_from_user': '(en) en:mustard,de:Kaliummetabisulfit',
  'allergens_hierarchy': ['en:mustard', 'de:Kaliummetabisulfit'],
  'allergens_lc': 'en',
  'allergens_tags': ['en:mustard', 'de:kaliummetabisulfit'],
  'amino_acids_prev_tags': [],
  'amino_acids_tags': [],
  'brands': 'Maille',
  'brands_tags': ['maille'],
  'categories': 'Lebensmittel, Gewürzmittel, Saucen, Senfe

### Search

La méthode search_all() est préférable à search() : cette dernière retourne des méta-données en plus de l'itérable.

##### Méthode search()

In [188]:
# Free-text query through search(); the result is kept in search_result.
query = 'mustard maille'
search_result = openfoodfacts.products.search(query)
# search_result   # uncomment to display the raw result

##### Méthode search_all()

In [189]:
# Same free-text query through search_all(). The loop walks over every
# returned product without processing it (the body is a placeholder).
query = 'mustard maille'
search_result = openfoodfacts.products.search_all(query)
for product in search_result:
    # Uncomment one of the lines below to show each product:
    #     print (product['product_name'])
    #     display(product)
    pass

### Download data

In [190]:
# File format to request from download_data().
file_type = 'csv'
# The download itself is left disabled in this test section:
# openfoodfacts.utils.download_data(file_type)

L'instruction download_data() télécharge le dataset dans le dossier ~/user.

## Conclusion des tests de l'API

Les résultats sont décevants. Les requêtes retournent des données incomplètes.  
La fonction download_data() est utile car elle permet de récupérer le dataset.

# Téléchargement des données openfoodfacts

## Téléchargement du fichier .csv contenant le jeu de données openfoodfacts

In [191]:
# Set to True to force a complete re-download; leave False to reuse the
# CSV file that was already downloaded.
download = False

# A download is always required when the CSV file is not present.
download = download or not exists('en.openfoodfacts.org.products.csv')

if download:
    # Fetch the CSV export of the dataset
    # (author's note: download_data() saves the dataset into the ~/user folder).
    file_type = 'csv'
    openfoodfacts.utils.download_data(file_type)

## Conversion du fichier .csv en dataframe pandas

In [192]:
# Load dataframe df1 either from the CSV file or, to save time, from a pickle.
# Set to True to (re)create the pickle; leave False to load the existing one.
create_pickle = False

# The pickle has to be created when it does not exist yet.
create_pickle = create_pickle or not exists('df_avant_nettoyage.pkl')

if not create_pickle:
    # Fast path: reuse the pickle written by a previous run.
    df1 = pd.read_pickle('df_avant_nettoyage.pkl')
else:
    # Slow path: parse the tab-separated CSV, then cache it as a pickle.
    df1 = pd.read_csv(
        'en.openfoodfacts.org.products.csv',
        sep='\t',
        low_memory=False,
    )
    df1.to_pickle('df_avant_nettoyage.pkl')

## Affichage de quelques lignes du dataframe

In [193]:
# Temporarily raise the display limits while showing the last rows of df1.
preview_options = ('display.max_rows', 400, 'display.max_colwidth', 1000)
with pd.option_context(*preview_options):
    display(df1.tail())
    # Other slices that can be inspected the same way:
    # display(df1.loc[[4]])
    # display(df1.loc[4:5000])

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutriments,additives_n,additives,additives_tags,additives_en,ingredients_from_palm_oil_n,ingredients_from_palm_oil,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_n,ingredients_that_may_be_from_palm_oil,ingredients_that_may_be_from_palm_oil_tags,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,states,states_tags,states_en,brand_owner,ecoscore_score_fr,ecoscore_grade_fr,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-butyric-acid_100g,-caproic-acid_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,-myristic-acid_100g,-palmitic-acid_100g,-stearic-acid_100g,-arachidic-acid_100g,-behenic-acid_100g,-lignoceric-acid_100g,-cerotic-acid_100g,-montanic-acid_100g,-melissic-acid_100g,monounsaturated-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,-alpha-linolenic-acid_100g,-eicosapentaenoic-acid_100g,-docosahexaenoic-acid_100g,omega-6-fat_100g,-linoleic-acid_100g,-arachidonic-acid_100g,-gamma-linolenic-acid_100g,-dihomo-gamma-linolenic-acid_100g,omega-9-fat_100g,-oleic-acid_100g,-elaidic-acid_100g,-gondoic-acid_100g,-mead-acid_100g,-erucic-acid_100g,-nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,-sucrose_1
00g,-glucose_100g,-fructose_100g,-lactose_100g,-maltose_100g,-maltodextrins_100g,starch_100g,polyols_100g,fiber_100g,-soluble-fiber_100g,-insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
1802746,9999999910128,http://world-en.openfoodfacts.org/product/9999999910128/sandwich-club-rillette-poisson-combava,kiliweb,1572513866,2019-10-31T09:24:26Z,1572513866,2019-10-31T09:24:26Z,Sandwich club Rillette poisson combava,,,,,,,,,,,,,,,,,,,,,,,,,,,en:re,en:reunion,Réunion,,,,,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-to-be-completed, en:ingredients-to-be-completed, en:expiration-date-to-be-completed, en:packaging-code-to-be-completed, en:characteristics-to-be-completed, en:categories-to-be-completed, en:brands-to-be-completed, en:packaging-to-be-completed, en:quantity-to-be-completed, en:product-name-completed, en:photos-to-be-validated, en:photos-uploaded","en:to-be-completed,en:nutrition-facts-to-be-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-completed,en:packaging-code-to-be-completed,en:characteristics-to-be-completed,en:categories-to-be-completed,en:brands-to-be-completed,en:packaging-to-be-completed,en:quantity-to-be-completed,en:product-name-completed,en:photos-to-be-validated,en:photos-uploaded","To be completed,Nutrition facts to be completed,Ingredients to be completed,Expiration date to be completed,Packaging code to be completed,Characteristics to be completed,Categories to be completed,Brands to be completed,Packaging to be completed,Quantity to be completed,Product name completed,Photos to be validated,Photos uploaded",,,,,,https://static.openfoodfacts.org/images/products/999/999/991/0128/front_fr.3.400.jpg,https://static.openfoodfacts.org/images/products/999/999/991/0128/front_fr.3.200.jpg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1802747,999999999,http://world-en.openfoodfacts.org/product/999999999/the-noir-bio-darjeeling-pages,nutrinet-sante,1608105503,2020-12-16T07:58:23Z,1608105504,2020-12-16T07:58:24Z,Thé noir BIO Darjeeling,,,,,,,Pagès,pages,,,,,,,,,,,,,,,,,,,en:france,en:france,France,,,,,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-to-be-completed, en:ingredients-to-be-completed, en:expiration-date-to-be-completed, en:packaging-code-to-be-completed, en:characteristics-to-be-completed, en:categories-to-be-completed, en:brands-completed, en:packaging-to-be-completed, en:quantity-to-be-completed, en:product-name-completed, en:photos-to-be-uploaded","en:to-be-completed,en:nutrition-facts-to-be-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-completed,en:packaging-code-to-be-completed,en:characteristics-to-be-completed,en:categories-to-be-completed,en:brands-completed,en:packaging-to-be-completed,en:quantity-to-be-completed,en:product-name-completed,en:photos-to-be-uploaded","To be completed,Nutrition facts to be completed,Ingredients to be completed,Expiration date to be completed,Packaging code to be completed,Characteristics to be completed,Categories to be completed,Brands completed,Packaging to be completed,Quantity to be completed,Product name completed,Photos to be uploaded",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1802748,9999999990397,http://world-en.openfoodfacts.org/product/9999999990397/fati,kiliweb,1581171613,2020-02-08T14:20:13Z,1582644247,2020-02-25T15:24:07Z,Fati,,,440 g,,,,,,,,,,,,,,,,,,,,,,,,en:be,en:belgium,Belgium,,,,,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-completed, en:ingredients-to-be-completed, en:expiration-date-to-be-completed, en:packaging-code-to-be-completed, en:characteristics-to-be-completed, en:categories-to-be-completed, en:brands-to-be-completed, en:packaging-to-be-completed, en:quantity-completed, en:product-name-completed, en:photos-to-be-validated, en:photos-uploaded","en:to-be-completed,en:nutrition-facts-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-completed,en:packaging-code-to-be-completed,en:characteristics-to-be-completed,en:categories-to-be-completed,en:brands-to-be-completed,en:packaging-to-be-completed,en:quantity-completed,en:product-name-completed,en:photos-to-be-validated,en:photos-uploaded","To be completed,Nutrition facts completed,Ingredients to be completed,Expiration date to be completed,Packaging code to be completed,Characteristics to be completed,Categories to be completed,Brands to be completed,Packaging to be completed,Quantity completed,Product name completed,Photos to be validated,Photos uploaded",,,,,,https://static.openfoodfacts.org/images/products/999/999/999/0397/front_fr.3.400.jpg,https://static.openfoodfacts.org/images/products/999/999/999/0397/front_fr.3.200.jpg,,,https://static.openfoodfacts.org/images/products/999/999/999/0397/nutrition_fr.5.400.jpg,https://static.openfoodfacts.org/images/products/999/999/999/0397/nutrition_fr.5.200.jpg,,24.0,100.0,,0.3,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.4,0.6,,,,,,,,,,,,1.6,,,,0.64,0.256,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1802749,9999999999994,http://world-en.openfoodfacts.org/product/9999999999994/light-free-skyr-a-boire,kiliweb,1613129728,2021-02-12T11:35:28Z,1613129730,2021-02-12T11:35:30Z,Light & Free SKYR A BOIRE,,,,,,,,,,,,,,,,,,,,,,,,,,,en:france,en:france,France,,,,,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-completed, en:ingredients-to-be-completed, en:expiration-date-to-be-completed, en:packaging-code-to-be-completed, en:characteristics-to-be-completed, en:origins-to-be-completed, en:categories-to-be-completed, en:brands-to-be-completed, en:packaging-to-be-completed, en:quantity-to-be-completed, en:product-name-completed, en:photos-to-be-validated, en:packaging-photo-to-be-selected, en:nutrition-photo-selected, en:ingredients-photo-to-be-selected, en:front-photo-selected, en:photos-uploaded","en:to-be-completed,en:nutrition-facts-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-completed,en:packaging-code-to-be-completed,en:characteristics-to-be-completed,en:origins-to-be-completed,en:categories-to-be-completed,en:brands-to-be-completed,en:packaging-to-be-completed,en:quantity-to-be-completed,en:product-name-completed,en:photos-to-be-validated,en:packaging-photo-to-be-selected,en:nutrition-photo-selected,en:ingredients-photo-to-be-selected,en:front-photo-selected,en:photos-uploaded","To be completed,Nutrition facts completed,Ingredients to be completed,Expiration date to be completed,Packaging code to be completed,Characteristics to be completed,Origins to be completed,Categories to be completed,Brands to be completed,Packaging to be completed,Quantity to be completed,Product name completed,Photos to be validated,Packaging photo to be selected,Nutrition photo selected,Ingredients photo to be selected,Front photo selected,Photos 
uploaded",,,,,,https://static.openfoodfacts.org/images/products/999/999/999/9994/front_fr.12.400.jpg,https://static.openfoodfacts.org/images/products/999/999/999/9994/front_fr.12.200.jpg,,,https://static.openfoodfacts.org/images/products/999/999/999/9994/nutrition_fr.14.400.jpg,https://static.openfoodfacts.org/images/products/999/999/999/9994/nutrition_fr.14.200.jpg,,0.0,0.0,,0.2,0.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.0,7.8,,,,,,,,,,,,5.5,,,,0.15,0.06,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1802750,999999999999999,http://world-en.openfoodfacts.org/product/999999999999999/signal-toothpaste,openfoodfacts-contributors,1587222660,2020-04-18T15:11:00Z,1605558295,2020-11-16T20:24:55Z,Signal Toothpaste,,,,,,,,,"Non food products, Open Beauty Facts, Toothpaste","en:non-food-products,en:open-beauty-facts,en:toothpaste","Non food products,Open Beauty Facts,Toothpaste",,,,,,,,,,,,,,,,France,en:france,France,,,,,,,,,,,,,,,,,,,,,,,unknown,unknown,"en:to-be-completed, en:nutrition-facts-to-be-completed, en:ingredients-to-be-completed, en:expiration-date-to-be-completed, en:packaging-code-to-be-completed, en:characteristics-to-be-completed, en:categories-completed, en:brands-to-be-completed, en:packaging-to-be-completed, en:quantity-to-be-completed, en:product-name-completed, en:photos-to-be-validated, en:packaging-photo-to-be-selected, en:nutrition-photo-to-be-selected, en:ingredients-photo-to-be-selected, en:front-photo-selected, en:photos-uploaded","en:to-be-completed,en:nutrition-facts-to-be-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-completed,en:packaging-code-to-be-completed,en:characteristics-to-be-completed,en:categories-completed,en:brands-to-be-completed,en:packaging-to-be-completed,en:quantity-to-be-completed,en:product-name-completed,en:photos-to-be-validated,en:packaging-photo-to-be-selected,en:nutrition-photo-to-be-selected,en:ingredients-photo-to-be-selected,en:front-photo-selected,en:photos-uploaded","To be completed,Nutrition facts to be completed,Ingredients to be completed,Expiration date to be completed,Packaging code to be completed,Characteristics to be completed,Categories completed,Brands to be completed,Packaging to be completed,Quantity to be completed,Product name completed,Photos to be validated,Packaging photo to be selected,Nutrition photo to be selected,Ingredients photo to be selected,Front photo selected,Photos 
uploaded",,,,en:toothpaste,Toothpaste,https://static.openfoodfacts.org/images/products/999/999/999/999999/front_en.3.400.jpg,https://static.openfoodfacts.org/images/products/999/999/999/999999/front_en.3.200.jpg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [194]:
# Number of rows in the dataframe
length = df1.shape[0]
print("length =", length)

length = 1802751


In [195]:
# Summary of the dataframe: index range, number of columns, dtypes and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1802751 entries, 0 to 1802750
Columns: 186 entries, code to carnitine_100g
dtypes: float64(123), int64(2), object(61)
memory usage: 2.5+ GB


# Nettoyage de données (suppression des valeurs aberrantes)

La liste suivante contient les variables que nous allons nettoyer, ainsi que les bornes min/max des intervalles en dehors desquels les valeurs sont considérées comme aberrantes.
Les valeurs aberrantes sont remplacées par des NaN.

In [196]:
# Variables to clean. Each entry is [column name, lower bound, upper bound];
# values outside [lower bound, upper bound] are treated as outliers.
liste_variables_pour_nettoyage = [
    # --- main nutrients ---
    ['energy_100g', 0, 3800],  # 3800 kJ corresponds to 100 g of fat
    ['sugars_100g', 0, 100],  # at most 100 g of sugar in 100 g of food
    ['saturated-fat_100g', 0, 100],
    ['fat_100g', 0, 100],
    ['salt_100g', 0, 100],
    ['proteins_100g', 0, 100],
    ['fiber_100g', 0, 100],
    ['cholesterol_100g', 0, 5],  # anything above 5 g / 100 g is an outlier
    ['-lactose_100g', 0, 100],
    # --- score and energy variants ---
    ['nutriscore_score', -15, 40],
    ['energy-kj_100g', 0, 3800],
    ['energy-kcal_100g', 0, 900],
    # --- other fats, carbohydrates, alcohol ---
    ['monounsaturated-fat_100g', 0, 100],
    ['polyunsaturated-fat_100g', 0, 100],
    ['trans-fat_100g', 0, 100],
    ['carbohydrates_100g', 0, 100],
    ['alcohol_100g', 0, 100],
    # --- vitamins and minerals ---
    ['vitamin-a_100g', 0, 0.01],
    ['vitamin-c_100g', 0, 1],
    ['calcium_100g', 0, 100],
    ['iron_100g', 0, 0.1],
    ['sodium_100g', 0, 100],
]

Exécution du nettoyage

In [197]:
# Replace out-of-range values by NaN, column by column, and print a report
# table: bounds, number of non-null values before cleaning, number of values
# replaced by NaN and the corresponding percentage.

# Column widths of the report table, shared by the header and by every row.
largeurs = (26, 11, 11, 33, 25, 11)
entetes = ('Variable', 'Borne min', 'Borne max',
           'Nb de valeurs avant nettoyage', 'Nb de remplacements NaN',
           '%remplacés')

print(*(titre.center(largeur, ' ')
        for titre, largeur in zip(entetes, largeurs)))
print(''.center(123, '-'))

for nom, min_x, max_x in liste_variables_pour_nettoyage:
    count_avant = df1[nom].count()

    # Vectorised filter: keep the values inside [min_x, max_x] (bounds
    # included) and set every other value to NaN. Existing NaN stay NaN.
    df1[nom] = df1[nom].where(df1[nom].between(min_x, max_x))

    count_apres = df1[nom].count()
    nb_remplaces = count_avant - count_apres
    # Guard against a column that has no value at all (division by zero).
    pourcentage = nb_remplaces / count_avant * 100 if count_avant else 0.0

    cellules = (nom, min_x, max_x, count_avant, nb_remplaces,
                "{0:.2f}%".format(pourcentage))
    print(*(str(cellule).center(largeur, ' ')
            for cellule, largeur in zip(cellules, largeurs)))

         Variable           Borne min   Borne max    Nb de valeurs avant nettoyage    Nb de remplacements NaN   %remplacés
---------------------------------------------------------------------------------------------------------------------------
       energy_100g              0          3800                 1431552                         5409              0.38%   
       sugars_100g              0          100                  1405976                          79               0.01%   
    saturated-fat_100g          0          100                  1380084                          45               0.00%   
         fat_100g               0          100                  1423426                         108               0.01%   
        salt_100g               0          100                  1368694                         877               0.06%   
      proteins_100g             0          100                  1424472                          76               0.01%   
        fiber_1

# Sauvegarde du df nettoyé

In [201]:
# Save the cleaned dataframe to a pickle file.
df1.to_pickle('df_après_nettoyage.pkl')