In [161]:
import numpy as np
import pandas as pd

In [162]:
# Path to the dataset file (tab-separated OpenFoodFacts export).
# A forward slash is used on purpose: the previous "data\en..." literal relied
# on the invalid escape sequence "\e" and only resolved on Windows.
dataset_directory = "data/en.openfoodfacts.org.products.csv"
# Number of rows to read from the dataset (passed as `nrows` when loading)
displayed_rows = 10

In [163]:
# Read only the first `displayed_rows` rows of the tab-separated, UTF-8 file.
df = pd.read_csv(
    dataset_directory,
    sep='\t',
    encoding='utf-8',
    nrows=displayed_rows,
)

In [164]:
def null_rate(data, null_threshold):
    """Return the columns whose null percentage is at least ``null_threshold``.

    Args:
        data: pandas DataFrame to inspect.
        null_threshold: minimum percentage (0-100) of null values a column
            must have to be reported.

    Returns:
        A DataFrame with two columns, ``'Nom_colonne'`` (column name) and
        ``'Taux_Null'`` (percentage of null values), sorted by decreasing
        percentage and restricted to rows meeting the threshold.
    """
    missing_counts = data.isnull().sum()
    missing_pct = (missing_counts / data.shape[0]) * 100
    # reset_index turns the column labels into a regular column
    rate_table = missing_pct.sort_values(ascending=False).reset_index()
    rate_table.columns = ['Nom_colonne', 'Taux_Null']
    meets_threshold = rate_table['Taux_Null'] >= null_threshold
    return rate_table[meets_threshold]

In [165]:
def data_cleaning(dataset, deletion_threshold):
    """Clean a products DataFrame in place and return it.

    Steps, in order:
      1. Replace the placeholder string ``'unknown'`` with NaN so that it is
         counted as missing by the next step.
      2. Drop every column whose percentage of missing values is at least
         ``100 - deletion_threshold``; fully empty columns are always dropped.
      3. Drop duplicated rows based on the ``code`` column, keeping the last
         occurrence (skipped when the frame has no ``code`` column).

    Args:
        dataset: pandas DataFrame to clean. It is modified in place.
        deletion_threshold: fill-rate percentage; columns filled at or below
            this percentage are removed.

    Returns:
        The same DataFrame object, after cleaning.
    """
    # Placeholders must become NaN *before* null rates are computed; otherwise
    # a column made mostly of 'unknown' would look well filled and be kept.
    dataset.replace('unknown', np.nan, inplace=True)

    # A single pass covers both the "entirely empty" and the "below threshold"
    # cases. Capping at 100 keeps fully empty columns removable even when
    # deletion_threshold is negative.
    null_threshold = min(100, 100 - deletion_threshold)
    cols_to_drop = null_rate(dataset, null_threshold)['Nom_colonne']
    dataset.drop(columns=list(cols_to_drop), inplace=True)

    # De-duplicate on the product code; guard against the column being absent
    # (or having just been pruned), which would otherwise raise a KeyError.
    if 'code' in dataset.columns:
        dataset.drop_duplicates(subset='code', keep='last', inplace=True)
    return dataset

In [166]:
# Clean `df` in place: the first argument is the dataset, the second is the
# acceptable fill percentage per column (columns filled at or below it are dropped).
data_cleaning(dataset=df, deletion_threshold=25)

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,product_name,countries,...,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,sodium_100g
0,225,http://world-en.openfoodfacts.org/product/0000...,nutrinet-sante,1623855208,2021-06-16T14:53:28Z,1623855209,2021-06-16T14:53:29Z,nutrinet-sante,jeunes pousses,en:france,...,,,,,,,,,,
1,207025004,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1656948610,2022-07-04T15:30:10Z,1656948613,2022-07-04T15:30:13Z,kiliweb,Andrè,en:de,...,https://images.openfoodfacts.org/images/produc...,165.0,690.0,2.0,2.0,65.0,12.6,1.5,,
2,3429145,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1630483911,2021-09-01T08:11:51Z,1656488106,2022-06-29T07:35:06Z,stephane,L.casei,Spain,...,https://images.openfoodfacts.org/images/produc...,,,1.4,0.9,9.8,9.8,2.7,0.1,0.04
3,26772226,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1654250311,2022-06-03T09:58:31Z,1654270474,2022-06-03T15:34:34Z,quentinbrd,Skyr,France,...,https://images.openfoodfacts.org/images/produc...,57.0,238.0,0.2,0.1,3.9,3.9,10.0,0.09,0.036
4,17,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1529059080,2018-06-15T10:38:00Z,1561463718,2019-06-25T11:55:18Z,kiliweb,Vitória crackers,France,...,,375.0,1569.0,7.0,3.08,70.1,15.0,7.8,1.4,0.56
5,31,http://world-en.openfoodfacts.org/product/0000...,isagoofy,1539464774,2018-10-13T21:06:14Z,1539464817,2018-10-13T21:06:57Z,isagoofy,Cacao,France,...,,,,,,,,,,
6,3327986,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1574175736,2019-11-19T15:02:16Z,1624390765,2021-06-22T19:39:25Z,sylvariane,Filetes de pollo empanado,Espagne,...,https://images.openfoodfacts.org/images/produc...,163.9,685.8,1.9,1.0,,,15.3,1.1,0.44
7,4128579,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1649449251,2022-04-08T20:20:51Z,1649449920,2022-04-08T20:32:00Z,roboto-app,Burger Meat Pollo,en:es,...,https://images.openfoodfacts.org/images/produc...,194.0,812.0,11.0,3.9,5.7,0.05,18.0,,
8,4622327,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1619501895,2021-04-27T05:38:15Z,1619501897,2021-04-27T05:38:17Z,kiliweb,Hamburguesas de ternera 100%,en:es,...,https://images.openfoodfacts.org/images/produc...,874.9,3661.0,15.1,6.1,2.6,1.0,15.7,2.1,0.84
9,6021,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1673478017,2023-01-11T23:00:17Z,1673478020,2023-01-11T23:00:20Z,kiliweb,Blueberry Cobbler Coffee,en:us,...,https://images.openfoodfacts.org/images/produc...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
