# Présentation du cas

# Import des librairies

In [5]:
import pandas as pd
import numpy as np
import os
import sys
import tqdm as tqdm
import glob
import string
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import warnings
from pandas_profiling import ProfileReport
from sklearn.impute import KNNImputer
warnings.filterwarnings('ignore')

# Téléchargement des données

In [6]:
# Read the Open Food Facts search export (path is relative to the notebook's
# working directory) and print a structural summary of the resulting frame.
data=pd.read_csv("../../data/sante/openfoodfacts_search.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9937 entries, 0 to 9936
Columns: 174 entries, code to carnitine_100g
dtypes: float64(123), int64(3), object(48)
memory usage: 13.2+ MB


# Data format

In [7]:
# Preview the first 100 column labels of the raw frame.
data.columns[:100]

Index(['code', 'url', 'creator', 'created_t', 'last_modified_t',
       'product_name', 'abbreviated_product_name', 'generic_name', 'quantity',
       'packaging', 'packaging_tags', 'packaging_text', 'brands',
       'brands_tags', 'categories', 'categories_tags', 'origins',
       'origins_tags', 'manufacturing_places', 'manufacturing_places_tags',
       'labels', 'labels_tags', 'emb_codes', 'emb_codes_tags', 'cities',
       'cities_tags', 'purchase_places', 'stores', 'countries',
       'ingredients_text', 'allergens', 'allergens_tags', 'traces',
       'traces_tags', 'serving_size', 'serving_quantity', 'no_nutriments',
       'additives_n', 'additives_tags', 'ingredients_from_palm_oil_n',
       'ingredients_from_palm_oil', 'ingredients_from_palm_oil_tags',
       'ingredients_that_may_be_from_palm_oil_n',
       'ingredients_that_may_be_from_palm_oil',
       'ingredients_that_may_be_from_palm_oil_tags', 'nutriscore_score',
       'nutriscore_grade', 'nova_group', 'pnns_groups_1'

In [61]:
# Number of rows whose "nutriscore_score" value is missing.
data["nutriscore_score"].isna().sum()

2125

In [41]:
# Flatten the comma-separated "categories" entries into one list of items and
# count them. Each entry is stringified first, so a missing entry contributes
# a single "nan" item to the total.
temp = [item for entry in data["categories"] for item in str(entry).split(",")]
len(temp)

68765

In [8]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

In [13]:
# (rows, columns) of the raw frame.
data.shape

(9937, 174)

In [6]:
# Missing-value count for every column.
data.isna().sum()

code                     0
url                      0
creator                  0
created_t                0
last_modified_t          0
                      ... 
choline_100g          9935
phylloquinone_100g    9937
beta-glucan_100g      9936
inositol_100g         9935
carnitine_100g        9937
Length: 174, dtype: int64

# Data cleaning

In [87]:
from sklearn.preprocessing import MinMaxScaler

def find_category_mappings(df, variable):
    """Build a label -> integer code mapping for one column.

    Codes start at 0 and follow the order in which the distinct non-missing
    values first appear in ``df[variable]``. Missing values get no code.
    """
    distinct_labels = df[variable].dropna().unique()
    return dict(zip(distinct_labels, range(len(distinct_labels))))

def integer_encode(df , variable, ordinal_mapping):
    """Replace the values of ``df[variable]`` with their codes, in place.

    Values that have no entry in ``ordinal_mapping`` become NaN. The frame is
    modified directly and nothing is returned.
    """
    encoded_column = df[variable].map(ordinal_mapping)
    df[variable] = encoded_column
    
def imputation(df1 , cols):
    """Impute missing values with KNN, encoding categorical columns first.

    Steps: integer-encode the columns in ``cols``, min-max scale the whole
    frame, run ``KNNImputer`` with its default settings, undo the scaling,
    round every column to integers, then decode ``cols`` back to their labels.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input frame. It is copied, so the caller's frame is left untouched.
        Columns outside ``cols`` are passed to the scaler as-is.
    cols : iterable of str
        Names of the categorical columns to encode before imputation.

    Returns
    -------
    pandas.DataFrame
        A frame with the same index and columns as ``df1`` and no missing
        values. Columns outside ``cols`` are of integer dtype.

    Raises
    ------
    ValueError
        If a column has no observed value at all. KNNImputer would drop such
        a column, so the imputed array could not be mapped back onto the
        original columns.
    """
    cols = list(cols)  # iterated more than once; tolerate one-shot iterables
    df = df1.copy()

    # Encode each categorical column; missing values stay NaN so that they
    # are the ones filled by the imputer.
    mappin = dict()
    for variable in cols:
        mappin[variable] = find_category_mappings(df, variable)
        integer_encode(df, variable, mappin[variable])

    # Fail early with a readable message instead of a shape mismatch later on.
    empty_columns = [name for name in df.columns if df[name].isna().all()]
    if empty_columns:
        raise ValueError(
            "Cannot impute columns without any observed value: "
            + ", ".join(str(name) for name in empty_columns)
        )

    # Min-max scaling followed by KNN imputation, then back to original units.
    mm = MinMaxScaler()
    sca = mm.fit_transform(df)
    knn = KNNImputer().fit_transform(sca)
    # Rebuild the frame rather than writing through df.iloc[:, :], so the
    # result does not depend on the dtypes of the encoded columns.
    df = pd.DataFrame(mm.inverse_transform(knn), index=df.index, columns=df.columns)

    # Imputed values are neighbour averages, so snap them back to integer codes.
    # NOTE(review): this rounds every column, not only those in `cols`;
    # confirm that truncating continuous features is intended.
    df = df.round().astype('int')

    # Decode the categorical columns back to their original labels.
    for variable in cols:
        inv_map = {code: label for label, code in mappin[variable].items()}
        df[variable] = df[variable].map(inv_map)
    return df

In [113]:
def cleaning_data(data,threshold=0.5):
    """Select, encode and impute the columns used for the analysis.

    Steps:
      1. Drop columns whose share of non-null values is below ``threshold``.
      2. Keep the identification columns plus every column whose name
         contains "100g", "tags" or "_score".
      3. Add ``provided_nutriscore`` (1 when ``nutriscore_score`` was present
         in the raw data, 0 otherwise).
      4. One-hot encode ``pnns_groups_1`` and ``pnns_groups_2``.
      5. Fill missing values of the quantitative columns with ``KNNImputer``
         (default settings) and round ``additives_n``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. It is not modified.
    threshold : float, default 0.5
        Minimum fraction of non-null values a column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, carrying the same index as ``data``.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``data`` or was removed
        by the ``threshold`` filter.
    """
    # dropna expects an integer count; taking the ceiling keeps exactly the
    # same cut-off as comparing counts against the fractional product.
    min_non_null = int(np.ceil(data.shape[0] * threshold))
    data = data.dropna(axis=1, thresh=min_non_null)

    base_columns = ["code","product_name","generic_name","pnns_groups_1","pnns_groups_2","additives_n"]
    name_markers = ("100g", "tags", "_score")
    extra_columns = [
        name for name in data.columns
        if name not in base_columns and any(marker in name for marker in name_markers)
    ]
    # Explicit copy: the columns added below must not write into a slice of `data`.
    clean_data = data[base_columns + extra_columns].copy()

    # Record whether the score was given before imputation fills the gaps.
    clean_data["provided_nutriscore"] = clean_data["nutriscore_score"].notna().astype(int)

    dummies = pd.get_dummies(clean_data[["pnns_groups_1","pnns_groups_2"]])
    clean_data[dummies.columns] = dummies

    quant_columns = ["additives_n","nutriscore_score","energy-kcal_100g","energy_100g","fat_100g","saturated-fat_100g","carbohydrates_100g",
                     "sugars_100g","proteins_100g","salt_100g","sodium_100g","nutrition-score-fr_100g"] + list(dummies.columns)
    imputed_values = KNNImputer().fit_transform(clean_data[quant_columns])
    # Reuse the original index: column assignment aligns on index labels, so a
    # fresh RangeIndex would misplace values whenever `data` is not 0..n-1.
    quant_data = pd.DataFrame(imputed_values, columns=quant_columns, index=clean_data.index)
    for name in quant_columns:
        clean_data[name] = quant_data[name]

    # Imputed additive counts are neighbour averages; bring them back to whole numbers.
    clean_data["additives_n"] = clean_data["additives_n"].round()

    return clean_data


# Run the cleaning pipeline, keeping columns that are at least 50% non-null,
# then print the structure of the cleaned frame.
clean_data=cleaning_data(data,threshold=0.5)
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9937 entries, 0 to 9936
Data columns (total 83 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   code                                            9937 non-null   int64  
 1   product_name                                    9915 non-null   object 
 2   generic_name                                    5612 non-null   object 
 3   pnns_groups_1                                   9397 non-null   object 
 4   pnns_groups_2                                   9937 non-null   object 
 5   additives_n                                     9937 non-null   float64
 6   packaging_tags                                  9818 non-null   object 
 7   brands_tags                                     9885 non-null   object 
 8   categories_tags                                 9887 non-null   object 
 9   origins_tags                             

# Saved data

In [114]:
import pickle

# Serialize the cleaned frame to "cleaned_data.pkl" in the current working
# directory. Pickle files should only ever be loaded from trusted sources.
with open("cleaned_data.pkl","wb") as f:
    pickle.dump(clean_data,f)