# Notebook for data cleaning tests

In [1]:
import requests
from io import BytesIO
import pandas as pd

pd.set_option('display.max_columns', None) # Option to force pandas to print every columns (useful for data exploration)

# Web archive URL to keep prevent URL change
DATA_URL = "https://web.archive.org/web/20240423194012/https://naehrwertdaten.ch/wp-content/uploads/2023/08/Base_de_donnees_suisse_des_valeurs_nutritives.xlsx"

def fetch_data(URL):
    response = requests.get(DATA_URL, stream=True)
    response.raise_for_status()
    data = BytesIO(response.content)
    return data 

def clean_data(df):
    df = df.drop_duplicates()
    df = df.drop(columns=["ID V 4.0", "ID SwissFIR", "Densité", "Entrée modifiée"])
    df = df.drop(df.filter(regex=r'^Source.*').columns, axis=1)
    df = df.drop(df.filter(regex=r'^Dérivation de la valeur.*').columns, axis=1)
    df = df.drop(df.filter(regex=r'^Activité de *').columns, axis=1)

    double_parenthesis_pattern = r'\s*\([^)]+\)(?=\s*\([^)]+\))'
    df.columns = df.columns.str.replace(double_parenthesis_pattern, '', regex=True)
    
    df.columns = [col.strip().replace(',', '').replace(' ', '_').lower() for col in df.columns]
    return df

raw_data = pd.read_excel("swiss_data.xlsx", engine = "openpyxl", skiprows=2)
data = clean_data(raw_data)
data.head()

Unnamed: 0,id,nom,synonymes,catégorie,unité_de_matrice,énergie_kilojoules_(kj),énergie_calories_(kcal),lipides_totaux_(g),acides_gras_saturés_(g),acides_gras_mono-insaturés_(g),acides_gras_poly-insaturés_(g),cholestérol_(mg),glucides_disponibles_(g),sucres_(g),amidon_(g),fibres_alimentaires_(g),protéines_(g),sel_(g),alcool_(g),eau_(g),rétinol_(µg),bétacarotène_(µg),vitamine_b1_(mg),vitamine_b2_(mg),vitamine_b6_(mg),vitamine_b12_(µg),niacine_(mg),folate_(µg),acide_pantothénique_(mg),vitamine_c_(mg),vitamine_d_(µg),vitamine_e_(mg),potassium_(mg),sodium_(mg),chlore_(mg),calcium_(mg),magnésium_(mg),phosphore_(mg),fer_(mg),iode_(µg),zinc__(mg),sélénium_(µg)
0,621,"Abricot, au sirop, en conserve, égoutté",,Fruits/Fruits cuits (conserves comprises),par 100 g de portion comestible,264,62,0.1,0.0,0.1,0.0,0,13.7,13.3,0,1.4,0.9,0.0,0,83.0,0,925,0.02,tr.,0.02,0,0.11,6.9,0.12,2.2,0,0.45,140,9.7,4.6,18,4.9,11,0.1,0.5,0.1,0.2
1,14097,"Abricot, avec édulcorant, en conserve, égoutté",,Fruits/Fruits cuits (conserves comprises),par 100 g de portion comestible,171,40,0.1,0.0,0.1,0.0,0,8.2,6.1,0,1.4,0.9,0.0,0,89.4,0,925,0.02,0.02,0.05,0,0.34,3.2,0.16,1.6,0,0.57,180,0,1.1,16,7.6,20,0.2,0.6,0.1,0.1
2,379,"Abricot, cru",,Fruits/Fruits frais,par 100 g de portion comestible,185,44,0.1,0.0,0.1,0.0,0,9.0,6.7,tr.,1.7,0.8,0.0,0,87.1,0,2243,0.02,0.02,0.06,0,0.38,5.6,0.18,3.6,0,0.5,260,tr.,1.0,15,8.4,22,0.2,0.5,0.1,0.1
3,1051,"Abricot, cuit, égoutté (sans adjonction de su...",,Fruits/Fruits cuits (conserves comprises),par 100 g de portion comestible,189,45,0.2,0.0,0.1,0.0,0,9.0,6.7,tr.,1.7,1.0,0.0,0,88.2,0,2103,0.02,0.02,0.05,0,0.38,3.5,0.18,1.8,0,0.63,200,tr.,1.3,18,8.4,22,0.2,0.6,0.1,0.1
4,469,"Abricot, sec",,Fruits/Fruits secs,par 100 g de portion comestible,1010,239,0.5,0.2,0.1,0.2,0,59.1,34.3,<0.4,8.3,2.9,0.0,0,24.7,0,2525,0.01,0.04,0.15,0,2.69,8.9,0.58,0.2,0,3.57,1370,11,5.4,82,50.0,110,1.4,2.7,0.4,0.4
