# Source
http://www.valorinutritivi.ch/request?xml=MessageData&xml=MetaData&xsl=Download&lan=it&pageKey=Start

In [1]:
import pandas as pd
from pprint import pprint

In [2]:
# Relative path to the raw Excel workbook that is loaded in the next cell.
path = "../raw/Swiss Food Comp Data V5.3.xlsx"

In [3]:
# Read the workbook with pandas' defaults; the real header row is promoted
# to column labels in the following cell.
df = pd.read_excel(path)

In [4]:
# The row at position 1 of the raw sheet carries the real column labels:
# promote it to the header, then discard the two leading non-data rows.
df.columns = df.iloc[1]
df = df.drop([0, 1])

# Drop the identifier columns and the D/F/E variants of name, synonyms and
# category. The three leading columns that remain are renamed positionally to
# plain "name", "synonyms" and "category".
df = df.drop(columns=[
    'ID', 'ID V 4.0', 'ID SwissFIR',
    'name D', 'synonyms D',
    'name F', 'synonyms F',
    'name E', 'synonyms E',
    'category D', 'category F', 'category E',
])
df.columns = ["name", "synonyms", "category"] + list(df.columns[3:])

# Remove the bookkeeping columns and index the table by food name.
df = df.drop(columns=["record has changed", 'matrix unit', 'value type', 'source'])
df = df.set_index("name")

In [5]:
# Summary of every column; the recorded output reports count/unique/top/freq
# because the columns have not been cast to numeric types yet.
df.describe()

Unnamed: 0,synonyms,category,specific gravity,energy kJ,unit,energy kcal,unit.1,protein,unit.2,alcohol,...,magnesium (Mg),unit.3,phosphorus (P),unit.4,iron (Fe),unit.5,iodide (I),unit.6,zinc (Zn),unit.7
count,96,999,124.0,999,999,999,999,999.0,999,996.0,...,991.0,991,991.0,991,992.0,992,970.0,970,992.0,992
unique,67,118,11.0,584,1,426,1,285.0,1,28.0,...,156.0,1,172.0,1,83.0,1,144.0,1,70.0,1
top,Fettina,Verdure/Verdure cotte (incl. conserve),1.0,1490,kilojoule,30,kilocalorie,0.0,gram,0.0,...,18.0,milligram,110.0,milligram,0.5,milligram,0.0,microgram,0.1,milligram
freq,4,53,60.0,12,999,13,999,36.0,999,951.0,...,46.0,991,40.0,991,72.0,992,71.0,970,91.0,992


In [6]:
# Drop energy in kJ: the columns at positions 3 and 4 (the kJ value and the
# unit column right after it) are removed by position; every other column is
# kept in its original order.
dropped_positions = (3, 4)
kept_positions = [pos for pos in range(len(df.columns)) if pos not in dropped_positions]
df = df.iloc[:, kept_positions]

In [7]:
# Coefficients that convert each source unit to the standard units (kcal, g).
conversion = dict([
    ("kilocalorie", 1),
    ("gram", 1),
    ("milligram", 1e-3),
    ("microgram", 1e-6),
])

In [8]:
# Normalize the data: from position 5 onwards the columns come in
# (value, unit) pairs, so each value column `i` is scaled by the conversion
# coefficient of the unit found in column `i + 1`.
#
# Columns expressed in "equivalent" units cannot be converted with the
# `conversion` table. They are left untouched and their positions are recorded
# in `cols_to_skips` so that a later cell can drop them.
EQUIVALENT_UNITS = (
    "retinol equivalent",
    "beta-carotene equivalent",
    "alpha-tocopherol equivalent",
)

cols_to_skips = []
for i in range(5, len(df.columns), 2):
    # Read the unit column once per pair instead of once per check.
    units = df.iloc[:, i + 1].tolist()
    if any(unit in EQUIVALENT_UNITS for unit in units):
        cols_to_skips.append(i)
        continue
    # A missing unit leaves the value unchanged (factor 1); an unknown unit
    # raises KeyError from the `conversion` lookup.
    df.iloc[:, i] *= [1 if pd.isna(unit) else conversion[unit] for unit in units]

In [9]:
# Drop unit cols: keep the three descriptive columns plus every value column
# (odd positions from 3 on), leaving out the unit column that follows each one.
value_positions = range(3, len(df.columns), 2)
index = [0, 1, 2, *value_positions]
print(index)

# Also discard the value columns that could not be converted to grams.
index = [pos for pos in index if pos not in cols_to_skips]
df = df.iloc[:, index]

[0, 1, 2, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73]


In [10]:
# Drop everything between () and lower-case and strip all the column names,
# e.g. "magnesium (Mg)" becomes "magnesium".
import re

regex = r"(\([^\)]+\))"
# `flags` is passed by keyword: giving `count`/`flags` positionally to re.sub
# is deprecated since Python 3.13. `count` keeps its default of 0 (replace all).
df.columns = [re.sub(regex, "", x, flags=re.MULTILINE).lower().strip() for x in df.columns]

In [11]:
# Measurement columns that are cast to float below. The order matters: a later
# cell re-labels the frame's columns positionally from this list.
tofloat = [
    "energy kcal",
    'protein',
    'alcohol',
    'water',
    'carbohydrates, available',
    'starch',
    'sugars',
    'dietary fibres',
    'fat, total',
    'cholesterol',
    'fatty acids, monounsaturated',
    'fatty acids, saturated',
    'fatty acids, polyunsaturated',
    'beta-carotene',
    'vitamin b1',
    'vitamin b2',
    'vitamin b6',
    'vitamin b12',
    'niacin',
    'folate',
    'pantothenic acid',
    'vitamin c',
    'vitamin d',
    'sodium',
    'potassium',
    'chloride',
    'calcium',
    'magnesium',
    'phosphorus',
    'iron',
    'iodide',
    'zinc',
]
df = df.astype(dict.fromkeys(tofloat, float))

In [12]:
# Remove redundant white space and add units: runs of whitespace in each
# measurement name collapse to a single space, then a " | g" suffix is added
# (energy keeps its own "| kcal" label).
# `flags` is passed by keyword: giving `count`/`flags` positionally to re.sub
# is deprecated since Python 3.13.
labelled_measurements = ["energy | kcal"] + [
    re.sub(r"(\s+)", " ", x, flags=re.MULTILINE).strip() + " | g"
    for x in tofloat[1:]
]
# The assignment is positional, so it relies on the frame's columns being in
# the same order as `tofloat`, preceded by the three descriptive columns.
df.columns = ['synonyms', 'category', 'specific gravity'] + labelled_measurements

In [13]:
# Show all columns grouped by dtype to check that their types are correct.
df.columns.to_series().groupby(df.dtypes).groups

{dtype('float64'): Index(['energy | kcal', 'protein | g', 'alcohol | g', 'water | g',
        'carbohydrates, available | g', 'starch | g', 'sugars | g',
        'dietary fibres | g', 'fat, total | g', 'cholesterol | g',
        'fatty acids, monounsaturated | g', 'fatty acids, saturated | g',
        'fatty acids, polyunsaturated | g', 'beta-carotene | g',
        'vitamin b1 | g', 'vitamin b2 | g', 'vitamin b6 | g', 'vitamin b12 | g',
        'niacin | g', 'folate | g', 'pantothenic acid | g', 'vitamin c | g',
        'vitamin d | g', 'sodium | g', 'potassium | g', 'chloride | g',
        'calcium | g', 'magnesium | g', 'phosphorus | g', 'iron | g',
        'iodide | g', 'zinc | g'],
       dtype='object'),
 dtype('O'): Index(['synonyms', 'category', 'specific gravity'], dtype='object')}

In [14]:
# Import settings
import json

# The encoding is stated explicitly so that reading the JSON file does not
# depend on the platform's default locale encoding.
with open("sanitization_parameters.json", "r", encoding="utf-8") as f:
    dic = json.load(f)

# Used further down as the +/- tolerance around 100 when filtering rows.
grams_maximal_window = dic["grams_maximal_window"]

In [15]:
# Mandatory columns: the components whose per-row total is checked against 100
# in a later cell. Minerals are listed separately and appended at the end.
minerals = [
    'sodium | g', 'potassium | g', 'chloride | g',
    'calcium | g', 'magnesium | g', 'phosphorus | g',
    'iron | g', 'iodide | g', 'zinc | g',
]

main_components = [
    'dietary fibres | g',
    'alcohol | g',
    'protein | g',
    'water | g',
    'carbohydrates, available | g',
    'fat, total | g',
]

cols = [*main_components, *minerals]

In [16]:
# Drop all the rows whose mandatory components do not sum to 100, allowing a
# tolerance of `grams_maximal_window` on either side (bounds inclusive).
import numpy as np

rows_sum = np.sum(df[cols], axis=1)

lower_bound = 100 - grams_maximal_window
upper_bound = 100 + grams_maximal_window
within_window = (rows_sum >= lower_bound) & (rows_sum <= upper_bound)

df = df[within_window]

In [17]:
# Column labels after renaming and row filtering.
df.columns

Index(['synonyms', 'category', 'specific gravity', 'energy | kcal',
       'protein | g', 'alcohol | g', 'water | g',
       'carbohydrates, available | g', 'starch | g', 'sugars | g',
       'dietary fibres | g', 'fat, total | g', 'cholesterol | g',
       'fatty acids, monounsaturated | g', 'fatty acids, saturated | g',
       'fatty acids, polyunsaturated | g', 'beta-carotene | g',
       'vitamin b1 | g', 'vitamin b2 | g', 'vitamin b6 | g', 'vitamin b12 | g',
       'niacin | g', 'folate | g', 'pantothenic acid | g', 'vitamin c | g',
       'vitamin d | g', 'sodium | g', 'potassium | g', 'chloride | g',
       'calcium | g', 'magnesium | g', 'phosphorus | g', 'iron | g',
       'iodide | g', 'zinc | g'],
      dtype='object')

In [18]:
# Normalize at 100 all the float cols: every numeric column is rescaled by the
# same per-row factor, so that the mandatory components in `cols` sum to
# exactly 100 for each row.
#
# The columns are selected by dtype rather than with `df.columns[2:]`: that
# positional slice starts at 'specific gravity', an object column, which was
# therefore being rescaled together with the nutrient columns.
row_totals = np.sum(df[cols], axis=1)
numeric_cols = df.select_dtypes(include="number").columns

# `df` comes from a boolean-mask selection in an earlier cell; work on an
# explicit copy so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
df = df.copy()
df[numeric_cols] = df[numeric_cols].divide(row_totals, axis="index") * 100

In [19]:
# Per-row total of the mandatory components after normalization.
df[cols].sum(axis=1)

name
Agar Agar                                                          100.0
Maccheroni dell'alpigiano, preparati                               100.0
Amaretti (biscotti alle mandorle)                                  100.0
Ananas, zuccherato, conserva                                       100.0
Ananas, crudo                                                      100.0
Ananas, non zuccherato, conserva                                   100.0
Mele, cotte, sgocciolate (senza aggiunta di zucchero)              100.0
Mele, sbucciate, secche                                            100.0
Mela, cruda                                                        100.0
Torta di mele, cotta nel forno (pasta sfoglia)                     100.0
Torta di mele, cotta nel forno (pasta per torta)                   100.0
Mele, purea, zuccherata, conserva                                  100.0
Mele, purea, non zuccherata, conserva                              100.0
Succo di mele                                 

In [20]:
# Save the result: write the cleaned, normalized table (with its name index)
# to CSV.
df.to_csv("../csv/confederazione_svizzera.csv")

In [21]:
# Display the final table.
df

Unnamed: 0_level_0,synonyms,category,specific gravity,energy | kcal,protein | g,alcohol | g,water | g,"carbohydrates, available | g",starch | g,sugars | g,...,vitamin d | g,sodium | g,potassium | g,chloride | g,calcium | g,magnesium | g,phosphorus | g,iron | g,iodide | g,zinc | g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agar Agar,,Diversi/Gelificanti e leganti,,162.963,2.44444,,20.8796,0,,,...,,0.132407,0.0529629,,0.672221,0.101852,0.0346296,0.00458332,,0.00152777
"Maccheroni dell'alpigiano, preparati",,Cibi/Altri cibi salati/saporiti,,181.191,6.00688,0,63.6139,18.3161,16.839,1.37863,...,2.9542e-07,0.433283,0.157558,0.679467,0.118168,0.01871,0.118168,0.000393894,6.59772e-06,0.000787788
Amaretti (biscotti alle mandorle),,Dolciumi/Biscotti,,450.24,7.58719,0,0.69882,75.5724,0.998315,75.5724,...,0,0.0359393,0.219629,0.0489174,0.0688837,0.0618955,0.129781,0.00129781,1.99663e-06,0.00149747
"Ananas, zuccherato, conserva",,Frutta/Frutta cotta (incl. conserve),,86.8315,0.399225,0,77.7491,20.1609,0,20.1609,...,0,0.00179651,0.119768,0.0349322,0.0129748,0.0139729,0.00988082,0.000299419,0,9.98063e-05
"Ananas, crudo",,Frutta/Frutta fresca,,50.9835,0.39987,0,86.472,11.2963,0,11.2963,...,0,0.00199935,0.149951,0.0389874,0.0149951,0.0149951,0.0109964,0.000299903,1.39955e-06,9.99676e-05
"Ananas, non zuccherato, conserva",,Frutta/Frutta cotta (incl. conserve),,50.9942,0.399954,0,86.4901,11.2987,0,11.2987,...,0,0.00199977,0.129985,0.0389956,0.0139984,0.0149983,0.0109987,0.000299966,0,9.99886e-05
"Mele, cotte, sgocciolate (senza aggiunta di zucchero)",,Frutta/Frutta cotta (incl. conserve),,72.3821,0.402123,0,80.7262,15.4817,0.100531,15.3812,...,0,0.00532813,0.140743,0.00291539,0.0062329,0.00532813,0.0120637,0.000301592,1.00531e-06,0.000100531
"Mele, sbucciate, secche",,Frutta/Frutta secca,,305.542,1.55361,0,20.7147,65.4586,0.103574,62.1442,...,0,0.0155361,0.538583,0.0103574,0.0155361,0.0321078,0.0580013,0.00207147,1.03574e-05,0.000621442
"Mela, cruda",,Frutta/Frutta fresca,,55.2517,0.301373,0,85.3889,11.7535,0.100458,11.6531,...,0,0.0040183,0.120549,0.00221007,0.00502288,0.0040183,0.00904118,0.000200915,8.03661e-07,0.000100458
"Torta di mele, cotta nel forno (pasta sfoglia)",,"Dolciumi/Dolci, torte e cake;Cibi/Torte dolci ...",,141.423,2.29065,0,72.2052,16.4329,6.87194,9.46137,...,1.99187e-07,0.0936178,0.109553,0.139431,0.0199187,0.00657316,0.0368495,0.000398373,2.9878e-06,0.00029878
