In [22]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool, cpu_count

### Determining food names and tables

In [29]:
root = "../raw/www.valori-alimenti.com"
foods = [food for food in os.listdir(root) if food[0] != "."]

### Merging csv files

In [30]:
def parse_food(food):
    root = "../raw/www.valori-alimenti.com"
    file_path = "{path}/{name}/0.csv".format(path=root, name=food)
    df = pd.read_csv(file_path, index_col=0)
    if df.empty:
        return None
    try:
        for index, value in df.iterrows():
            if pd.isna(value).any():
                df = df.drop(index)
        for index, (unit, value) in df.iterrows():
            if isinstance(value, str) and "tr" in value:
                value = 0.05
            if unit not in ["mg", "mcg", "g", "kcal"]:
                df = df.iloc[[0] + list(range(2, len(df.columns)))]
                continue
            if unit in ["mg", "mcg"]:
                if unit == "mg":
                    df.loc[index] = float(value) * 1e-3
                elif unit == "mcg":
                    df.loc[index] = float(value) * 1e-6
                unit = "g"
            df = df.rename(
                index={
                    index:
                    "{index} | {unit}".format(index=index, unit=unit).lower()
                    .strip().replace("  ", " ")
                })
    except Exception as e:
        pprint(food)
        pprint(df)
        raise e
    df = df.drop(columns=["1"])
    df = df.transpose()
    with open("{path}/{name}/metadata.json".format(path=root, name=food),
              "r") as f:
        df["category"] = json.load(f)["category"]
    df.index = [food]
    df.index.name = "name"
    del df.columns.name
    return df

In [31]:
from IPython.display import display
with Pool(cpu_count()) as p:
    valori_alimenti = pd.concat(list(tqdm(p.imap(parse_food, foods), total=len(foods))))
valori_alimenti.index.name = "name"
valori_alimenti.to_csv("../csv/valori_nutrizionali.csv")

HBox(children=(IntProgress(value=0, max=3263), HTML(value='')))

In [32]:
valori_alimenti

Unnamed: 0_level_0,"acidi grassi, monoinsaturi | g","acidi grassi, polinsaturi | g","acidi grassi, saturi | g",acido ascorbico (vit. c) | g,acido folico (vit. b9 o m o folacina) | g,acido pantotenico (vit. b5) | g,alpha-tocoferolo (vit. e) | g,amido | g,beta-sistosterolo | g,betaina | g,...,sodio | g,stigmasterolo | g,teobromina | g,tiamina (vit. b1) | g,tocoferolo beta | g,tocoferolo delta | g,tocoferolo gamma | g,"vitamina b-12, aggiunta | g","vitamina e, aggiunta | g",zinco | g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pasta sfoglia,,,,,,,0.00055,,,,...,,,0.0,,,,,,0.0000,
nestle - buitoni risotto di mare buitoni,,,0.3,,,,,,,,...,0.270,,,,,,,,,
loacker classic chocolat dark noir 118 grammi,,,,,,,,,,,...,,,,,,,,,,
"croissant, al burro",,,,,,,0.00084,,,,...,,,0.0,,,,,,0.0000,
"lombo di maiale, magro, brasato",,,,,,,0.00021,,,,...,,,0.0,,,,,,0.0000,
muffin inglese preparati con farina di frumento,,,,,,,0.00045,,,,...,,,0.0,,,,,,0.0000,
loacker chip choc dark noir,,,,,,,,,,,...,,,,,,,,,,
caviale,,,,,,,0.00189,,,,...,,,0.0,,,,,,0.0000,
barilla - mulino bianco pan di casa - filone grano tenero,,,2.0,,,,,,,,...,0.660,,,,,,,,,
ricotta di pecora,,,,,,,0.00011,,,,...,,,0.0,,,,,,0.0000,


In [33]:
np.mean(np.mean(pd.isna(valori_alimenti)))

0.8251546736525277