In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool, cpu_count
from IPython.display import display

### Determining food names and tables

In [2]:
# Root of the scraped site; parse_food reads one sub-directory per food name.
root = "../raw/www.valori-alimenti.com"
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the run (and the row order of the final CSV) reproducible. Hidden
# entries, i.e. names starting with ".", are skipped.
foods = sorted(food for food in os.listdir(root) if not food.startswith("."))

### Merging CSV files

In [3]:
def parse_food(food, root="../raw/www.valori-alimenti.com"):
    """Parse one scraped food directory into a single-row nutrient table.

    Reads ``{root}/{food}/0.csv`` and ``{root}/{food}/metadata.json``.
    The CSV must have a column named ``"0"`` (nutrient name) followed by a
    unit column and a value column, in that order.

    Row handling:

    * rows with any missing cell are discarded;
    * rows whose unit is not exactly ``mg``, ``mcg``, ``g`` or ``kcal``
      (this includes ``kj``) are discarded;
    * a string value containing ``"tr"`` (presumably a "traces" marker --
      TODO confirm) is replaced by ``0.05`` before unit conversion;
    * ``mg`` and ``mcg`` values are converted to grams.

    Each kept row becomes a column named ``"<name> | <unit>"`` (lower-cased,
    stripped, double spaces collapsed). If two rows produce the same column
    name, the last one wins.

    Parameters
    ----------
    food : str
        Name of the food directory under ``root``; also used as row label.
    root : str, optional
        Directory holding one sub-directory per food.

    Returns
    -------
    pandas.DataFrame or None
        One row labelled ``food`` (index name ``"name"``) with float nutrient
        columns plus a ``"category"`` column taken from ``metadata.json``.
        ``None`` when the CSV has no rows or only a single column.

    Raises
    ------
    ValueError
        If the CSV does not have exactly three columns, or a kept value
        cannot be converted to ``float``.
    """
    file_path = "{path}/{name}/0.csv".format(path=root, name=food)
    raw = pd.read_csv(file_path)
    if raw.empty or raw.columns.size == 1:
        return None

    kept_units = ("mg", "mcg", "g", "kcal")
    to_grams = {"mg": 1e-3, "mcg": 1e-6}

    # Collect the values in a plain dict instead of mutating the frame row by
    # row: one pass, no label-based drops, and numeric (not object) columns.
    record = {}
    table = raw.dropna().set_index("0")
    for name, unit, value in table.itertuples(index=True, name=None):
        if unit not in kept_units:
            continue
        if isinstance(value, str) and "tr" in value:
            value = 0.05
        value = float(value)
        if unit in to_grams:
            value *= to_grams[unit]
            unit = "g"
        column = "{index} | {unit}".format(
            index=name, unit=unit).lower().strip().replace("  ", " ")
        record[column] = value

    with open("{path}/{name}/metadata.json".format(path=root, name=food),
              "r") as f:
        record["category"] = json.load(f)["category"]

    # Passing `columns` explicitly keeps the CSV row order as column order.
    return pd.DataFrame([record],
                        index=pd.Index([food], name="name"),
                        columns=list(record))

In [4]:
# Parse every food directory in parallel, one worker process per CPU core,
# showing a progress bar while the results come back in input order.
with Pool(cpu_count()) as workers:
    parsed_foods = list(
        tqdm(workers.imap(parse_food, foods), total=len(foods)))

# parse_food returns None for unusable files; pd.concat skips None entries.
valori_alimenti = pd.concat(parsed_foods)

HBox(children=(IntProgress(value=0, max=3263), HTML(value='')))




### Drop rows without required columns

In [5]:
# Columns that every retained food must have a value for.
required_columns = [
    "grassi | g",
    "carboidrati | g",
    "proteine | g",
    "fibre | g",
    "acqua | g",
    "ceneri | g",
]

In [6]:
# Keep only the foods that have a value in every required column.
# dropna works row by row, so (unlike a label-based drop) it cannot remove
# an unrelated row that happens to share an index label.
valori_alimenti = valori_alimenti.dropna(subset=required_columns)

### Drop rows whose gram total isn't close to 100 g, then rescale the rest to 100 g

In [7]:
# Gram-valued columns that are summed to obtain the total mass of each food.
grams = (
    # Alcohol, caffeine, theobromine.
    ["alcol etilico | g", "caffeina | g", "teobromina | g"]
    # Macronutrients, fibre, water and ash.
    + [
        "grassi | g", "carboidrati | g", "proteine | g",
        "fibre | g", "acqua | g", "ceneri | g",
    ]
    # Minerals.
    + [
        "calcio | g", "sodio | g", "fosforo | g", "potassio | g",
        "ferro | g", "magnesio | g", "zinco | g", "rame | g",
        "manganese | g", "selenio | g",
    ]
    # Vitamins and related compounds.
    + [
        "retinolo (vit. a) | g",
        "betaina | g",
        "tiamina (vit. b1) | g",
        "riboflavina (vit. b2) | g",
        "niacina (vit. b3) | g",
        "acido pantotenico (vit. b5) | g",
        "piridossina (vit. b6) | g",
        "acido folico (vit. b9 o m o folacina) | g",
        "folato alimentare | g",
        "folati, totali | g",
        "cobalamina (vit. b12) | g",
        "vitamina b-12, aggiunta | g",
        "acido ascorbico (vit. c) | g",
        "vitamina d (d2+d3) | g",
        "vitamina d3 | g",
        "alpha-tocoferolo (vit. e) | g",
        "vitamina e, aggiunta | g",
        "fillochinone (vit. k) | g",
        "colina totale (vit. j) | g",
    ]
    # Carotenoids.
    + [
        "carotene, beta | g",
        "carotene, alfa | g",
        "criptoxantina, beta | g",
        "licopene | g",
        "luteina + zeaxantina | g",
    ]
)

In [8]:
# Every column except the textual "category" one gets rescaled later.
# A comprehension (rather than a set difference) keeps the frame's column
# order, so the list is identical from one run to the next.
to_normalize = [
    column for column in valori_alimenti.columns if column != "category"
]

In [9]:
# Half-width, in grams, of the accepted interval around 100 g.
with open("./sanitization_parameters.json", "r") as parameters_file:
    sanitization_parameters = json.load(parameters_file)
window = sanitization_parameters["grams_maximal_window"]

In [10]:
def _and(*args):
    return np.all(args, axis=0)

def _or(*args):
    return np.any(args, axis=0)

In [11]:
# Fraction of cells in the table that are missing.
pd.isna(valori_alimenti.values).mean()

0.4249677283075551

In [12]:
# Per-food total of the gram columns (missing values are skipped).
sums = valori_alimenti[grams].sum(axis=1)

In [13]:
# Per-food total of the gram columns.
sums = valori_alimenti[grams].sum(axis=1)

# Keep the foods whose total lies strictly inside (100 - window, 100 + window).
lower_bound, upper_bound = 100 - window, 100 + window
within_window = ((sums < upper_bound) & (sums > lower_bound)).values
around_100g = valori_alimenti.iloc[within_window].copy()

# Rescale every non-category column by the row's gram total, so that the gram
# columns of each kept food add up to exactly 100.
row_totals = around_100g[grams].sum(axis=1)
rescaled = around_100g[to_normalize].div(row_totals, axis="index") * 100
around_100g[to_normalize] = rescaled

valori_alimenti = around_100g

In [14]:
# Fraction of cells that are missing after filtering and rescaling.
pd.isna(valori_alimenti.values).mean()

0.42619198263401287

In [15]:
# Write the cleaned table to disk; the food name index becomes the first column.
output_path = "../csv/valori_nutrizionali.csv"
valori_alimenti.to_csv(output_path)