In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool, cpu_count

### Determining food names and tables

In [2]:
# Directory holding the scraped data; each entry name is later used as a food name.
path = "../raw/www.bda-ieo.it"
# os.listdir returns entries in arbitrary order: sorting them keeps the row
# order of the merged table reproducible from one run (or machine) to the next.
# Hidden entries (names starting with ".") are skipped.
foods = sorted(food for food in os.listdir(path) if not food.startswith("."))

### Merging CSV files

In [3]:
def create_row(food, path="../raw/www.bda-ieo.it"):
    """Build the single-row table of component values for one food.

    Reads ``{path}/{food}/0.csv``, keeps the (component, value) pairs found in
    its columns ``"0"`` and ``"1"``, converts every value to a float and
    returns them as one row labelled with the food name.

    Parameters
    ----------
    food : str
        Name of the food's sub-directory under ``path``; also used as the
        label of the returned row.
    path : str, optional
        Directory containing one sub-directory per food. Defaults to the
        location of the raw data used by this notebook.

    Returns
    -------
    pandas.DataFrame
        One row (index named ``"name"``) of ``float64`` values with one column
        per component. Columns are renamed from ``"<component>, <unit>"`` to
        ``"<component> | <unit>"``; values given in ``mg`` and ``mcg``/``µg``
        are converted to grams and their columns are labelled ``g``. Rows
        whose label contains ``"kJ"`` are discarded.

    Raises
    ------
    ValueError
        If a value neither contains ``"tr"`` nor can be parsed as a float.
    """
    file_path = "{path}/{name}/0.csv".format(path=path, name=food)
    df = pd.read_csv(file_path, usecols=["0", "1"])
    # Discard every row with a missing cell, then renumber the rows so the
    # first surviving one can be promoted to header.
    df = df.dropna().reset_index(drop=True)
    df.columns = df.iloc[0]
    df.index = df["Componenti Alimentari"]
    df = df.drop(columns=["Componenti Alimentari"])
    # The header row is now labelled by its own first cell: remove it.
    df = df.drop("Componenti Alimentari")
    df = df.drop([c for c in df.index if "kJ" in c])

    # Unit suffix found in the raw label -> unit shown in the output label.
    # Milligrams and micrograms are rescaled to grams below, hence "g".
    replace = {
        "mg": "g",
        "µg": "g",
        "mcg": "g",
        "kcal": "kcal",
        "g": "g",
        "%": "%"
    }

    labels = []
    values = []
    for label, raw in df["Valore"].items():
        if isinstance(raw, str) and "tr" in raw:
            # "tr" (presumably "trace") is replaced by a small fixed amount,
            # expressed in the row's own unit and therefore rescaled below.
            value = 0.05
        else:
            value = float(raw)
        if ", mg" in label:
            value = value * 1e-3
        elif ", mcg" in label or ", µg" in label:
            value = value * 1e-6

        new_label = label
        for unit, target in replace.items():
            new_label = new_label.replace(
                ", {unit}".format(unit=unit),
                " | {unit}".format(unit=target)
            )
        new_label = new_label.replace("  ", " ")

        labels.append(new_label)
        values.append(value)

    # Build the row in one go: no per-component copy of the frame, and the
    # column axis carries no name, so nothing has to be deleted afterwards
    # (`del df.columns.name` fails on pandas >= 1.0).
    row = pd.Series(values, index=labels, dtype=float, name=food).to_frame().T
    row.index.name = "name"
    return row

In [4]:
# Parse every food in a pool with one worker per CPU; imap yields the rows in
# the order of `foods`, and tqdm shows the progress while they are collected.
with Pool(processes=cpu_count()) as pool:
    bda = pd.concat(
        list(
            tqdm(
                pool.imap(create_row, foods),
                total=len(foods)
            )
        )
    )

HBox(children=(IntProgress(value=0, max=978), HTML(value='')))




### Dropping rows missing the most important columns

In [5]:
# Columns that a row must have a value for in order to be kept.
required_columns = [
    "Proteine totali | g",
    "Lipidi totali | g",
    "Colesterolo | g",
    "Carboidrati disponibili (MSE) | g",
    "Fibra alimentare totale | g",
    "Alcol | g",
    "Acqua | g",
]

In [6]:
# Keep only the rows that have a value in every required column.
bda = bda.dropna(subset=required_columns)

### Normalizing to 100g

In [7]:
# Columns expressed in grams whose per-row sum is used as the total weight.
grams = [
    "Proteine totali | g",
    "Lipidi totali | g",
    "Colesterolo | g",
    "Carboidrati disponibili (MSE) | g",
    "Fibra alimentare totale | g",
    "Alcol | g",
    "Acqua | g",
    "Ferro | g",
    "Calcio | g",
    "Sodio | g",
    "Potassio | g",
    "Fosforo | g",
    "Zinco | g",
    "Magnesio | g",
    "Rame | g",
    "Selenio | g",
    "Cloro | g",
    "Iodio | g",
    "Manganese | g",
    "Zolfo | g",
    "Vitamina B1, Tiamina | g",
    "Vitamina B2, Riboflavina | g",
    "Vitamina C | g",
    "Niacina | g",
    "Vitamina B6 | g",
    "Folati totali | g",
    "Acido pantotenico | g",
    "Biotina | g",
    "Vitamina B12 | g",
    "Retinolo eq. (RE) | g",
    "Retinolo | g",
    "ß-carotene eq. | g",
    "Vitamina E (ATE) | g",
    "Vitamina D | g",
    "Vitamina K | g",
]


def _and(*args):
    return np.all(args, axis=0)


def _or(*args):
    return np.any(args, axis=0)

In [8]:
# Every column except those with a "%" in their name gets rescaled.
to_normalize = list(
    set(bda.columns) - {column for column in bda.columns if "%" in column}
)

In [9]:
# Maximal accepted distance between a row's total of gram columns and 100.
with open("./sanitization_parameters.json", "r") as parameters_file:
    window = json.load(parameters_file)["grams_maximal_window"]

# Keep only the rows whose gram columns add up to strictly within `window` of 100.
sums = bda[grams].sum(axis=1)
within_window = _and(sums < 100 + window, sums > 100 - window)
around_100g = bda.iloc[within_window].copy()

# Rescale the kept rows so that their gram columns add up to exactly 100.
gram_totals = around_100g[grams].sum(axis=1)
around_100g[to_normalize] = (
    around_100g[to_normalize].divide(gram_totals, axis="index") * 100
)
bda = around_100g

In [10]:
# Write the final table to disk; the index (food names) is written as the first column.
bda.to_csv(path_or_buf="../csv/bda.csv")