In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool, cpu_count

### Determining food names and tables

In [2]:
# Directory holding one scraped CSV table per food.
root = "../raw/blia.it"
# A food's name is its file name without the trailing ".csv". Only the final
# extension is removed (splitting on ".csv" would truncate a name that contains
# ".csv" elsewhere), and the names are sorted because os.listdir returns the
# entries in arbitrary order, which would make the output row order
# irreproducible between runs.
csv_extension = ".csv"
foods = sorted(
    filename[:-len(csv_extension)]
    for filename in os.listdir(root)
    if filename.endswith(csv_extension)
)

### Merging CSV files and normalizing columns and units

In [3]:
def build(food, root="../raw/blia.it"):
    """Load one food's CSV table and return it as a single-row DataFrame.

    The file ``{root}/{food}.csv`` is read using its first four columns, the
    first of which is the row index. The row labelled ``0`` carries the column
    headers: the first data column is treated as the nutrient name, and the
    remaining headers are stripped and lower-cased, after which the
    ``"u.m."`` (unit) and ``"quantità"`` (value) columns are looked up.

    Values expressed in ``mg`` or ``mcg`` are converted to grams; every other
    unit is kept unchanged. Each nutrient becomes one output column labelled
    ``"<lower-cased nutrient name> | <unit>"``.

    Parameters
    ----------
    food : str
        File name (without ``.csv``) of the food; also used as the row label.
    root : str, optional
        Directory containing the CSV files.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``food`` (index named ``"name"``), float dtype.

    Raises
    ------
    ValueError
        If a value cannot be parsed by ``float``.
    KeyError
        If the file has no row labelled ``0`` or lacks the expected columns.
    """
    initial_index_name = "valori"
    final_index_name = "name"
    units_column = "u.m."
    value_column = "quantità"
    # Multipliers that convert the sub-gram units to grams.
    factors = {
        "mg": 1e-3,
        "mcg": 1e-6
    }
    path = "{root}/{food}.csv".format(root=root, food=food)
    raw = pd.read_csv(path, index_col=0, usecols=[0, 1, 2, 3])

    # The first row holds the real headers; promote it and drop it from the data.
    header = raw.iloc[0]
    table = raw.drop(0)
    table.columns = [
        initial_index_name,
        *(label.strip().lower() for label in header.iloc[1:])
    ]

    # Build the output labels and values row by row instead of mutating and
    # re-labelling the frame while iterating over it: this avoids chained
    # assignment, the quadratic per-row rename, and mislabelled rows when a
    # nutrient name appears more than once.
    labels = []
    amounts = []
    rows = zip(table[initial_index_name], table[units_column], table[value_column])
    for nutrient, unit, value in rows:
        amount = float(value)
        if unit in factors:
            amount = amount * factors[unit]
            unit = "g"
        labels.append("{index} | {unit}".format(index=nutrient.lower(), unit=unit))
        amounts.append(amount)

    result = pd.DataFrame([amounts], index=[food], columns=labels)
    result.index.name = final_index_name
    return result.astype(float)

In [4]:
# Parse every food table in a pool with one worker per CPU, showing a progress
# bar, then stack the resulting single-row frames into one table.
with Pool(processes=cpu_count()) as pool:
    parsed_rows = pool.imap(build, foods)
    with_progress = tqdm(parsed_rows, total=len(foods))
    blia = pd.concat(list(with_progress))

HBox(children=(IntProgress(value=0, max=791), HTML(value='')))




In [5]:
# Labels of the gram-denominated nutrient columns, in the
# "<nutrient name> | <unit>" form produced by `build`. The per-food sum of
# these columns is what gets compared against 100 g when sanitizing the table.
grams = [
    "acqua | g",
    "fibra alimentare | g",
    "proteine totali | g",
    "glucidi disponibili | g",
    "lipidi totali | g",
    "alcool | g",
    "acido folico | g",
    "niacina (vit. pp) | g",
    "retinolo | g",
    "carotene | g",
    "vitamina d | g",
    "vitamina e | g",
    "vitamina b6 | g",
    "vitamina c | g",
    "tiamina (vit.b1) | g",
    "zinco | g",
    "fosforo | g",
    "potassio | g",
    "sodio | g",
    "calcio | g",
    "ferro | g"
]

In [6]:
def _and(*args):
    return np.all(args, axis=0)

def _or(*args):
    return np.any(args, axis=0)

In [7]:
# Tolerance (read from the shared parameters file) on how far the summed gram
# columns of a food may deviate from 100.
with open("./sanitization_parameters.json", "r") as handle:
    parameters = json.load(handle)
window = parameters["grams_maximal_window"]

# Keep only the foods whose gram columns add up to strictly within
# 100 +/- window, then rescale every column so those gram columns sum to 100.
sums = blia[grams].sum(axis=1)
lower_bound, upper_bound = 100 - window, 100 + window
around_100g = blia.iloc[_and(sums < upper_bound, sums > lower_bound)]
gram_totals = around_100g[grams].sum(axis=1)
blia = around_100g.div(gram_totals, axis="index") * 100

In [8]:
# Write the sanitized table (one row per food) next to the other CSV exports.
output_path = "../csv/blia.csv"
blia.to_csv(output_path)