In [2]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool, cpu_count

### Determining food names and tables

In [6]:
# Folder holding the scraped it.openfoodfacts.org dump; each entry inside it
# is later used as a product directory name.
root = "../raw/it.openfoodfacts.org"
# Skip hidden entries (names starting with "."). os.listdir returns names in
# arbitrary order, so sort them to make the downstream row order reproducible.
foods = sorted(food for food in os.listdir(root) if not food.startswith("."))

### Merging CSV files

In [59]:
# Column headers read from each product's "Informazioni nutrizionali.csv".
_NUTRIENT_COLUMN = "Informazioni nutrizionali"
_VALUE_COLUMN = "Come vendutoper 100 g / 100 ml"

# (suffix, factor-to-grams) pairs, tested in this order: the longer suffixes
# all contain "g", so the bare "g" must be checked last.
_MASS_UNITS = (
    ("mg", 1e-3),
    ("\u00b5g", 1e-6),  # MICRO SIGN
    ("\u03bcg", 1e-6),  # GREEK SMALL LETTER MU, renders the same as the above
    ("g", 1.0),
)


def _normalise_value(raw, label):
    """Normalise one textual table cell, converting masses to grams.

    Dots are dropped, commas become the decimal point and any "<" is removed
    (i.e. "1.234,5" style numbers are turned into something float() accepts).

    Returns a ``(value, unit)`` pair: ``(float, "g")`` when the text carries a
    mass unit (mg, microgram or g), otherwise ``(cleaned_text, "")`` for text
    that starts with an optional minus sign followed by a digit or an
    upper-case ASCII letter.

    Raises:
        ValueError: if the text matches none of the accepted shapes, or the
            part before the unit is not a valid number.
    """
    text = raw.replace(".", "").replace(",", ".").replace("<", "")
    for suffix, factor in _MASS_UNITS:
        if suffix in text:
            return float(text.split(suffix)[0]) * factor, "g"
    if re.match(r"-?[\dA-Z]", text):
        return text, ""
    raise ValueError(str((text, label)))


def parse_food(food, root="../raw/it.openfoodfacts.org"):
    """Build a one-row table of nutrition values and metadata for a product.

    Reads ``<root>/<food>/Informazioni nutrizionali.csv`` and
    ``<root>/<food>/metadata.json``.

    Args:
        food: name of the product directory inside ``root``.
        root: directory containing the product directories.

    Returns:
        A single-row DataFrame indexed by the product's metadata ``name``.
        Nutrients are columns; those expressed as a mass are converted to
        grams and their column is renamed to the lower-cased
        ``"<nutrient> | g"``. Every non-empty metadata entry is added as an
        extra column. ``None`` is returned when the nutrition table cannot be
        read or contains a ``"?"`` placeholder.

    Raises:
        ValueError: if a textual value has an unrecognised shape.
        KeyError: if the metadata has no non-empty ``"name"`` entry.
    """
    food_dir = os.path.join(root, food)
    try:
        table = pd.read_csv(
            os.path.join(food_dir, "Informazioni nutrizionali.csv"),
            usecols=[_NUTRIENT_COLUMN, _VALUE_COLUMN],
            index_col=_NUTRIENT_COLUMN)
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or an empty, malformed or
        # wrongly-shaped table (pandas reports these as ValueError
        # subclasses): the product is skipped rather than aborting the run.
        return None
    if (table == "?").any().any():
        return None

    # Collect the converted values in plain lists instead of editing the
    # frame while iterating over it; this also avoids writing floats into a
    # string-typed column.
    labels, values = [], []
    for label, raw in zip(table.index, table[_VALUE_COLUMN]):
        unit = ""
        if isinstance(raw, str):
            raw, unit = _normalise_value(raw, label)
        if unit:
            label = "{index} | {unit}".format(
                index=label, unit=unit).lower().strip().replace("  ", " ")
        labels.append(label)
        values.append(raw)

    row = pd.Series(values, index=labels, dtype=object).to_frame(_VALUE_COLUMN).T

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(food_dir, "metadata.json"), "r",
              encoding="utf-8") as f:
        for key, value in json.load(f).items():
            if value:
                row[key] = value
    row.index = row["name"]
    # Index.name is a property without a deleter in current pandas, so it is
    # cleared by assignment rather than with `del`.
    row.columns.name = None
    return row

In [61]:
from IPython.display import display

output_path = "../csv/openfoodsfacts.csv"
# Create the destination folder before the slow parallel parse, so a missing
# directory cannot make the final write fail after all the work is done.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Parse every product in parallel; imap yields results in the order of `foods`.
with Pool(cpu_count()) as p:
    parsed = list(tqdm(p.imap(parse_food, foods), total=len(foods)))

# parse_food returns None for products it skips; drop those explicitly.
# sort=True keeps the merged columns in alphabetical order regardless of the
# pandas version's default and of which product happens to come first.
open_food_facts = pd.concat(
    [frame for frame in parsed if frame is not None], sort=True)
open_food_facts.index.name = "name"
open_food_facts.to_csv(output_path)

HBox(children=(IntProgress(value=0, max=1260), HTML(value='')))

In [62]:
# Preview only the first rows instead of rendering the whole merged table.
open_food_facts.head(10)

Unnamed: 0_level_0,- acidi grassi monoinsaturi | g,- acidi grassi omega 6 | g,- acidi grassi polinsaturi | g,- acidi grassi saturi | g,- acidi grassi trans | g,- amido | g,- colesterolo | g,- lattosio | g,- polialcoli/polioli (alcoli degli zuccheri) | g,- zuccheri | g,...,vitamina b2 (riboflavina) | g,vitamina b3 / vitamina pp (niacina) | g,vitamina b5 (acido pantotenico) | g,vitamina b6 (piridoxina) | g,vitamina b8/b7/h/i (biotina) | g,vitamina b9 (acido folico) | g,vitamina c (acido ascorbico) | g,vitamina d (colecalciferolo) | g,vitamina e (alfa-tocoferolo) | g,zinco | g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ricotta — Conad — 250 g,,,,9.6,,,,,,3.7,...,,,,,,,,,,
Yaourt — Sterzing-Vipiteno — 125 g e,,,,2.5,,,,,,12,...,,,,,,,,,,
"Piccolinis Pomodoro Mozzarella — Buitoni — 270 g, 9 pizza de 30 g",,,,3.7,,,,,,2.2,...,,,,,,,,,,
Latte parzialmente scremato UHT — Verso Natura — 1 L,,,,1.1,,,,,,5.1,...,,,,,,,,,,
Bevanda a base di avena — Verso Natura — 1 L,,,,0.3,,,,,,7.7,...,,,,,,,,,,
Rigoni Nocciolata crema di cacao e nocciole Prodotto biologico — Rigoni di Asiago — 350 g,,,,6,,,,,,51,...,,,,,,,,,,
Crostata ala ciliegia — Santangelo — 350 g,,,,3.9,,,,,,37,...,,,,,,,,,,
Fondente con pezzi di cocco — Lindt,,,,23,,,,,,46,...,,,,,,,,,,
Gelato base Mandorla — VALSOIA — 50g,,,,15,,,,,,26,...,,,,,,,,,,
Wafer nocciola — Selex — 175 g,,,,24.9,,,,,,39.8,...,,,,,,,,,,


In [63]:
# Fraction of cells in the merged table that are missing. Computed on the
# boolean array directly, because np.mean(DataFrame) forwards axis=None to
# DataFrame.mean, whose meaning differs between pandas versions.
open_food_facts.isna().to_numpy().mean()

0.7461003211500229