In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import re
from multiprocessing import Pool, cpu_count
from IPython.display import display

In [2]:
# Root folder of the scraped site: it is listed below to get one entry per food.
path = "../raw/www.cibo360.it"

# Keep every directory entry whose name does not start with a dot.
foods = list(filter(lambda entry: not entry.startswith("."), os.listdir(path)))

# Table names are the CSV file names found in the first food folder,
# cut at the ".csv" marker.
tables = [
    csv_name.split(".csv")[0]
    for csv_name in os.listdir("{path}/{food}".format(path=path, food=foods[0]))
    if csv_name.endswith(".csv")
]

In [3]:
tables

['minerali',
 'acidi_grassi',
 'vitamine',
 'composizione_chimica',
 'aminoacidi',
 'energia']

In [4]:
# Multiplier applied to a value according to its unit label: mass units are
# scaled to grams, percentages are left untouched.
coefficents = dict([
    ("g", 1),
    ("mg", 1e-3),
    ("µg", 1e-6),
    ("%", 1),
])

# Minerali parser

In [5]:
def parse_minerali(food):
    """Load the "minerali" table of a food and scale it from mg to g.

    Parameters
    ----------
    food : str
        Name of the food folder under the module-level ``path``.

    Returns
    -------
    pandas.DataFrame or None
        The table indexed by its "0" column with every value divided by
        1000, or None when the food has no ``minerali.csv`` file.
    """
    csv_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table="minerali")
    if os.path.exists(csv_path):
        minerals = pd.read_csv(csv_path).set_index("0")
        # mg to g
        return minerals / 1000
    return None

# acidi_grassi

In [6]:
def parse_acidi_grassi(food):
    """Load the "acidi_grassi" table of a food without rescaling it.

    Parameters
    ----------
    food : str
        Name of the food folder under the module-level ``path``.

    Returns
    -------
    pandas.DataFrame or None
        The table indexed by its "0" column, or None when the food has no
        ``acidi_grassi.csv`` file.
    """
    csv_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table="acidi_grassi")
    if os.path.exists(csv_path):
        # values are already expressed in g, so they are returned as read
        return pd.read_csv(csv_path).set_index("0")
    return None

# vitamine

In [7]:
def parse_vitamine(food):
    """Load the "vitamine" table of a food and convert its values to grams.

    Column "1" is multiplied row by row by the ``coefficents`` entry of the
    unit found in column "2"; a unit missing from ``coefficents`` raises
    KeyError.

    Parameters
    ----------
    food : str
        Name of the food folder under the module-level ``path``.

    Returns
    -------
    pandas.DataFrame or None
        The table indexed by its "0" column without its last (unit) column,
        or None when the food has no ``vitamine.csv`` file.
    """
    csv_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table="vitamine")
    if not os.path.exists(csv_path):
        return None

    vitamins = pd.read_csv(csv_path).set_index("0")

    # one multiplier per row, looked up from the unit column
    factors = []
    for unit in vitamins["2"].tolist():
        factors.append(coefficents[unit])
    vitamins["1"] = vitamins["1"] * factors

    # result in g: the unit column (the last one) is no longer needed
    return vitamins.iloc[:, :-1]

# composizione_chimica

In [8]:
def parse_composizione_chimica(food):
    """Load the "composizione_chimica" table of a food and convert it to grams.

    Column "1" is multiplied row by row by the ``coefficents`` entry of the
    unit found in column "2"; rows without a unit are multiplied by 1. The
    "di cui" row is then removed (KeyError if the table has no such row).

    Parameters
    ----------
    food : str
        Name of the food folder under the module-level ``path``.

    Returns
    -------
    pandas.DataFrame or None
        The table indexed by its "0" column without its last (unit) column,
        or None when the food has no ``composizione_chimica.csv`` file.
    """
    csv_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table="composizione_chimica")
    if not os.path.exists(csv_path):
        return None

    composition = pd.read_csv(csv_path).set_index("0")

    # missing units fall back to a neutral multiplier
    factors = [
        1 if pd.isna(unit) else coefficents[unit]
        for unit in composition["2"].tolist()
    ]
    composition["1"] = composition["1"] * factors
    composition = composition.drop("di cui")

    # result in g: the unit column (the last one) is no longer needed
    return composition.iloc[:, :-1]

# aminoacidi

In [9]:
def parse_aminoacidi(food):
    """Load the "aminoacidi" table of a food, scaled by its edible part.

    Every row except the last two is converted to float and multiplied by
    ``parte_edibile / 100``, where ``parte_edibile`` is the first value of
    the "Parte edibile" row of the food's ``composizione_chimica.csv``.
    The last two rows are returned as read.

    Parameters
    ----------
    food : str
        Name of the food folder under the module-level ``path``.

    Returns
    -------
    pandas.DataFrame or None
        The table indexed by its "0" column without its last column, or
        None when either ``aminoacidi.csv`` or ``composizione_chimica.csv``
        is missing for the food.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table="aminoacidi")
    if not os.path.exists(file_path):
        return None
    df = pd.read_csv(file_path).set_index("0")

    # Get the edible part ("Parte edibile") of the food
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table="composizione_chimica")
    if not os.path.exists(file_path):
        return None
    composizione = pd.read_csv(file_path).set_index("0")
    # .iloc[0] makes the positional access explicit: plain [0] on a Series
    # with string labels is deprecated positional indexing.
    parte_edibile = composizione.loc["Parte edibile"].iloc[0]

    df = df.iloc[:, :-1]  # drop the last col

    # Normalize every row but the last two. The result is assembled with
    # concat instead of assigning into a slice of the frame, so no floats
    # are written into a column that was read with another dtype.
    scaled = df.iloc[:-2].astype(float) * (parte_edibile / 100)
    return pd.concat([scaled, df.iloc[-2:]])

# energia

In [10]:
def parse_energia(food):
    """Load the first row of the "energia" table of a food.

    Parameters
    ----------
    food : str
        Name of the food folder under the module-level ``path``.

    Returns
    -------
    pandas.DataFrame or None
        A frame holding only the first row of the table, indexed by its "0"
        column, or None when the food has no ``energia.csv`` file.
    """
    csv_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table="energia")
    if os.path.exists(csv_path):
        energy = pd.read_csv(csv_path).set_index("0")
        # only the first row is kept
        return energy.iloc[:1]
    return None

# Merge all the rows into one big CSV

In [11]:
# Parsers applied, in this order, to every food folder (one per table file).
functions = [parse_minerali, parse_acidi_grassi, parse_vitamine]
functions += [parse_composizione_chimica, parse_aminoacidi, parse_energia]

In [12]:
# collect all the file for the single food and concat them
def get_single_row(food):
    """Build the wide, transposed table of a single food.

    Every parser in the module-level ``functions`` list is run on ``food``;
    the tables they return (None results are skipped) are stacked and
    transposed, so that the former "1" column becomes a row labelled with
    the food name. When the same column label appears more than once, only
    its first occurrence is kept.

    Parameters
    ----------
    food : str
        Name of the food folder passed to each parser.

    Returns
    -------
    pandas.DataFrame
        The transposed table, or an empty DataFrame when no parser returned
        a table.
    """
    parsed = [table for table in (parse(food) for parse in functions) if table is not None]
    if not parsed:
        return pd.DataFrame()

    single_row = pd.concat(parsed).T
    single_row = single_row.rename({"1": food})
    single_row = single_row.iloc[:, ~single_row.columns.duplicated()]
    # Clear the name of the columns axis. Assigning None replaces
    # `del single_row.columns.name`, which raises AttributeError on pandas
    # versions where Index.name is a property without a deleter.
    single_row.columns.name = None
    return single_row

In [13]:
# Build the per-food tables in parallel (one worker per CPU) and stack them
# into a single table, showing a progress bar while the results come in.
with Pool(cpu_count()) as pool:
    rows = pool.imap(get_single_row, foods)
    mpt = pd.concat(list(tqdm(rows, total=len(foods))))

HBox(children=(IntProgress(value=0, max=850), HTML(value='')))




# Adding units to the column names

In [14]:
def get_unit(col):
    """Return the unit label to attach to a column name.

    Parameters
    ----------
    col : str
        Column name, compared exactly (case-sensitive) with the known names.

    Returns
    -------
    str
        "%" for the two percentage columns, "" for the two unit-less
        columns, "g" for anything else.
    """
    percent_columns = ('Rapporto Polinsaturi/Saturi', 'Parte edibile')
    unitless_columns = ('Indice chimico', 'Aminoacido limitante')

    if col in percent_columns:
        return "%"
    return "" if col in unitless_columns else "g"

In [15]:
# Label every column as "<name> | <unit>" ("kcal" becomes "calorie | kcal"),
# then lowercase the label and strip surrounding whitespace.
mpt.columns = [
    ("calorie | kcal" if name == "kcal" else name + " | " + get_unit(name)).lower().strip()
    for name in mpt.columns
]

In [16]:
# Matches a pair of parentheses that is empty or contains only whitespace.
regex = r"(\(\s*\))"

In [17]:
# Clean the index labels: remove every match of `regex`, trim, lowercase.
# `count` and `flags` are passed by keyword because passing them
# positionally to re.sub is deprecated.
mpt.index = [re.sub(regex, "", x, count=0, flags=re.MULTILINE).strip().lower() for x in mpt.index]

In [18]:
# Display the merged table with the renamed columns and cleaned index.
mpt

Unnamed: 0,calcio | g,ferro | g,fosforo | g,magnesio | g,potassio | g,rame | g,selenio | g,zinco | g,grassi saturi totali | g,c4:0\÷c10:0 (laurico) | g,...,valina | g,metionina | g,isoleucina | g,leucina | g,tirosina | g,fenilalanina | g,triptofano | g,indice chimico |,aminoacido limitante |,calorie | kcal
capocollo,0.025,0.001,0.263,,,,,,,,...,,,,,,,0,0,,450.00
"zucchine, scure crude (cucurbita pepo)",,,,0.025,,0.00014,0.001,0.0009,,,...,,,,,,,0,0,,11.00
"vitello, grasso separato (bos taurus)",,,,,,,,,,,...,,,,,,,0,0,,
sego di bue (bos taurus),0,0.0003,0.007,0,0,0,0.0002,0,49.8,0,...,,,,,,,,,,872.00
tarassaco o dente di leone (taraxacum officinale),0.316,0.0032,0.065,,0.44,0.00017,0.0005,0.00041,0.17,0,...,,,,,,,0,0,,36.00
"pollo intero con pelle, cotto [arrosto di rosticceria] (gallus gallus)",0.01,0.0008,0.18,,0.27,,,,,,...,,,,,,,0,0,,246.00
"orata fresca d'allevamento, filetti (sparus auratus)",0.03,,1.05,,,,,,1.94,0,...,969,780,870,999.99,,999.99,259,100,Leu.,159.00
olio di fegato di merluzzo (merluccius merluccius),0,0,0,0,0,,0,0,22.6,0,...,,,,,,,0,0,,899.00
patate crude (solanum tuberosum),0.01,0.0006,0.054,0.028,0.57,0.00019,,0.00124,,,...,96.28,28.22,76.36,101.26,50.63,77.19,23.24,88,Leu.,85.00
funghi freschi,,0.0009,0.097,,,,,,,,...,,,,,,,,,,28.00


# Normalize per 100G

In [19]:
# List the column names of the merged table.
mpt.columns

Index(['calcio | g', 'ferro | g', 'fosforo | g', 'magnesio | g',
       'potassio | g', 'rame | g', 'selenio | g', 'zinco | g',
       'grassi saturi totali | g', 'c4:0\÷c10:0 (laurico) | g',
       'c12:0 (laurico) | g', 'c14:0 (miristico) | g', 'c16:0 (palmitico) | g',
       'c18:0 (stearico) | g', 'c20:0 (arachidico) | g', 'c22:0 (beenico) | g',
       'grassi monoinsaturi totali | g', 'c14:1 (miristoleico) | g',
       'c16:1 (palmitoleico) | g', 'c18:1 (oleico) | g',
       'c20:1 (eicosaenoico) | g', 'c22:1 (erucico) | g',
       'grassi polinsaturi totali | g', 'c18:2 w6 (linoleico) | g',
       'c18:3 w3 (linolenico) | g', 'c20:4 w6 (arachidonico) | g',
       'c20:5 w3 (eicosapentaenoico - epa) | g',
       'c22:6 w3 (docosaesaenoico - dha) | g',
       'rapporto polinsaturi/saturi | %', 'tiamina (b1) | g',
       'riboflavina (b2) | g', 'niacina (b3) | g',
       'vitamina a (retinolo eq.) | g', 'vitamina c | g', 'vitamina e | g',
       'parte edibile | %', 'acqua | g', 'pr

In [20]:
# Write the merged table to disk. The target folder is created first so the
# export does not fail when ../csv does not exist yet.
os.makedirs("../csv", exist_ok=True)
mpt.to_csv("../csv/cibo360.csv")