In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

In [2]:
# Root folder of the scraped data; each visible sub-entry is treated as one food.
path = "../raw/www.my-personaltrainer.it"
# Skip hidden entries (names starting with a dot).
foods = [
    entry
    for entry in os.listdir(path)
    if not entry.startswith(".")
]
# Table names are the CSV file names (text before ".csv") found in the first food's folder.
tables = [
    csv_name.partition(".csv")[0]
    for csv_name in os.listdir("{path}/{food}".format(path=path, food=foods[0]))
    if csv_name.endswith(".csv")
]

In [3]:
# Display the food directory names collected by the listing cell.
foods

['patate bollite senza buccia',
 'menta',
 'castagne arrosto',
 'cocktail frutta sciroppata',
 'agnello cotto al forno',
 'occhiata',
 'gelato panna2',
 'uova tuorlo',
 'pizza pomodoro',
 'marsala uovo',
 'muesli',
 'barrettacoccocioccolato',
 'pere',
 'salame_napoli',
 'pecorino romano',
 'salsiccia fegato',
 'tacchino crudo',
 'spigola',
 'bovino adulto punta di petto',
 'tabelle nutrizionali oliiegrassi',
 'latte evaporato',
 'salmone salamoia',
 'cavolo cappuccio verde crudo',
 'mele golden',
 'pernice',
 'margarina animale vegetale',
 'latte polverescremato',
 'finocchi crudi',
 'rana',
 'trippa bovino',
 'storione uova caviale',
 'ananas sciroppato',
 'merluzzo baccala secco',
 'boga',
 'pancetta_arrotolata',
 'salame napoli',
 'fiocchi formaggio',
 'pomodori san marzano',
 'cavolo broccolo verde ramoso crudo',
 'lepre',
 'pizza bianca',
 'trota irridea',
 'castagne secche',
 'aringa fresca',
 'radicchio verde',
 'yogurt scremato',
 'salmone fresco',
 'gorgonzola',
 'strutto',
 '

# Protein Standardization

In [13]:
# Standardize table 1 (protein detail) of a single food.
# NOTE: foods comes from os.listdir, whose order is not guaranteed, so foods[8]
# may point at a different food on another machine ('pizza pomodoro' in the
# listing displayed in this notebook).
file_path = "{path}/{name}/{table}.csv".format(path=path, name=foods[8], table=1)

# Skip the first data row (titles). The copy makes the in-place writes below
# target an independent frame instead of a slice of the freshly loaded one.
df = pd.read_csv(file_path).iloc[1:].copy()

# The third cell of the first remaining row holds the protein percentage as
# text with a decimal comma and one trailing character (presumably "%"),
# converted here to a fraction in [0, 1].
# Use positional .iloc[row, col]: integer keys on a string-labelled Series
# (the old df.iloc[0][2]) rely on a deprecated positional fallback.
protein_percentage = float(df.iloc[0, 2][:-1].replace(",", ".")) / 100

# Convert the fourth column to numbers (unparseable cells become NaN) and
# scale every entry by the protein fraction.
df.iloc[:, 3] = pd.to_numeric(df.iloc[:, 3], errors="coerce") * protein_percentage
# The first row describes the protein percentage itself: store the fraction there.
df.iloc[0, 3] = protein_percentage

# Keep only the label column ("0") and the scaled values, index by label and
# name the value column "1".
df = df.iloc[:, [1, 3]].set_index("0").rename(columns={"2": "1"})
proteins = df
proteins

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
Proteine (%):,0.071
Acido aspartico,0.38624
Acido glutamico,2.17899
Alanina,0.23004
Aminoacido limitante,
Arginina,0.26412
Cistina,0.17395
Fenilalanina,0.34932
Glicina,0.2698
Indice chimico,


# General Standardization

In [5]:
# A number written with a decimal point or decimal comma, optionally followed
# by one of the supported units. Whitespace around either part is tolerated.
_VALUE_PATTERN = re.compile(
    r"\s*([+-]?(?:\d+(?:[.,]\d*)?|[.,]\d+))\s*(%|mg|mcg|[\u00b5\u03bc]g|g)?\s*"
)

# Divisor applied per unit: percentages become fractions, masses become grams.
_UNIT_DIVISORS = {
    "": 1,
    "%": 100,
    "g": 1,
    "mg": 1000,
    "mcg": 1000000,
    "\u00b5g": 1000000,  # micro sign + g
    "\u03bcg": 1000000,  # Greek mu + g
}


def parse_value(value):
    """Convert one raw table cell into a float.

    Supported inputs:
      * missing values (NaN/None)            -> NaN
      * values that are already numeric      -> the same value as a float
      * "<number>%"                          -> number / 100
      * "<number>mg"                         -> number / 1000 (grams)
      * "<number>mcg" / "<number>µg"         -> number / 1000000 (grams)
      * "<number>g"                          -> number
      * "<number>"                           -> number

    The number may use either "." or "," as the decimal separator. Anything
    else (free text, unknown units) yields NaN instead of raising, so a single
    odd cell cannot abort the parsing of a whole column.
    """
    if pd.isna(value):
        return np.nan
    # Cells that pandas already parsed as numbers need no string handling.
    if isinstance(value, (int, float, np.number)):
        return float(value)
    if not isinstance(value, str):
        return np.nan

    match = _VALUE_PATTERN.fullmatch(value)
    if match is None:
        # Free text or an unsupported unit.
        return np.nan

    number = match.group(1).replace(",", ".")
    unit = match.group(2) or ""
    return float(number) / _UNIT_DIVISORS[unit]

In [12]:
# Standardize table 0 (general composition) of the food at foods[8].
file_path = "{path}/{name}/{table}.csv".format(path=path, name=foods[8], table=0)
# Skip the first data row (titles), keep only the columns at positions 1 and 2
# (the original note says this drops the RDA column) and index by column "0".
df = pd.read_csv(file_path).iloc[1:, [1, 2]].set_index("0")
# Overwrite the single remaining column with the parsed value of each raw cell.
df.iloc[:, [0]] = np.array(list(map(parse_value, df["1"].tolist()))).reshape((len(df), 1))
general = df
general

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
Parte edibile,1.0
Acqua,41.0
Carboidrati disponibili,41.4
Carboidrati complessi,35.0
Zuccheri solubili,2.9
Proteine,7.1
Grassi (Lipidi),6.6
Saturi totali,
Monoinsaturi totali,
Polinsaturi totali,


# Fat Standardization

In [11]:
# Standardize table 2 (fat detail) of the food at foods[8].
file_path = "{path}/{name}/{table}.csv".format(path=path, name=foods[8], table=2)
# Skip the first data row (titles), keep only the columns at positions 1 and 2
# (dropping the leading index column) and index by column "0".
df = pd.read_csv(file_path).iloc[1:, [1, 2]].set_index("0")
# Overwrite the single remaining column with the parsed value of each raw cell.
df.iloc[:, [0]] = np.array(list(map(parse_value, df["1"].tolist()))).reshape((len(df), 1))
fats = df
fats

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
Lipidi totali,6.6
Saturi totali,
C12 0,
C14 0,
C14 1,
C15 0,
C16 0,
C16 1,
C17 0,
C17 1,


# Table Merging

In [10]:
# Stack the three standardized tables row-wise, then transpose so that every
# row label becomes a column of a single wide record.
pd.concat([general, proteins, fats], axis=0).transpose()

0,Parte edibile,Acqua,Carboidrati disponibili,Carboidrati complessi,Zuccheri solubili,Proteine,Grassi (Lipidi),Saturi totali,Monoinsaturi totali,Polinsaturi totali,...,C20 4,C20 5,C22 0,C22 1,C22 6,C4 0 c10 0,Monoinsaturi totali.1,Nitrati,Nitriti,Polinsaturi totali.1
1,1.0,41.0,41.4,35.0,2.9,7.1,6.6,,,,...,,,,,,0.0,,,,


In [18]:
# Peek at the second (label, row) pair of the protein table. The table is
# referenced by name rather than through the scratch variable `df`, which
# holds whichever table was standardized last and therefore depends on cell
# execution order.
list(proteins.iterrows())[1]

('Acido aspartico', 1    0.38624
 Name: Acido aspartico, dtype: float64)