In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

In [2]:
path = "../raw/www.my-personaltrainer.it"
foods = [food for food in os.listdir(path) if food[0] != "."]
tables = [table.split(".csv")[0] for table in os.listdir("{path}/{food}".format(path=path, food=foods[0])) if table.endswith(".csv")]

In [3]:
foods

['patate bollite senza buccia',
 'menta',
 'castagne arrosto',
 'cocktail frutta sciroppata',
 'agnello cotto al forno',
 'occhiata',
 'uova tuorlo',
 'pizza pomodoro',
 'marsala uovo',
 'muesli',
 'barrettacoccocioccolato',
 'pere',
 'salame_napoli',
 'pecorino romano',
 'salsiccia fegato',
 'spigola',
 'bovino adulto punta di petto',
 'latte evaporato',
 'salmone salamoia',
 'cavolo cappuccio verde crudo',
 'mele golden',
 'margarina animale vegetale',
 'latte polverescremato',
 'finocchi crudi',
 'rana',
 'trippa bovino',
 'storione uova caviale',
 'ananas sciroppato',
 'merluzzo baccala secco',
 'boga',
 'pancetta_arrotolata',
 'salame napoli',
 'fiocchi formaggio',
 'pomodori san marzano',
 'cavolo broccolo verde ramoso crudo',
 'pizza bianca',
 'trota irridea',
 'castagne secche',
 'aringa fresca',
 'radicchio verde',
 'yogurt scremato',
 'salmone fresco',
 'gorgonzola',
 'strutto',
 'sarda',
 'crema nocciole',
 'pollo fuso cotto',
 'cotechino_modena_igp_cotto',
 'farina riso',
 

In [4]:
def parse_value(value):
    if pd.isna(value):
        return np.nan
    elif "%" in value:
        if value[:-1] == "":
            return np.nan
        return float(value[:-1].replace(",","."))
    elif "µg" in value:
        return float(value[:-2].replace(",","."))  / (1e6)
    elif "mg" in value:
        return float(value[:-2].replace(",",".")) / 1000
    elif "g" in value:
        return float(value[:-1].replace(",","."))
    elif value.replace('.','',1).isdigit():
        return float(value)
    elif value.lower() == "tr":
        return 0
    else:
        return np.nan

# Protein Standardization

In [5]:
def parse_proteins(food):
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=1)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df.iloc[0,3] = df.iloc[0][2] # set the formatted percentage in the right cell
    df = df.iloc[:,[1,3]]    # drop the useless columns
    df = df.set_index("0")

    parsed_values = []
    renamer = {}
    for name, (value) in df.iterrows():
        value = value[0]

        if "%" in name or (not pd.isna(value) and "%" in value):
            renamer[name] = name.lower() + " | %"
        else:
            renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))

    df = df.rename(columns={"2":"1"})
    df = df.rename(index=renamer) # rename the columns
    df.iloc[:,[0]] = np.array(parsed_values).reshape((len(df),1)) # update the dataframe with the new parsed data
    return df

# General Standardization

In [6]:
def parse_general(food):
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=foods[8], table=0)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df = df.iloc[:,[1,2]] # Drop the RDA col
    df = df.set_index("0") # Set the first col as the index
    parsed_values = []
    renamer = {}
    for name, (value) in df.iterrows():
        value = value[0]

        if "%" in name or (not pd.isna(value) and "%" in value):
            renamer[name] = name.lower() + " | %"
        else:
            renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))


    df = df.rename(index=renamer) # rename the columns
    df.iloc[:,[0]] = np.array(parsed_values).reshape((len(df),1)) # update the dataframe with the new parsed data
    return df

# Fat Standardization

In [7]:
def parse_fats(food):
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=foods[8], table=2)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df = df.iloc[:,[1,2]] # Drop the index col
    df = df.set_index("0") # Set the first col as the index

    parsed_values = []
    renamer = {}
    for name, (value) in df.iterrows():
        value = value[0]


        if "%" in name or (not pd.isna(value) and "%" in value):    
            if name[0] == "C": # Grassi
                renamer[name] = name.lower().replace(" ",":") + " | %"
            else:
                renamer[name] = name.lower() + " | %"
        else: 
            if name[0] == "C": # Grassi
                renamer[name] = name.lower().replace(" ",":") + " | g"
            else:
                renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))


    df = df.rename(index=renamer) # rename the columns
    df.iloc[:,[0]] = np.array(parsed_values).reshape((len(df),1)) # update the dataframe with the new parsed data
    return df

# Tables Mergings

In [30]:
proteins, general, fats = parse_proteins(foods[8]), parse_general(foods[8]), parse_fats(foods[8])
row = pd.concat([general,proteins,fats]).T
row = row.rename({"1":foods[8]})
row = row.iloc[:,~row.columns.duplicated()]
del row.columns.name
row

Unnamed: 0,parte edibile | %,acqua | g,carboidrati disponibili | g,carboidrati complessi | g,zuccheri solubili | g,proteine | g,grassi (lipidi) | g,saturi totali | g,monoinsaturi totali | g,polinsaturi totali | g,...,c20:2 | g,c20:3 | g,c20:4 | g,c20:5 | g,c22:0 | g,c22:1 | g,c22:6 | g,c4:0:c10:0 | g,nitrati | g,nitriti | g
marsala uovo,100.0,72.8,12.4,0.0,12.4,0.0,0.0,0.0,0.0,0.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,,


In [31]:
list(row.columns) == list(set(row.columns))

False

In [32]:
a = list(row.columns)
a.sort()

for c,d  in zip(a[:-1],a[1:]):
    if c == d:
        print(c)

In [33]:
a

['acido aspartico | g',
 'acido glutamico | g',
 'acqua | g',
 'alanina | g',
 'alcol (g) | g',
 'aminoacido limitante | g',
 'arginina | g',
 'c12:0 | g',
 'c14:0 | g',
 'c14:1 | g',
 'c15:0 | g',
 'c16:0 | g',
 'c16:1 | g',
 'c17:0 | g',
 'c17:1 | g',
 'c18:0 | g',
 'c18:1 | g',
 'c18:2 | g',
 'c18:3 | g',
 'c20:0 | g',
 'c20:1 | g',
 'c20:2 | g',
 'c20:3 | g',
 'c20:4 | g',
 'c20:5 | g',
 'c22:0 | g',
 'c22:1 | g',
 'c22:6 | g',
 'c4:0:c10:0 | g',
 'calcio | g',
 'carboidrati complessi | g',
 'carboidrati disponibili | g',
 'cistina | g',
 'colesterolo | g',
 'fenilalanina | g',
 'ferro | g',
 'fibra insolubile | g',
 'fibra solubile | g',
 'fibra totale | g',
 'fosforo | g',
 'glicina | g',
 'grassi (lipidi) | g',
 'indice chimico | g',
 'isoleucina | g',
 'istidina | g',
 'leucina | g',
 'lipidi totali | g',
 'lisina | g',
 'magnesio | g',
 'manganese | g',
 'metionina | g',
 'monoinsaturi totali | g',
 'niacina (vit. b3 o pp) | g',
 'nitrati | g',
 'nitriti | g',
 'parte edibile 