In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

In [2]:
path = "../raw/www.my-personaltrainer.it"
foods = [food for food in os.listdir(path) if food[0] != "."]
tables = [table.split(".csv")[0] for table in os.listdir("{path}/{food}".format(path=path, food=foods[0])) if table.endswith(".csv")]

# Functions

In [3]:
def parse_value(value):
    if pd.isna(value):
        return np.nan
    elif "%" in value:
        if value[:-1] == "":
            return np.nan
        return float(value[:-1].replace(",","."))
    elif "µg" in value:
        return float(value[:-2].replace(",","."))  / (1e6)
    elif "mg" in value:
        return float(value[:-2].replace(",",".")) / 1000
    elif "g" in value:
        return float(value[:-1].replace(",","."))
    elif value.replace('.','',1).isdigit():
        return float(value)
    elif value.lower() == "tr":
        return 0
    else:
        return np.nan

In [4]:
def parse_proteins(food):
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=1)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df.iloc[0,3] = df.iloc[0][2] # set the formatted percentage in the right cell
    df = df.iloc[:,[1,3]]    # drop the useless columns
    df = df.set_index("0")

    parsed_values = []
    renamer = {}
    for name, (value) in df.iterrows():
        value = value[0]

        if "%" in name or (not pd.isna(value) and "%" in value):
            renamer[name] = name.lower() + " | %"
        else:
            renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))

    df = df.rename(columns={"2":"1"})
    df = df.rename(index=renamer) # rename the columns
    df.iloc[:,[0]] = np.array(parsed_values).reshape((len(df),1)) # update the dataframe with the new parsed data
    return df

In [5]:
def parse_general(food):
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=0)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df = df.iloc[:,[1,2]] # Drop the RDA col
    df = df.set_index("0") # Set the first col as the index
    parsed_values = []
    renamer = {}
    for name, (value) in df.iterrows():
        value = value[0]

        if "%" in name or (not pd.isna(value) and "%" in value):
            renamer[name] = name.lower() + " | %"
        else:
            renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))


    df = df.rename(index=renamer) # rename the columns
    df.iloc[:,[0]] = np.array(parsed_values).reshape((len(df),1)) # update the dataframe with the new parsed data
    return df

In [6]:
def parse_fats(food):
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=2)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df = df.iloc[:,[1,2]] # Drop the index col
    df = df.set_index("0") # Set the first col as the index

    parsed_values = []
    renamer = {}
    for name, (value) in df.iterrows():
        value = value[0]


        if "%" in name or (not pd.isna(value) and "%" in value):    
            if name[0] == "C": # Grassi
                renamer[name] = name.lower().replace(" ",":") + " | %"
            else:
                renamer[name] = name.lower() + " | %"
        else: 
            if name[0] == "C": # Grassi
                renamer[name] = name.lower().replace(" ",":") + " | g"
            else:
                renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))


    df = df.rename(index=renamer) # rename the columns
    df.iloc[:,[0]] = np.array(parsed_values).reshape((len(df),1)) # update the dataframe with the new parsed data
    return df

# Merge all the data

In [7]:
columns = set([])

In [8]:
for food in foods:
    proteins, general, fats = parse_proteins(food), parse_general(food), parse_fats(food)
    
    single_row = pd.concat([general,proteins,fats]).T
    single_row = single_row.rename({"1":food})
    single_row = single_row.iloc[:,~single_row.columns.duplicated()]
    del single_row.columns.name
    columns |= set(single_row.columns)

In [9]:
final_df = pd.DataFrame(columns=columns)

In [10]:
for food in foods:
    proteins, general, fats = parse_proteins(food), parse_general(food), parse_fats(food)
    
    single_row = pd.concat([general,proteins,fats]).T
    single_row = single_row.rename({"1":food})
    single_row = single_row.iloc[:,~single_row.columns.duplicated()]
    del single_row.columns.name
    # Add to the main csv
    final_df = pd.concat([final_df,single_row],axis=0,sort=True)

In [11]:
final_df.to_csv("../csv/my_personal_trainer.csv")

In [12]:
final_df

Unnamed: 0,acido aspartico | g,acido glutamico | g,acqua | g,alanina | g,alcol (g) | g,aminoacido limitante | g,arginina | g,c12:0 | g,c14:0 | g,c14:1 | g,...,treonina | g,triptofano | g,valina | g,vitamina a retinolo eq. | g,vitamina b12 | g,vitamina b6 | g,vitamina c | g,vitamina e | g,zinco | g,zuccheri solubili | g
patate bollite senza buccia,,,78.5,,0.0,,,,,,...,,,,0.000003,,,0.008,,0.00124,0.4
menta,,,86.4,,0.0,,,,,,...,,,,0.000123,,,0.031,,,5.3
castagne arrosto,,,42.4,,0.0,,,,,,...,,,,0.000000,,,,,,10.7
cocktail frutta sciroppata,,,81.8,,0.0,,,0.00,,0.00,...,,,,,,,0.004,,,14.8
agnello cotto al forno,,,42.0,,0.0,,,,,,...,,,,,,,0.000,,0.00582,0.0
occhiata,,,77.0,,0.0,,,0.00,0.12,0.00,...,,,,,,,0.000,,0.00140,2.0
uova tuorlo,,,,,0.0,,,,,,...,,,,,,,0.000,,,
pizza pomodoro,5.44,30.69,41.0,3.24,0.0,,3.72,,,,...,3.08,0.97,5.01,,,,0.000,,0.00046,2.9
marsala uovo,,,72.8,,14.8,,,0.00,0.00,0.00,...,,,,,,,0.000,,,12.4
muesli,,,7.4,,0.0,,,0.00,0.17,0.00,...,,,,,,,,,,26.2
