In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

In [2]:
# Root folder of the scraped site: one sub-folder per food.
path = "../raw/www.my-personaltrainer.it"

# Every entry of the root folder whose name does not start with a dot.
foods = [
    food
    for food in os.listdir(path)
    if food[0] != "."
]

# Names (without the ".csv" extension) of the CSV tables found in the first food folder.
tables = [
    table.split(".csv")[0]
    for table in os.listdir("{path}/{food}".format(path=path, food=foods[0]))
    if table.endswith(".csv")
]

# Functions

In [3]:
def parse_value(value):
    """Convert one raw table cell into a number.

    Recognised formats (``,`` and ``.`` are both accepted as decimal separator):

    * ``"12,5%"``  -> ``12.5`` (the percentage figure itself)
    * ``"5µg"``    -> value divided by 1e6
    * ``"300mg"``  -> value divided by 1000
    * ``"1,2g"``   -> value unchanged
    * ``"12,5"`` / ``"12.5"`` / ``"12"`` -> plain number
    * ``"tr"`` (any case) -> ``0``

    Parameters
    ----------
    value : str, number or NaN
        Raw cell content.

    Returns
    -------
    float or int
        The parsed number, ``0`` for ``"tr"``, or ``np.nan`` when the cell is
        missing or cannot be interpreted.
    """
    def to_number(text):
        # A unit without a numeric part (e.g. "%" or "g" alone) yields NaN
        # instead of aborting the whole parsing run.
        try:
            return float(text.replace(",", "."))
        except ValueError:
            return np.nan

    if pd.isna(value):
        return np.nan

    if not isinstance(value, str):
        # Cell is not text (e.g. it was already read as a number).
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    text = value.strip()

    # Unit suffixes are checked from the most specific to the least specific,
    # because "g" is also contained in "mg" and in the microgram spellings.
    if "%" in text:
        return to_number(text[:-1])
    # Micro sign (U+00B5) and Greek small mu (U+03BC) render identically.
    if "\u00b5g" in text or "\u03bcg" in text:
        return to_number(text[:-2]) / (1e6)
    if "mg" in text:
        return to_number(text[:-2]) / 1000
    if "g" in text:
        return to_number(text[:-1])

    # Plain number: accept the decimal comma here as well, like the unit branches do.
    plain = text.replace(",", ".")
    if plain.replace(".", "", 1).isdigit():
        return to_number(plain)

    if text.lower() == "tr":
        return 0

    return np.nan

In [4]:
def parse_proteins(food):
    """Load table ``1.csv`` of ``food`` and turn its values into numbers.

    Parameters
    ----------
    food : str
        Name of the food folder inside ``path``.

    Returns
    -------
    pandas.DataFrame
        A single column named ``"1"`` with the values returned by
        ``parse_value``; the index holds the lower-cased row label followed
        by ``" | %"`` or ``" | g"``.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=1)
    # .copy() so the cell assignment below writes to an independent frame.
    df = pd.read_csv(file_path).iloc[1:].copy()  # drop the titles
    # Explicit positional access: integer keys on a label-indexed Series
    # (``df.iloc[0][2]``) are deprecated in recent pandas.
    df.iloc[0, 3] = df.iloc[0, 2]  # set the formatted percentage in the right cell
    df = df.iloc[:, [1, 3]]  # drop the useless columns
    df = df.set_index("0")

    parsed_values = []
    renamer = {}
    for name, row in df.iterrows():
        value = row.iloc[0]  # the frame has exactly one value column

        # A row is a percentage when either its label or its value says so.
        if "%" in name or (not pd.isna(value) and "%" in value):
            renamer[name] = name.lower() + " | %"
        else:
            renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))

    # Use the value-column name "1" (this table keeps column "2" instead).
    df = df.rename(columns={"2": "1"})
    df = df.rename(index=renamer)  # rename the rows
    # Replace the text column with the parsed numbers as a whole, instead of
    # writing floats in place into a column that holds text.
    df[df.columns[0]] = parsed_values
    return df

In [5]:
def parse_general(food):
    """Load table ``0.csv`` of ``food`` and turn its values into numbers.

    Parameters
    ----------
    food : str
        Name of the food folder inside ``path``.

    Returns
    -------
    pandas.DataFrame
        A single value column with the values returned by ``parse_value``;
        the index holds the lower-cased row label followed by ``" | %"`` or
        ``" | g"``.
    """
    # Read the table of the requested food (previously the food at position 8
    # of ``foods`` was always loaded, whatever the argument).
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=0)
    df = pd.read_csv(file_path)
    df = df.iloc[1:]  # drop the titles
    df = df.iloc[:, [1, 2]]  # drop the RDA col
    df = df.set_index("0")  # set the first col as the index

    parsed_values = []
    renamer = {}
    for name, row in df.iterrows():
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated in recent pandas.
        value = row.iloc[0]

        # A row is a percentage when either its label or its value says so.
        if "%" in name or (not pd.isna(value) and "%" in value):
            renamer[name] = name.lower() + " | %"
        else:
            renamer[name] = name.lower() + " | g"

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer)  # rename the rows
    # Replace the text column with the parsed numbers as a whole, instead of
    # writing floats in place into a column that holds text.
    df[df.columns[0]] = parsed_values
    return df

In [6]:
def parse_fats(food):
    """Load table ``2.csv`` of ``food`` and turn its values into numbers.

    Parameters
    ----------
    food : str
        Name of the food folder inside ``path``.

    Returns
    -------
    pandas.DataFrame
        A single value column with the values returned by ``parse_value``;
        the index holds the lower-cased row label followed by ``" | %"`` or
        ``" | g"``. Labels starting with ``"C"`` additionally get their
        spaces replaced by ``":"``.
    """
    # Read the table of the requested food (previously the food at position 8
    # of ``foods`` was always loaded, whatever the argument).
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=2)
    df = pd.read_csv(file_path)
    df = df.iloc[1:]  # drop the titles
    df = df.iloc[:, [1, 2]]  # drop the index col
    df = df.set_index("0")  # set the first col as the index

    parsed_values = []
    renamer = {}
    for name, row in df.iterrows():
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated in recent pandas.
        value = row.iloc[0]

        label = name.lower()
        if name.startswith("C"):  # fats: join the parts of the code with ":"
            label = label.replace(" ", ":")

        # A row is a percentage when either its label or its value says so.
        if "%" in name or (not pd.isna(value) and "%" in value):
            renamer[name] = label + " | %"
        else:
            renamer[name] = label + " | g"

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer)  # rename the rows
    # Replace the text column with the parsed numbers as a whole, instead of
    # writing floats in place into a column that holds text.
    df[df.columns[0]] = parsed_values
    return df

# Merge all the data

In [7]:
# Accumulator for the union of column labels seen across all foods.
columns = set()

In [9]:
# First pass over the foods: collect the union of all column labels.
for food in foods:
    proteins, general, fats = parse_proteins(food), parse_general(food), parse_fats(food)

    # Stack the three tables and transpose them into a single row for this food.
    single_row = pd.concat([general, proteins, fats]).T
    single_row = single_row.rename({"1": food})
    # Keep only the first occurrence of a label that appears in several tables.
    single_row = single_row.iloc[:, ~single_row.columns.duplicated()]
    # Clear the leftover axis name. Assigning None works on every pandas
    # version, while ``del`` fails where ``name`` is a property without deleter.
    single_row.columns.name = None
    columns |= set(single_row.columns)

In [10]:
# Empty frame with every known column. The set is turned into a sorted list
# because a set has no defined order and newer pandas reportedly rejects a set here.
final_df = pd.DataFrame(columns=sorted(columns))

In [11]:
# Second pass over the foods: build one row per food, then merge all rows at once.
# Collecting the rows first avoids re-copying the growing frame on every iteration.
rows = [final_df]
for food in foods:
    proteins, general, fats = parse_proteins(food), parse_general(food), parse_fats(food)

    # Stack the three tables and transpose them into a single row for this food.
    single_row = pd.concat([general, proteins, fats]).T
    single_row = single_row.rename({"1": food})
    # Keep only the first occurrence of a label that appears in several tables.
    single_row = single_row.iloc[:, ~single_row.columns.duplicated()]
    # Clear the leftover axis name. Assigning None works on every pandas
    # version, while ``del`` fails where ``name`` is a property without deleter.
    single_row.columns.name = None
    rows.append(single_row)

# Add to the main csv
final_df = pd.concat(rows, axis=0, sort=True)

In [13]:
# Create the output folder if needed so the export also works on a fresh checkout.
os.makedirs("../csv", exist_ok=True)
final_df.to_csv("../csv/my_personal_trainer.csv")