In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import re
from multiprocessing import Pool, cpu_count
# Matches one parenthesised group, e.g. "(...)"; the parse_* functions pass it
# to re.sub to strip such groups from the row labels.
regex = r"(\([^\)]+\))"

In [2]:
path = "../raw/www.my-personaltrainer.it"

# One sub-directory per food. os.listdir returns entries in arbitrary order, so
# sort them to keep the output reproducible, and skip hidden entries as well as
# stray files that are not food directories.
foods = sorted(
    food
    for food in os.listdir(path)
    if not food.startswith(".") and os.path.isdir(os.path.join(path, food))
)

# Names (without extension) of the CSV tables found for the first food;
# left empty when no food directory exists instead of failing on foods[0].
tables = sorted(
    table.split(".csv")[0]
    for table in (os.listdir("{path}/{food}".format(path=path, food=foods[0])) if foods else [])
    if table.endswith(".csv")
)

# Functions

In [3]:
def parse_value(value):
    """Convert one scraped table cell into a float.

    Weights are normalised to grams (``"250mg"`` -> ``0.25``,
    ``"5µg"`` -> ``5e-06``); percentages are returned as the bare number
    (``"12,5%"`` -> ``12.5``). Decimal commas are accepted everywhere.

    Parameters
    ----------
    value : str, number or NaN
        Raw cell content such as ``"1,2g"``, ``"30 mg"``, ``"8%"``, ``"tr"``,
        an already-numeric cell, or a missing value.

    Returns
    -------
    float
        The parsed number, ``0.0`` for ``"tr"``, or ``np.nan`` when the cell
        is missing or cannot be interpreted. The function never raises on
        malformed text, so a single odd cell cannot abort a whole run.
    """
    if pd.isna(value):
        return np.nan
    # Columns that pandas already parsed as numeric arrive as numbers, not text.
    if isinstance(value, (int, float, np.number)):
        return float(value)

    text = str(value).strip()
    if text.lower() == "tr":  # presumably "traces" -- counted as zero
        return 0.0

    def to_float(number_text):
        # Decimal comma -> decimal point; anything unparsable becomes NaN.
        try:
            return float(number_text.replace(",", "."))
        except ValueError:
            return np.nan

    # (suffix, divisor to grams). More specific suffixes come first so that
    # "mg" and the micro-gram spellings are not mistaken for plain "g".
    units = (
        ("%", 1),
        ("µg", 1e6),        # U+00B5 MICRO SIGN
        ("\u03bcg", 1e6),   # U+03BC GREEK SMALL LETTER MU, visually identical
        ("mcg", 1e6),
        ("mg", 1000),
        ("g", 1),
    )
    for suffix, divisor in units:
        if text.endswith(suffix):
            return to_float(text[: -len(suffix)]) / divisor

    # Bare number: digits with at most one decimal separator.
    plain = text.replace(",", ".")
    if plain.replace(".", "", 1).isdigit():
        return to_float(plain)
    return np.nan

In [4]:
def parse_proteins(food):
    """Load table ``1.csv`` of ``food`` and return it with parsed values.

    Going by the function name this is presumably the protein table; the code
    itself only relies on the file name ``1.csv``.

    Parameters
    ----------
    food : str
        Name of the food's sub-directory inside the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        One row per table entry. The index holds the normalised label
        (lower-cased, parenthesised groups removed) followed by ``" | %"``
        for percentages or ``" | g"`` otherwise; the single column, renamed
        from ``"2"`` to ``"1"``, holds the result of ``parse_value``.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=1)
    df = pd.read_csv(file_path)
    # Drop the title row; copy so the cell write below targets our own frame.
    df = df.iloc[1:].copy()
    # Move the formatted percentage of the first row into the value column.
    # Pure positional access: the label-based form row[2] is deprecated.
    df.iloc[0, 3] = df.iloc[0, 2]
    df = df.iloc[:, [1, 3]]  # keep only the label and value columns
    df = df.set_index("0")

    parsed_values = []
    renamer = {}
    for old_name, value in df.iloc[:, 0].items():
        name = old_name.lower().strip()
        # count/flags as keywords: passing them positionally is deprecated.
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        # Only text cells can carry a "%" sign; numeric/missing cells cannot.
        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        renamer[old_name] = name + (" | %" if is_percentage else " | g")

        parsed_values.append(parse_value(value))

    df = df.rename(columns={"2": "1"})
    df = df.rename(index=renamer)  # rename the row labels
    # Replace the raw cell text with the parsed numbers.
    df.iloc[:, [0]] = np.array(parsed_values).reshape((len(df), 1))
    return df

In [5]:
def parse_general(food):
    """Load table ``0.csv`` of ``food`` and return it with parsed values.

    Parameters
    ----------
    food : str
        Name of the food's sub-directory inside the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        One row per table entry. The index holds the normalised label
        (lower-cased, parenthesised groups removed) followed by ``" | %"``
        for percentages or ``" | g"`` otherwise; the single remaining column
        holds the result of ``parse_value``.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=0)
    df = pd.read_csv(file_path)
    df = df.iloc[1:]  # drop the title row
    df = df.iloc[:, [1, 2]]  # keep label and value, dropping the RDA column
    df = df.set_index("0")  # the label column becomes the index

    parsed_values = []
    renamer = {}
    # Iterate over the value column directly instead of indexing each row
    # with value[0], whose positional meaning is deprecated in pandas.
    for old_name, value in df.iloc[:, 0].items():
        name = old_name.lower().strip()
        # count/flags as keywords: passing them positionally is deprecated.
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        # Only text cells can carry a "%" sign; numeric/missing cells cannot.
        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        renamer[old_name] = name + (" | %" if is_percentage else " | g")

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer)  # rename the row labels
    # Replace the raw cell text with the parsed numbers.
    df.iloc[:, [0]] = np.array(parsed_values).reshape((len(df), 1))
    return df

In [6]:
def parse_fats(food):
    """Load table ``2.csv`` of ``food`` and return it with parsed values.

    Going by the function name this is presumably the fat table; the code
    itself only relies on the file name ``2.csv``.

    Parameters
    ----------
    food : str
        Name of the food's sub-directory inside the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        One row per table entry. The index holds the normalised label
        followed by ``" | %"`` for percentages or ``" | g"`` otherwise; the
        single remaining column holds the result of ``parse_value``. Labels
        starting with ``"C"`` get their spaces replaced by ``":"`` before
        normalisation.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=2)
    df = pd.read_csv(file_path)
    df = df.iloc[1:]  # drop the title row
    df = df.iloc[:, [1, 2]]  # keep label and value, dropping the index column
    df = df.set_index("0")  # the label column becomes the index

    parsed_values = []
    renamer = {}
    # Iterate over the value column directly instead of indexing each row
    # with value[0], whose positional meaning is deprecated in pandas.
    for old_name, value in df.iloc[:, 0].items():
        name = old_name
        # Fats: labels starting with "C" have their spaces turned into ":"
        # (presumably fatty-acid codes). startswith also tolerates "".
        if name.startswith("C"):
            name = name.replace(" ", ":")

        name = name.lower().strip()
        # count/flags as keywords: passing them positionally is deprecated.
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        # Only text cells can carry a "%" sign; numeric/missing cells cannot.
        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        renamer[old_name] = name + (" | %" if is_percentage else " | g")

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer)  # rename the row labels
    # Replace the raw cell text with the parsed numbers.
    df.iloc[:, [0]] = np.array(parsed_values).reshape((len(df), 1))
    return df

In [7]:
def parse_category(food):
    """Read the category of ``food`` from its ``metadata.json``.

    Parameters
    ----------
    food : str
        Name of the food's sub-directory inside the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        A 1x1 frame with row label ``"category"`` and column ``"1"`` holding
        the ``"category"`` entry of the metadata; the columns axis is named
        ``"0"``.
    """
    file_path = "{path}/{name}/metadata.json".format(path=path, name=food)
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding, which differs between operating systems.
    with open(file_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    df = pd.DataFrame(metadata["category"], index=["category"], columns=["1"])
    df.columns.name = "0"
    return df

# Merge all the data

In [8]:
def get_single_row(food):
    """Build the one-row frame describing ``food``.

    Stacks the general, protein, fat and category tables, transposes the
    result so every label becomes a column, and labels the row with ``food``.

    Parameters
    ----------
    food : str
        Name of the food's sub-directory.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``food``; when a label occurs in more than
        one table only its first occurrence is kept. The columns axis has no
        name.
    """
    proteins = parse_proteins(food)
    general = parse_general(food)
    fats = parse_fats(food)
    category = parse_category(food)

    single_row = pd.concat([general, proteins, fats, category]).T
    single_row = single_row.rename(index={"1": food})
    single_row = single_row.iloc[:, ~single_row.columns.duplicated()]
    # Index.name is a property without a deleter in current pandas, so
    # `del columns.name` raises AttributeError; assigning None clears it.
    single_row.columns.name = None
    return single_row

In [None]:
# Parse every food in a worker pool sized to the CPU count; imap keeps the
# input order and lets tqdm show progress as rows arrive.
with Pool(processes=cpu_count()) as p:
    final_df = pd.concat(
        [row for row in tqdm(p.imap(get_single_row, foods), total=len(foods))],
        sort=True,
    )

HBox(children=(IntProgress(value=0, max=706), HTML(value='')))




In [None]:
# Columns filled in for fewer than 5% of the foods.
# The per-column mean is requested explicitly: np.mean(DataFrame) maps to
# DataFrame.mean(axis=None), which returns one scalar on pandas >= 2.0.
final_df.columns[final_df.notna().mean(axis=0) < 0.05]

In [None]:
# Drop the columns filled in for fewer than 5% of the foods.
# The per-column mean is requested explicitly: np.mean(DataFrame) maps to
# DataFrame.mean(axis=None), which returns one scalar on pandas >= 2.0.
final_df = final_df.drop(columns=final_df.columns[final_df.notna().mean(axis=0) < 0.05])
final_df.index.name = "name"

In [None]:
# Cast every column except the textual "category" to float.
# - The builtin float replaces np.float, an alias removed in NumPy 1.24.
# - astype returns properly typed columns, whereas assigning through .iloc
#   writes into the existing object column and keeps its dtype on recent pandas.
final_df = final_df.astype({col: float for col in final_df.columns if col != "category"})

In [None]:
# Persist the merged table; the index (food name) is written as the first column.
final_df.to_csv(path_or_buf="../csv/my_personal_trainer.csv")

In [None]:
# Summary statistics of the final table, shown as the cell's output.
final_df.describe()