In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import re

# Matches one parenthesised chunk: an opening "(", one or more characters
# other than ")", then the closing ")". The parse_* functions below pass this
# pattern to re.sub to strip such chunks from row names.
regex = r"(\([^\)]+\))"

In [2]:
# Root folder of the scraped site: it is listed below to get one entry per food.
path = "../raw/www.my-personaltrainer.it"

# Every entry of `path` whose name does not start with a dot counts as a food.
foods = [entry for entry in os.listdir(path) if entry[0] != "."]

# Names (with the ".csv" part split off) of the CSV files found in the folder
# of the first food.
tables = [
    filename.split(".csv")[0]
    for filename in os.listdir("{path}/{food}".format(path=path, food=foods[0]))
    if filename.endswith(".csv")
]

# Functions

In [3]:
# The two code points commonly rendered as the "micro" prefix:
# U+00B5 (micro sign) and U+03BC (Greek small letter mu).
_MICRO_PREFIXES = ("\u00b5", "\u03bc")


def _to_number(text):
    """Convert a decimal string (comma or dot separator) to float; NaN if blank."""
    text = text.strip().replace(",", ".")
    if text == "":
        return np.nan
    return float(text)


def parse_value(value):
    """Convert one raw table cell into a float.

    Rules, checked in this order:

    * missing values (``pd.isna``) -> NaN
    * real numbers -> returned as ``float``
    * text containing ``%`` -> the number before the last character
    * text containing a micro prefix + ``g`` -> number / 1e6
    * text containing ``mg`` -> number / 1000
    * text containing ``g`` -> number unchanged
    * a bare number (comma or dot as decimal separator) -> that number
    * ``"tr"`` in any letter case -> 0.0 (presumably "trace" - confirm against the data)
    * anything else -> NaN

    The unit is assumed to be the trailing characters of the text, so the
    numeric part is whatever precedes the last one or two characters. A unit
    with nothing in front of it yields NaN.

    Parameters
    ----------
    value : str, number or missing value
        The raw cell content.

    Returns
    -------
    float
        The parsed number, or NaN when nothing could be extracted.

    Raises
    ------
    ValueError
        If a unit is found but the text in front of it is not a number.
    """
    if pd.isna(value):
        return np.nan
    # Cells of an all-numeric CSV column arrive as numbers, not as text.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if not isinstance(value, str):
        return np.nan

    text = value.strip()
    if "%" in text:
        return _to_number(text[:-1])
    # Micrograms must be tested before "mg" and "g", which they also contain.
    if any(prefix + "g" in text for prefix in _MICRO_PREFIXES):
        return _to_number(text[:-2]) / (1e6)
    if "mg" in text:
        return _to_number(text[:-2]) / 1000
    if "g" in text:
        return _to_number(text[:-1])

    # Bare number: accept the comma decimal separator like the branches above.
    plain = text.replace(",", ".")
    if plain.replace(".", "", 1).isdigit():
        return float(plain)
    if text.lower() == "tr":
        return 0.0
    return np.nan

In [4]:
def parse_proteins(food):
    """Load and normalise table ``1.csv`` of one food.

    The file is read from ``{path}/{food}/1.csv``. The first row (titles) is
    dropped, the value in position 2 of the first remaining row is copied to
    position 3 of that row, and only the columns at positions 1 and 3 are
    kept; the column labelled ``"0"`` becomes the index.

    Each index label is lower-cased, stripped, cleaned of parenthesised
    chunks (module-level ``regex``) and suffixed with ``" | %"`` when the
    label or its textual value contains ``%``, otherwise with ``" | g"``.
    Each value is converted with ``parse_value``.

    Parameters
    ----------
    food : str
        Name of the food folder under ``path``.

    Returns
    -------
    pandas.DataFrame
        One float column labelled ``"1"``, indexed by the cleaned names.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=1)
    df = pd.read_csv(file_path)
    df = df.iloc[1:].copy()  # Drop the titles; copy so the write below targets our own frame
    df.iloc[0, 3] = df.iloc[0, 2]  # set the formatted percentage in the right cell
    df = df.iloc[:, [1, 3]]  # drop the useless columns
    df = df.set_index("0")

    parsed_values = []
    renamer = {}
    for old_name, row in df.iterrows():
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated.
        value = row.iloc[0]

        name = old_name.lower().strip()
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        # Only text can carry a "%" marker; missing cells never match.
        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        renamer[old_name] = name + (" | %" if is_percentage else " | g")

        parsed_values.append(parse_value(value))

    df = df.rename(columns={"2": "1"})
    df = df.rename(index=renamer)  # rename the rows
    # Replace the column instead of writing through iloc, which on recent
    # pandas sets in place and would keep the object dtype.
    df[df.columns[0]] = np.array(parsed_values, dtype=float)
    return df

In [5]:
def parse_general(food):
    """Load and normalise table ``0.csv`` of one food.

    The file is read from ``{path}/{food}/0.csv``. The first row (titles) is
    dropped and only the columns at positions 1 and 2 are kept; the column
    labelled ``"0"`` becomes the index.

    Each index label is lower-cased, stripped, cleaned of parenthesised
    chunks (module-level ``regex``) and suffixed with ``" | %"`` when the
    label or its textual value contains ``%``, otherwise with ``" | g"``.
    Each value is converted with ``parse_value``.

    Parameters
    ----------
    food : str
        Name of the food folder under ``path``.

    Returns
    -------
    pandas.DataFrame
        One float column, indexed by the cleaned names.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=0)
    df = pd.read_csv(file_path)
    df = df.iloc[1:]  # Drop the titles
    df = df.iloc[:, [1, 2]]  # Drop the RDA col
    df = df.set_index("0")  # Set the first col as the index

    parsed_values = []
    renamer = {}
    for old_name, row in df.iterrows():
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated.
        value = row.iloc[0]

        name = old_name.lower().strip()
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        # Only text can carry a "%" marker; missing cells never match.
        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        renamer[old_name] = name + (" | %" if is_percentage else " | g")

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer)  # rename the rows
    # Replace the column instead of writing through iloc, which on recent
    # pandas sets in place and would keep the object dtype.
    df[df.columns[0]] = np.array(parsed_values, dtype=float)
    return df

In [6]:
def parse_fats(food):
    """Load and normalise table ``2.csv`` of one food.

    The file is read from ``{path}/{food}/2.csv``. The first row (titles) is
    dropped and only the columns at positions 1 and 2 are kept; the column
    labelled ``"0"`` becomes the index.

    Labels starting with an upper-case ``"C"`` get every space replaced by
    ``":"`` before the common clean-up. Each label is then lower-cased,
    stripped, cleaned of parenthesised chunks (module-level ``regex``) and
    suffixed with ``" | %"`` when the label or its textual value contains
    ``%``, otherwise with ``" | g"``. Each value is converted with
    ``parse_value``.

    Parameters
    ----------
    food : str
        Name of the food folder under ``path``.

    Returns
    -------
    pandas.DataFrame
        One float column, indexed by the cleaned names.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=2)
    df = pd.read_csv(file_path)
    df = df.iloc[1:]  # Drop the titles
    df = df.iloc[:, [1, 2]]  # Drop the index col
    df = df.set_index("0")  # Set the first col as the index

    parsed_values = []
    renamer = {}
    for old_name, row in df.iterrows():
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated.
        value = row.iloc[0]

        name = old_name
        # Fats: presumably fatty-acid codes such as "C16 0" -> "C16:0".
        # NOTE(review): any other label starting with "C" is rewritten the
        # same way - confirm that is intended.
        # startswith() also tolerates an empty label, unlike name[0].
        if name.startswith("C"):
            name = name.replace(" ", ":")

        name = name.lower().strip()
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        # Only text can carry a "%" marker; missing cells never match.
        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        renamer[old_name] = name + (" | %" if is_percentage else " | g")

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer)  # rename the rows
    # Replace the column instead of writing through iloc, which on recent
    # pandas sets in place and would keep the object dtype.
    df[df.columns[0]] = np.array(parsed_values, dtype=float)
    return df

# Merge all the data

In [7]:
# Start from an empty frame; the merge loop in the next cell populates `final_df`.
final_df = pd.DataFrame()

In [8]:
# Build one single-row frame per food, then stack them in one concat.
# Collecting first avoids re-copying the growing table on every iteration and
# makes the cell idempotent: re-running it rebuilds `final_df` from scratch.
food_rows = []
for food in tqdm(foods):
    proteins, general, fats = parse_proteins(food), parse_general(food), parse_fats(food)

    # Stack the three tables, then transpose so the nutrients become columns.
    single_row = pd.concat([general, proteins, fats]).T
    single_row = single_row.rename({"1": food})
    # Keep only the first occurrence of a nutrient found in several tables.
    single_row = single_row.iloc[:, ~single_row.columns.duplicated()]
    # `Index.name` cannot be deleted on current pandas; clear it instead.
    single_row.columns.name = None
    food_rows.append(single_row)

if food_rows:
    # sort_index keeps the columns alphabetical even when every row already
    # shares the same column order, a case in which concat(sort=True) does
    # not reorder anything.
    final_df = pd.concat(food_rows, axis=0, sort=True).sort_index(axis=1)
else:
    final_df = pd.DataFrame()

HBox(children=(IntProgress(value=0, max=706), HTML(value='')))




In [9]:
# Columns filled in for fewer than 5% of the foods.
# DataFrame.mean() is explicitly per column; np.mean(DataFrame) collapses to a
# single scalar on recent pandas and cannot be used as a column mask.
final_df.columns[(final_df.notna().mean() < 0.05).values]

Index(['aminoacido limitante | g', 'c15:0 | g', 'c17:0 | g', 'c17:1 | g',
       'c20:2 | g', 'c20:3 | g', 'indice chimico | g', 'manganese | g',
       'nitrati | g', 'nitriti | g', 'proteine : | g', 'vitamina b12 | g',
       'vitamina b6 | g'],
      dtype='object')

In [10]:
# Drop the columns filled in for fewer than 5% of the foods.
# DataFrame.mean() is explicitly per column; np.mean(DataFrame) collapses to a
# single scalar on recent pandas and cannot be used as a column mask.
sparse_mask = (final_df.notna().mean() < 0.05).values
final_df = final_df.drop(columns=final_df.columns[sparse_mask])
final_df.index.name = "name"

In [11]:
# Write the merged table to disk.
final_df.to_csv("../csv/my_personal_trainer.csv")

In [12]:
# Summary statistics of the merged table, as a quick sanity check.
final_df.describe()

Unnamed: 0,acido aspartico | g,acido glutamico | g,acqua | g,alanina | g,alcol | g,arginina | g,c12:0 | g,c14:0 | g,c14:1 | g,c16:0 | g,...,tiamina | g,tirosina | g,treonina | g,triptofano | g,valina | g,vitamina a retinolo eq. | g,vitamina c | g,vitamina e | g,zinco | g,zuccheri solubili | g
count,192.0,192.0,695.0,192.0,706.0,192.0,263.0,243.0,263.0,243.0,...,473.0,195.0,199.0,192.0,199.0,391.0,557.0,98.0,226.0,663.0
mean,8.682187,16.177292,56.559281,4.808177,0.339943,5.576458,0.395323,0.43535,0.015171,2.882181,...,0.000163,3.137897,3.670905,0.972604,4.817236,0.000351,0.009187,0.005548,0.002245,7.229713
std,3.760079,7.978817,29.729751,2.07805,2.840071,3.037352,3.419089,1.47177,0.078092,4.468054,...,0.000229,1.402615,1.342522,0.380425,1.757889,0.001875,0.028666,0.016844,0.001517,15.498797
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-05,0.0
25%,7.215,14.1775,33.3,3.69,0.0,3.7,0.0,0.005,0.0,0.29,...,3e-05,2.85,3.295,0.9075,4.62,0.0,0.0,0.0,0.00115,0.0
50%,9.64,15.78,67.2,5.55,0.0,6.105,0.0,0.09,0.0,1.14,...,8e-05,3.34,4.08,1.02,5.2,1.5e-05,0.0,0.000215,0.002025,1.5
75%,10.235,17.82,80.0,6.2225,0.0,6.6075,0.01,0.31,0.0,3.98,...,0.0002,3.56,4.495,1.1425,5.635,0.000126,0.005,0.000953,0.003227,5.3
max,21.2,45.7,96.5,10.21,35.0,18.91,44.8,17.0,1.14,41.21,...,0.00244,13.58,5.7,1.8,12.25,0.018,0.34,0.133,0.011,104.5
