In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
import re
from multiprocessing import Pool, cpu_count
from IPython.display import display
# Matches one parenthesised fragment: "(", one or more characters other than ")", then ")".
# The parse_* functions below pass it to re.sub to strip such fragments from row labels.
regex = r"(\([^\)]+\))"

In [2]:
# Root folder of the scraped data; each food lives in its own sub-folder.
path = "../raw/www.my-personaltrainer.it"
# os.listdir returns entries in arbitrary order: sort them so that the row order of the
# final table (and the food used below to discover the tables) is the same on every run.
# Entries whose name starts with "." are skipped.
foods = sorted(food for food in os.listdir(path) if not food.startswith("."))
# Names (extension removed) of the CSV tables found in the first food's folder.
tables = sorted(
    table.split(".csv")[0]
    for table in os.listdir("{path}/{food}".format(path=path, food=foods[0]))
    if table.endswith(".csv")
)

# Functions

In [3]:
def _to_float(text):
    """Convert a number written with a decimal comma or point to float; NaN if it is not a number."""
    try:
        return float(text.replace(",", "."))
    except ValueError:
        return np.nan


def parse_value(value):
    """Convert one raw table cell into a float.

    Recognised forms (decimal comma or decimal point are both accepted):

    * ``"12,5%"``  -> 12.5 (the percent sign is simply dropped)
    * ``"3µg"``    -> 3e-6 (converted to grams)
    * ``"3mg"``    -> 3e-3 (converted to grams)
    * ``"3g"``     -> 3.0
    * ``"3"`` / ``"3,5"`` / ``"3.5"`` -> the number itself
    * ``"tr"`` (any case) -> 0.0
    * an already numeric cell -> ``float(value)``

    Parameters
    ----------
    value : str, number or NaN
        Raw cell content.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` for missing cells and for anything that
        cannot be interpreted as one of the forms above.
    """
    if pd.isna(value):
        return np.nan
    if not isinstance(value, str):
        # read_csv may already have inferred a numeric dtype for the column.
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    text = value.strip()
    if "%" in text:
        return _to_float(text[:-1])
    # Both the micro sign (U+00B5) and the Greek letter mu (U+03BC) are in use.
    if "\u00b5g" in text or "\u03bcg" in text:
        return _to_float(text[:-2]) * 1e-6
    if "mg" in text:
        return _to_float(text[:-2]) * 1e-3
    if "g" in text:
        return _to_float(text[:-1])

    # Unitless number: accept the decimal comma here too, as the unit branches do.
    number = text.replace(",", ".")
    if number.replace(".", "", 1).isdigit():
        return _to_float(number)
    if text.lower() == "tr":
        return 0.0
    return np.nan

In [4]:
def parse_proteins(food):
    """Parse table ``1.csv`` of a food into a one-column float DataFrame.

    The first row of the file is discarded, the value found in the third column
    of the next row is copied into its fourth column, and only the second
    (labels, named "0") and fourth (values) columns are kept.

    Row labels are lower-cased, stripped, freed of parenthesised fragments and
    of ":" and suffixed with " | %" when the label or the raw value contains a
    percent sign, " | g" otherwise. Values are converted with ``parse_value``.

    Parameters
    ----------
    food : str
        Name of the food folder under ``path``.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by the cleaned labels, with the value column
        renamed from "2" to "1".
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=1)
    df = pd.read_csv(file_path)
    df = df.iloc[1:].copy() # Drop the titles (copy so the assignment below targets an independent frame)
    df.iloc[0, 3] = df.iloc[0, 2] # set the formatted percentage in the right cell
    df = df.iloc[:, [1, 3]]    # drop the useless columns
    df = df.set_index("0")

    parsed_values = []
    renamer = {}
    for old_name, row in df.iterrows():
        # Positional access: the row is labelled by column names, not by integers.
        value = row.iloc[0]

        name = old_name.lower().strip()
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace(":", " ")
        name = name.replace("  ", " ")

        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        suffix = " | %" if is_percentage else " | g"
        renamer[old_name] = (name + suffix).replace("  ", " ")

        parsed_values.append(parse_value(value))

    df = df.rename(columns={"2": "1"})
    df = df.rename(index=renamer) # rename the row labels
    df.iloc[:, [0]] = np.array(parsed_values).reshape((len(df), 1)) # update the dataframe with the new parsed data
    # np.float was only an alias of the builtin float and no longer exists in NumPy >= 1.24.
    return df.astype(float)

In [5]:
def parse_general(food):
    """Parse table ``0.csv`` of a food into a one-column float DataFrame.

    The first row of the file is discarded and only the second (labels, named
    "0") and third (values) columns are kept.

    Row labels are lower-cased, stripped, freed of parenthesised fragments and
    suffixed with " | %" when the label or the raw value contains a percent
    sign, " | g" otherwise. Values are converted with ``parse_value``.

    Parameters
    ----------
    food : str
        Name of the food folder under ``path``.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by the cleaned labels.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=0)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df = df.iloc[:, [1, 2]] # Drop the RDA col
    df = df.set_index("0") # Set the first col as the index

    parsed_values = []
    renamer = {}
    for old_name, row in df.iterrows():
        # Positional access: the row is labelled by column names, not by integers.
        value = row.iloc[0]

        name = old_name.lower().strip()
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        suffix = " | %" if is_percentage else " | g"
        renamer[old_name] = (name + suffix).replace("  ", " ")

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer) # rename the row labels
    df.iloc[:, [0]] = np.array(parsed_values).reshape((len(df), 1)) # update the dataframe with the new parsed data
    # np.float was only an alias of the builtin float and no longer exists in NumPy >= 1.24.
    return df.astype(float)

In [6]:
def parse_fats(food):
    """Parse table ``2.csv`` of a food into a one-column float DataFrame.

    The first row of the file is discarded and only the second (labels, named
    "0") and third (values) columns are kept.

    Labels starting with "C" get their spaces replaced by ":" first. Every
    label is then lower-cased, stripped, freed of parenthesised fragments and
    suffixed with " | %" when the label or the raw value contains a percent
    sign, " | g" otherwise. Values are converted with ``parse_value``.

    Parameters
    ----------
    food : str
        Name of the food folder under ``path``.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by the cleaned labels.
    """
    file_path = "{path}/{name}/{table}.csv".format(path=path, name=food, table=2)
    df = pd.read_csv(file_path)
    df = df.iloc[1:] # Drop the titles
    df = df.iloc[:, [1, 2]] # Drop the index col
    df = df.set_index("0") # Set the first col as the index

    parsed_values = []
    renamer = {}
    for old_name, row in df.iterrows():
        # Positional access: the row is labelled by column names, not by integers.
        value = row.iloc[0]
        name = old_name

        # Fats (original note: "Grassi"); presumably fatty-acid codes - TODO confirm.
        # startswith also copes with an empty label, where name[0] would raise.
        if name.startswith("C"):
            name = name.replace(" ", ":")

        name = name.lower().strip()
        name = re.sub(regex, "", name, count=0, flags=re.MULTILINE)
        name = name.replace("  ", " ")

        is_percentage = "%" in name or (isinstance(value, str) and "%" in value)
        suffix = " | %" if is_percentage else " | g"
        renamer[old_name] = (name + suffix).replace("  ", " ")

        parsed_values.append(parse_value(value))

    df = df.rename(index=renamer) # rename the row labels
    df.iloc[:, [0]] = np.array(parsed_values).reshape((len(df), 1)) # update the dataframe with the new parsed data
    # np.float was only an alias of the builtin float and no longer exists in NumPy >= 1.24.
    return df.astype(float)

In [7]:
def parse_category(food):
    """Return the food's category as a 1x1 DataFrame.

    The value is read from the "category" key of the food's ``metadata.json``.
    The resulting frame has the single row label "category", the single column
    "1", and its columns axis is named "name".
    """
    metadata_path = "{path}/{name}/metadata.json".format(path=path, name=food)
    with open(metadata_path, "r") as metadata_file:
        metadata = json.load(metadata_file)

    category_frame = pd.DataFrame(
        data=metadata["category"],
        index=["category"],
        columns=["1"],
    )
    category_frame.columns.name = "name"
    return category_frame

# Merge all the data

In [8]:
def get_single_row(food):
    """Build the single-row DataFrame describing one food.

    The general, protein and fat tables and the category are stacked,
    transposed so that every label becomes a column, and the resulting row is
    labelled with the food name.

    Parameters
    ----------
    food : str
        Name of the food folder under ``path``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``food``; repeated column labels are dropped,
        keeping the first occurrence, and the columns axis is unnamed.
    """
    proteins, general, fats, category = parse_proteins(food), parse_general(food), parse_fats(food), parse_category(food)
    single_row = pd.concat([general, proteins, fats, category]).T
    single_row = single_row.rename(index={"1": food})
    # The same label can come from more than one table: keep its first occurrence only.
    single_row = single_row.iloc[:, ~single_row.columns.duplicated()]
    # Index.name is a property without a deleter in current pandas, so `del` raises
    # AttributeError there; assigning None clears the name on every version.
    single_row.columns.name = None
    return single_row

In [9]:
# Build the single-row frame of every food in parallel (as many workers as CPU cores),
# with a progress bar, then stack the rows into the full table.
with Pool(cpu_count()) as pool:
    rows = list(tqdm(pool.imap(get_single_row, foods), total=len(foods)))
mpt = pd.concat(rows)

HBox(children=(IntProgress(value=0, max=706), HTML(value='')))




### Drop rows without required columns

In [10]:
# Columns that must be filled in for a food to be kept.
required_columns = [
    "acqua | g",
    "carboidrati disponibili | g",
    "proteine | g",
    "grassi | g",
]

In [11]:
# Fraction of missing cells in the table before dropping incomplete rows.
missing_cells = pd.isna(mpt.values)
np.mean(missing_cells)

0.5603576487252124

In [12]:
# Flag the rows where at least one required column is missing, then drop them by label.
missing_required = np.any(pd.isna(mpt[required_columns]), axis=1)
incomplete_foods = mpt.index[missing_required]
mpt = mpt.drop(index=incomplete_foods)

In [13]:
# Fraction of missing cells in the table after dropping incomplete rows.
missing_cells = pd.isna(mpt.values)
np.mean(missing_cells)

0.5577830188679245

### Normalize to 100g

In [14]:
# Gram-valued columns whose sum is used as the total mass of a food.
grams = [
    "acqua | g",
    "carboidrati disponibili | g",
    "proteine | g",
    "grassi | g",
    "fibra totale | g",
    "alcol | g",
    "sodio | g",
    "potassio | g",
    "ferro | g",
    "calcio | g",
    "fosforo | g",
    "magnesio | g",
    "zinco | g",
    "rame | g",
    "selenio | g",
    "tiamina | g",
    "riboflavina | g",
    "niacina | g",
    "vitamina a retinolo eq. | g",
    "vitamina c | g",
    "vitamina e | g",
    "vitamina b6 | g",
    "vitamina b12 | g",
    "manganese | g",
]

In [15]:
# Every column except the category and the two percentage columns gets rescaled.
not_normalized = set(["category", "parte edibile | %", "proteine | %"])
to_normalize = list(set(mpt.columns) - not_normalized)

In [16]:
def _and(*args):
    return np.all(args, axis=0)

def _or(*args):
    return np.any(args, axis=0)

In [17]:
# Load the value stored under "grams_maximal_window" in the sanitization parameters;
# it is used below as the allowed deviation from 100 of a food's gram total.
with open("./sanitization_parameters.json", "r") as parameters_file:
    sanitization_parameters = json.load(parameters_file)
window = sanitization_parameters["grams_maximal_window"]

In [18]:
# Snapshot of the missing-cell ratio and table shape before the 100g filter.
nan_mean_before = np.mean(pd.isna(mpt.values))
shape_before = mpt.shape
"Before dropping rows not around 100g nan mean is {mean} and shape is {shape}".format(
    mean=nan_mean_before,
    shape=shape_before,
)

'Before dropping rows not around 100g nan mean is 0.5577830188679245 and shape is (689, 80)'

In [19]:
# Per-food total of the gram-valued columns.
sums = np.sum(mpt[grams], axis=1)

# Keep only the foods whose total lies strictly between 100 - window and 100 + window.
lower_bound, upper_bound = 100 - window, 100 + window
within_window = _and(sums < upper_bound, sums > lower_bound)
around_100g = mpt.iloc[within_window].copy()

# Divide every column to normalize by the row's own gram total and scale to 100.
kept_totals = np.sum(around_100g[grams], axis=1)
around_100g[to_normalize] = around_100g[to_normalize].divide(kept_totals, axis="index") * 100
mpt = around_100g

In [20]:
# Snapshot of the missing-cell ratio and table shape after the 100g filter.
nan_mean_after = np.mean(pd.isna(around_100g.values))
shape_after = around_100g.shape
"After dropping rows not around 100g nan mean is {mean} and shape is {shape}".format(
    mean=nan_mean_after,
    shape=shape_after,
)

'After dropping rows not around 100g nan mean is 0.5534805389221557 and shape is (668, 80)'

### Saving results

In [21]:
# Write the filtered and rescaled table to disk; the index (food names) is written as the first column.
mpt.to_csv("../csv/my_personal_trainer.csv")