In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

### Determining food names and tables

In [2]:
# Root folder of the scraped data; each visible entry in it is treated as one food item.
path = "../raw/www.yazio.com"

# Keep every directory entry whose name does not start with a dot.
entries = os.listdir(path)
foods = [entry for entry in entries if entry[0] != "."]

# The table names are the csv file names (text before ".csv") found in the
# folder of the first listed food.
# NOTE(review): os.listdir returns entries in arbitrary order, so which food
# supplies the table list depends on the filesystem - confirm that every food
# folder holds the same set of csv files.
first_food_dir = "{path}/{food}".format(path=path, food=foods[0])
csv_files = [file_name for file_name in os.listdir(first_food_dir) if file_name.endswith(".csv")]
tables = [file_name.split(".csv")[0] for file_name in csv_files]

### Merging csv files

In [3]:
from IPython.display import display

# Build one single-row frame per food and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0, and growing `yazio` inside the
# loop re-copied the whole accumulated frame on every iteration.
food_rows = []
for path_name in tqdm(foods):
    # Folder names use "-" between words; the index label uses spaces instead.
    name = path_name.replace("-", " ")

    # Read every table that exists for this food; each csv is indexed by the
    # column named after the table and contributes its "per porzione" values.
    parts = []
    for table in tables:
        file_path = "{path}/{name}/{table}.csv".format(path=path, name=path_name, table=table)
        if os.path.exists(file_path):
            parts.append(pd.read_csv(file_path, usecols=["per porzione", table], index_col=table))

    # A food without any table still produces a row (carrying only its category).
    df = pd.concat(parts) if parts else pd.DataFrame([], columns=["per porzione"])
    # After the transpose the frame has a single row and one column per table entry.
    df = df.transpose()

    with open("{path}/{name}/metadata.json".format(path=path, name=path_name), "r") as f:
        df["category"] = json.load(f)["category"]
    df.index = [name]
    food_rows.append(df)

yazio = pd.concat(food_rows, sort=False) if food_rows else pd.DataFrame([])
yazio.index.name = "name"

# Later cells select the nutrient columns with `yazio.columns[:-1]`, so
# "category" has to be the last column regardless of the order in which
# columns were first encountered. Nutrient columns are sorted by name, which
# matches the column listing recorded further down in this notebook.
nutrient_columns = sorted(c for c in yazio.columns if c != "category")
yazio = yazio.reindex(columns=nutrient_columns + ["category"])

HBox(children=(IntProgress(value=0, max=2762), HTML(value='')))




### Adding units to columns

In [None]:
# For every column except the last one, extract the first run of lowercase
# letters from each non-missing value (the unit text), report "mg" as "g"
# (a later cell converts mg values to grams), and keep the unit of the
# first non-missing value as the unit of the whole column.
units = []
for c in yazio.columns[:-1]:
    column_units = []
    for v in yazio[c].values:
        if pd.notna(v):
            column_units.append(re.findall("[a-z]+", v)[0].replace("mg", "g"))
    units.append(column_units[0])

In [None]:
# Label every column except the last one as "<name> | <unit>", lower-cased.
new_columns = [
    "{c} | {unit}".format(c=column, unit=unit).lower()
    for column, unit in zip(yazio.columns[:-1], units)
]

In [None]:
# Apply the new labels; the last column keeps its current name.
last_column = yazio.columns[-1]
yazio.columns = [*new_columns, last_column]

In [None]:
# Display the column labels after renaming: "<name> | <unit>" for every column but the last.
yazio.columns

Index(['acqua | g', 'calcio | g', 'carboidrati | g', 'colesterolo | g',
       'ferro | g', 'fibre alimentari | g', 'fosforo | g', 'grassi | g',
       'grassi monoinsaturi | g', 'grassi polinsaturi | g',
       'grassi saturi | g', 'magnesio | g', 'manganese | g', 'potassio | g',
       'proteine | g', 'rame | g', 'sale | g', 'selenio | g', 'sodio | g',
       'valore calorico | kcal', 'vitamina a | g', 'vitamina b1 | g',
       'vitamina b11 | g', 'vitamina b12 | g', 'vitamina b2 | g',
       'vitamina b3 | g', 'vitamina b5 | g', 'vitamina b6 | g',
       'vitamina c | g', 'vitamina d | g', 'vitamina e | g', 'vitamina k | g',
       'zinco | g', 'zuccheri | g', 'category'],
      dtype='object')

### Converting values to kcal, grams and floats

In [None]:
def _parse_amount(raw):
    """Convert one scraped cell value to a float in kcal or grams.

    Non-string values (NaN or already numeric) are returned unchanged.
    For strings, every "." is removed and "," becomes "." before parsing
    (presumably thousands separator and decimal comma - confirm against the
    scraped pages). The checks run in this order:

    * contains "kcal": the number before "kcal"
    * contains "<" and "mg": fixed placeholder 0.1 mg, expressed in grams
    * contains "<" and "g": fixed placeholder 0.05 g
    * contains "mg": the number before "mg", converted to grams
    * contains "g": the number before "g"
    * otherwise: the whole string parsed as a float

    Raises ValueError if the numeric part cannot be parsed by float().
    """
    if not isinstance(raw, str):
        return raw
    text = raw.replace(".", "").replace(",", ".")
    if "kcal" in text:
        return float(text.split("kcal")[0])
    below_threshold = "<" in text
    if below_threshold and "mg" in text:
        return 0.1*1e-3
    if below_threshold and "g" in text:
        return 0.05
    if "mg" in text:
        return float(text.split("mg")[0])*1e-3
    if "g" in text:
        return float(text.split("g")[0])
    return float(text)


# Convert column by column and assign the result back to the frame.
# The previous element-wise chained assignment (yazio[c][i] = ...) writes
# through an intermediate Series, which pandas does not guarantee to reach the
# frame and which is a silent no-op under Copy-on-Write.
# A column whose values all become numbers ends up with a float dtype.
for c in tqdm(yazio.columns[:-1]):
    yazio[c] = yazio[c].map(_parse_amount)

HBox(children=(IntProgress(value=0, max=34), HTML(value='')))

### Dropping food items with nan values in required columns

In [None]:
# Columns that must be present for a food item to be kept.
required_columns = [
    'acqua | g', 'carboidrati | g', 'grassi | g', 'proteine | g',
    'valore calorico | kcal'
]
# True for every row that has a missing value in at least one required column.
has_missing = yazio[required_columns].isna().any(axis=1)
# Drop by index label, so every row sharing a flagged label is removed.
yazio = yazio.drop(yazio.index[has_missing])

### Dropping food items with fewer carbohydrates than sugars

In [None]:
# Flag rows whose carbohydrate value is smaller than their sugar value;
# comparisons involving a missing value evaluate to False, so those rows are kept.
carbohydrates = yazio['carboidrati | g']
sugars = yazio['zuccheri | g']
mask = carbohydrates < sugars
flagged_labels = yazio.index[mask]
yazio = yazio.drop(flagged_labels)

### Dropping column 'sale | g' because it contains almost only NaN values

In [None]:
# Remove the salt column from the frame.
yazio = yazio.drop('sale | g', axis="columns")

### Dropping rows in which every nutrient value is 0

In [None]:
# A row is dropped when the number of cells equal to 0 matches the number of
# columns minus one, i.e. every column except one (the category) is 0.
value_column_count = len(yazio.columns) - 1
zeros_per_row = (yazio == 0).sum(axis=1)
all_zero_labels = yazio.index[zeros_per_row == value_column_count]
yazio = yazio.drop(all_zero_labels)

### Normalizing each food item to 100 g

In [None]:
# Columns left out of the per-row total used for scaling.
# NOTE(review): presumably the fat sub-types and sugars are excluded because
# they are already contained in 'grassi | g' / 'carboidrati | g' - confirm.
excluded_from_total = {
    'grassi monoinsaturi | g', 'grassi polinsaturi | g',
    'grassi saturi | g', 'valore calorico | kcal', 'zuccheri | g', 'category'
}
# Built in frame column order instead of through a set: set iteration order
# changes with string-hash randomization, which changed the float summation
# order (and thus the last bits of the totals) from run to run.
norm_columns = [c for c in yazio.columns if c not in excluded_from_total]

# Scale every column except the last one so that each row's total over
# `norm_columns` becomes 100. Missing values are skipped by the sum.
value_columns = yazio.columns[:-1]
row_totals = yazio[norm_columns].sum(axis=1)
yazio[value_columns] = yazio[value_columns].div(row_totals, axis=0) * 100

### Saving dataset

In [None]:
# Write the cleaned dataset, including its index, as csv.
output_file = "../csv/yazio.csv"
yazio.to_csv(output_file)