In [1]:
import os
import re
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

### Determining food names and tables

In [2]:
# Directory holding one CSV per (food, table) pair, named "<food>-<table>.csv".
path = "./yazio"
# Group 1: food slug (word characters and hyphens).
# Group 2: table name (word characters with at most one whitespace inside).
regex = re.compile(r"^([\w-]+)-([\w]+\s?[\w]+)\.csv$")

# List the directory once and parse every CSV file name. Files that do not
# follow the naming convention are skipped instead of raising IndexError.
_matches = (regex.match(file) for file in next(os.walk(path))[2] if file.endswith(".csv"))
_parsed = [m.groups() for m in _matches if m is not None]

# Sorted so that row/column order of the resulting dataset is identical
# across runs (plain set iteration order is not stable between processes).
foods = sorted({food for food, _ in _parsed})
tables = sorted({table for _, table in _parsed})

### Merging csv files

In [3]:
# Build one single-row frame per food and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously accumulated rows on every iteration.
rows = []
for path_name in tqdm(list(foods)):
    # Human-readable food name: hyphens in the file slug become spaces.
    name = path_name.replace("-", " ")

    # Read the "per porzione" column of every table available for this food,
    # indexed by the table's own label column.
    parts = []
    for table in tables:
        file_path = "{path}/{name}-{table}.csv".format(path=path, name=path_name, table=table)
        if os.path.exists(file_path):
            parts.append(pd.read_csv(file_path, usecols=["per porzione", table], index_col=table))

    # Stack all tables for this food; fall back to an empty frame when no
    # file was found so the food still yields an (all-NaN) row.
    df = pd.concat(parts) if parts else pd.DataFrame([], columns=["per porzione"])

    # One row per food: the labels become columns, the food name the index.
    df = df.transpose()
    df.index = [name]
    df.index.name = "name"
    rows.append(df)

yazio = pd.concat(rows) if rows else pd.DataFrame([])
yazio.index.name = "name"

HBox(children=(IntProgress(value=0, max=2758), HTML(value='')))




### Adding units to columns

In [4]:
# For every column, take the unit found in its first non-missing cell.
# The unit is the first run of lowercase letters in the cell text; "mg" is
# reported as "g".
units = []
for column in yazio.columns:
    column_units = [
        re.findall("[a-z]+", cell)[0].replace("mg", "g")
        for cell in yazio[column].values
        if pd.notna(cell)
    ]
    units.append(column_units[0])

In [5]:
# Label every column as "<original name> | <unit>", lower-cased.
new_columns = []
for column, unit in zip(yazio.columns, units):
    new_columns.append("{c} | {unit}".format(c=column, unit=unit).lower())

In [6]:
# Replace the column labels in place with the labels held in new_columns.
yazio.columns = new_columns

### Converting values to kcal, grams and floats

In [7]:
def _parse_quantity(value):
    """Convert one raw cell such as "1.234,5 kcal" or "< 0,1 g" to a float.

    Non-string values (including NaN) are returned unchanged.

    The text is first normalised by removing "." and turning "," into ".",
    then interpreted as follows:
      * "... kcal"  -> number before "kcal"
      * "< ... mg"  -> fixed value 0.1e-3
      * "< ... g"   -> fixed value 0.05
      * "... mg"    -> number before "mg", scaled by 1e-3
      * "... g"     -> number before "g"
      * otherwise   -> the whole text parsed as a float

    Raises:
        ValueError: if the numeric part cannot be parsed as a float.
    """
    if not isinstance(value, str):
        return value
    text = value.replace(".", "").replace(",", ".")
    if "kcal" in text:
        return float(text.split("kcal")[0])
    # "mg" must be tested before "g" because every "mg" text also contains "g".
    if "<" in text and "mg" in text:
        return 0.1*1e-3
    if "<" in text and "g" in text:
        return 0.05
    if "mg" in text:
        return float(text.split("mg")[0])*1e-3
    if "g" in text:
        return float(text.split("g")[0])
    return float(text)


# Assign whole columns instead of writing cell by cell through chained
# indexing (yazio[c][i] = ...), which may write to a temporary copy and is
# a no-op under pandas Copy-on-Write.
for c in yazio.columns:
    yazio[c] = yazio[c].map(_parse_quantity)

### Dropping food items with nan values in required columns

In [8]:
# Columns that must be present for a food item to be kept.
required_columns = [
    'acqua | g', 'carboidrati | g', 'grassi | g', 'proteine | g',
    'valore calorico | kcal'
]
# True for every row with at least one missing value in a required column.
has_missing_required = yazio[required_columns].isna().any(axis=1)
yazio = yazio.drop(yazio.index[has_missing_required])

### Dropping food items with carbohydrates lower than sugars

In [9]:
# Rows whose carbohydrate value is strictly below their sugar value.
# Comparisons involving NaN evaluate to False, so such rows are kept.
mask = yazio['carboidrati | g'].lt(yazio['zuccheri | g'])
rows_to_drop = yazio.index[mask]
yazio = yazio.drop(rows_to_drop)

### Dropping column 'sale | g' as it contains nearly only nan values

In [10]:
# Remove the 'sale | g' column from the dataset.
yazio = yazio.drop('sale | g', axis=1)

### Dropping rows that contain only zeros

In [11]:
# Fraction of cells equal to 0 in each row (NaN cells do not count as 0).
zero_fraction = yazio.eq(0).mean(axis=1)
# Drop the rows in which every cell is 0.
yazio = yazio.drop(yazio.index[zero_fraction == 1])

### Normalizing dataset to range '100g'

In [12]:
# Columns left out of the per-row total used for normalisation
# (presumably sub-components of other columns and the energy value, which
# would otherwise be double-counted or are not a mass; confirm with the data).
excluded_columns = {'grassi monoinsaturi | g', 'grassi polinsaturi | g',
       'grassi saturi | g', 'valore calorico | kcal', 'zuccheri | g'}
# Keep the frame's own column order: a set difference yields an arbitrary
# order, which makes the floating-point summation order vary between runs.
norm_columns = [column for column in yazio.columns if column not in excluded_columns]

# Scale every row so that its norm_columns add up to 100. Dividing along
# axis=0 aligns the totals with the row index, replacing the double transpose.
row_totals = yazio[norm_columns].sum(axis=1)
yazio = yazio.div(row_totals, axis=0) * 100

### Saving dataset

In [13]:
# Make sure the target directory exists so the export does not fail with
# an OSError after the whole pipeline has already run.
os.makedirs("csv", exist_ok=True)
yazio.to_csv("csv/yazio.csv")