In [1]:
import json
import os
import re
from io import StringIO
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display
from tqdm import tqdm_notebook as tqdm

### Determining food names and tables

In [2]:
# Folder holding one sub-folder per food, each containing that food's CSV tables.
root = "../raw/nut.entecra.it/"

# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the final table reproducible across machines and runs. Hidden
# entries and stray non-directory files are skipped because every food is
# later opened as a folder.
foods = sorted(
    food for food in os.listdir(root)
    if not food.startswith(".")
    and os.path.isdir("{root}/{food}".format(root=root, food=food))
)

# Table names (file names without the ".csv" suffix) are read from the first
# food folder and reused for every food; sorted for a reproducible column order.
tables = sorted(
    table.split(".csv")[0]
    for table in os.listdir("{root}/{food}".format(root=root, food=foods[0]))
    if table.endswith(".csv")
)

### Merging csv files

In [3]:
# Build one single-row frame per food: stack all of its tables (keeping only
# CSV columns "0" and "1", with "0" as the index) and transpose the result so
# the labels become columns.
food_rows = []
for food in tqdm(foods):
    food_path = "{root}/{food}".format(root=root, food=food)
    food_tables = [
        pd.read_csv(
            "{path}/{table}.csv".format(path=food_path, table=table),
            usecols=["0", "1"],
            index_col="0",
        )
        for table in tables
    ]
    food_rows.append(pd.concat(food_tables).transpose())

# Stack the per-food rows and label each row with the food it came from.
crea = pd.concat(food_rows)
crea.index = foods
crea.index.name = "name"

HBox(children=(IntProgress(value=0, max=790), HTML(value='')))




In [4]:
# Discard every column whose label contains "(%): " or the "÷" sign.
crea = crea.drop(columns=[
    column for column in crea.columns
    if "(%): " in column or "÷" in column
])

### Renaming columns

In [5]:
# Normalise the labels: lower-case them, strip surrounding colons, turn a
# parenthesised part "(x)" into " | x" and collapse the resulting double space.
# The pattern is a raw string so that "\(" and "\w" reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
crea.columns = [re.sub(r"\(([\w\W]+)\)", r" | \1", c.lower().strip(":")).replace("  ", " ") for c in crea.columns]


### Dropping columns that are almost entirely NaN

In [6]:
# Fraction of non-missing values per column. The axis is spelled out because
# calling np.mean on a whole DataFrame relies on an implicit axis, which newer
# pandas releases reduce to a single scalar instead of one value per column.
filled_fraction = crea.notna().mean(axis=0)

# Columns filled for fewer than 10% of the foods are dropped; the selection is
# computed once and reused for both the message and the drop.
sparse_columns = crea.columns[filled_fraction < 0.1]
print("Dropping {columns}".format(columns=sparse_columns))
crea = crea.drop(columns=sparse_columns)

Dropping Index(['lipidi totali | %', 'note', 'proteine | %'], dtype='object')


### Adding missing units

In [7]:
# Local cache of one example page; it is downloaded only when the file is missing.
path = "./example_crea_page.html"
if not os.path.exists(path):
    url = "http://nut.entecra.it/646/tabelle_di_composizione_degli_alimenti.html?idalimento=104020&quant=100"
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors: otherwise an error page would be written to
    # the cache and silently reused by every later run.
    response.raise_for_status()
    with open(path, "w") as f:
        f.write(response.text)
with open(path, "r") as f:
    page = f.read()

In [8]:
soup = BeautifulSoup(page, "html5lib")

# Parse the page's tables once and reuse the result for both lists. The HTML is
# wrapped in StringIO because passing a literal HTML string to read_html is
# deprecated in newer pandas releases; a file-like object works everywhere.
page_tables = pd.read_html(StringIO(str(soup)))

# Labels (first column, header row skipped) of the table at position 3 are
# treated as milligram quantities, except percentage rows and the two text fields.
mg_columns = [
    c.lower().strip(":") for c in page_tables[3][0][1:]
    if "%" not in c and c.lower().strip(":") not in ["aminoacido limitante", "indice chimico"]
]

# Labels of the table at position 4 are treated as gram quantities, except
# percentage rows and labels containing "/".
g_columns = [
    c.lower().strip(":") for c in page_tables[4][0][1:]
    if all([s not in c for s in ["%", "/"]])
]

In [9]:
# Append the unit to labels that lack one: gram columns take precedence over
# milligram columns, and every other label is kept as it is.
labelled_columns = []
for column in crea.columns:
    if column in g_columns:
        labelled_columns.append("{c} | g".format(c=column))
    elif column in mg_columns:
        labelled_columns.append("{c} | mg".format(c=column))
    else:
        labelled_columns.append(column)
crea.columns = labelled_columns

In [10]:
# Remove the "energia | kj" column from the table.
crea = crea.drop(labels=["energia | kj"], axis="columns")

In [11]:
# Manual override of a single cell (the column is still expressed in mg here).
# NOTE(review): presumably corrects a bad value in the source data - confirm.
# A single .loc call writes straight into the frame; the chained form
# crea[col][row] = ... may assign to a temporary copy and leave crea unchanged.
crea.loc["cervello di bovino", "colesterolo | mg"] = 3100

In [12]:
# Multipliers that turn a value expressed in the given unit into grams.
factors = {
    "mg":1e-3,
    "mcg":1e-6,
    "µg":1e-6
}

# Text columns: they hold no quantities and are skipped by the conversion.
string_columns = ["aminoacido limitante", "categoria", "nome scientifico", "codice alimento"]


def _to_grams(value, factor):
    """Convert one raw cell to a float and scale it by ``factor``.

    Missing values are returned unchanged. The marker "tr" is replaced by 0.05
    (presumably "traces" - TODO confirm), a ">" sign is removed before parsing,
    and anything else is passed to ``float``.

    Raises:
        ValueError: if the cell cannot be parsed as a number.
    """
    if pd.isna(value):
        return value
    if isinstance(value, str) and value == "tr":
        number = 0.05
    elif isinstance(value, str) and ">" in value:
        number = float(value.replace(">", ""))
    else:
        number = float(value)
    return number * factor


for c in tqdm(crea.columns):
    if c in string_columns:
        continue
    # The first unit (in dictionary order) found in the label decides the
    # factor; labels without a known unit keep their magnitude.
    factor = next((f for unit, f in factors.items() if unit in c), 1.0)
    # Convert the whole column and assign it back in one step. Writing cell by
    # cell through crea[c][i] = ... is chained assignment, which can end up
    # modifying a temporary copy and leaving crea untouched.
    crea[c] = crea[c].map(lambda value, factor=factor: _to_grams(value, factor))

HBox(children=(IntProgress(value=0, max=72), HTML(value='')))




In [13]:
# The values are now in grams, so replace each converted unit in the labels
# with "g". When several units occur in one label the last one checked wins.
unit_renames = {}
for column in crea.columns:
    for unit in factors:
        if unit in column:
            unit_renames[column] = column.replace(unit, "g")
crea = crea.rename(columns=unit_renames)

### Dropping rows that lack the most important columns

In [14]:
# Columns that must be present (non-missing) for a food to be kept.
required_columns = [
    'acqua | g',
    'proteine | g',
    'lipidi | g',
    'carboidrati disponibili | g',
]

In [15]:
# Keep only the foods that have a value in every required column.
has_all_required = crea[required_columns].notna().all(axis=1)
crea = crea.loc[crea.index[has_all_required]]

### Checking that the components sum to about 100 g

In [16]:
# Every column that is not one of the text columns is cast to float64.
floats_columns = crea.columns[
    np.array([name not in string_columns for name in crea.columns], dtype=bool)
]
crea[floats_columns] = crea[floats_columns].astype("float64")

In [17]:
# Gram columns whose values are summed when checking each food's total.
grams = (
    # Main components
    [
        'acqua | g',
        'proteine | g',
        'lipidi | g',
        'colesterolo | g',
        'carboidrati disponibili | g',
        'fibra totale | g',
        'alcol | g',
    ]
    # Minerals
    + [
        'sodio | g',
        'potassio | g',
        'ferro | g',
        'calcio | g',
        'fosforo | g',
        'magnesio | g',
        'zinco | g',
        'rame | g',
        'selenio | g',
    ]
    # Amino acids and vitamins
    + [
        'tiamina | g',
        'riboflavina | g',
        'niacina | g',
        'vitamina a retinolo eq. | g',
        'vitamina c | g',
        'vitamina e | g',
        'acido fitico | g',
    ]
)

In [18]:
# Columns to rescale: everything except the text columns and the percentage
# columns. Built in column order rather than through set arithmetic, whose
# iteration order for strings changes from one interpreter run to the next.
to_normalize = [
    c for c in crea.columns
    if c not in string_columns and "%" not in c
]


In [19]:
def _and(*args):
    """Element-wise logical AND of all the given equally-shaped boolean sequences."""
    stacked = np.asarray(args)
    return stacked.all(axis=0)

def _or(*args):
    """Element-wise logical OR of all the given equally-shaped boolean sequences."""
    stacked = np.asarray(args)
    return stacked.any(axis=0)

In [20]:
# Read the tolerance (in grams) allowed around 100 g from the shared
# sanitization parameters file.
with open("./sanitization_parameters.json", "r") as parameters_file:
    sanitization_parameters = json.load(parameters_file)
window = sanitization_parameters["grams_maximal_window"]

In [21]:
# Fraction of missing cells in the table before the 100 g filter.
pd.isna(crea.values).mean()

0.4868782299741602

In [22]:
# Total of the gram columns for every food (missing values are skipped).
sums = crea[grams].sum(axis=1)

# Keep the foods whose total lies strictly inside 100 +/- window.
inside_window = ((sums < 100 + window) & (sums > 100 - window)).values
around_100g = crea.iloc[inside_window].copy()

# Rescale each kept row by 100 / (its total over the gram columns).
row_totals = around_100g[grams].sum(axis=1)
around_100g[to_normalize] = around_100g[to_normalize].div(row_totals, axis="index") * 100
crea = around_100g

In [23]:
# Fraction of missing cells in the table after the 100 g filter.
pd.isna(crea.values).mean()

0.471917744050965

In [24]:
# Write the cleaned table to disk. The output folder is created first because
# to_csv fails when the target directory does not exist.
os.makedirs("../csv", exist_ok=True)
crea.to_csv("../csv/crea.csv")