In [1]:
import os
import re
import numpy as np
import pandas as pd
import json
from pprint import pprint
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
from IPython.display import display

### Determining food names and tables

In [2]:
# Folder that holds one sub-directory per food.
root = "../raw/nut.entecra.it/"
# os.listdir returns entries in arbitrary order, so both lists are sorted to
# keep the row order (foods) and column order (tables) reproducible across
# runs and machines. Hidden entries (names starting with ".") are skipped.
foods = sorted(food for food in os.listdir(root) if not food.startswith("."))
# Table names are read from the first food's directory only
# (assumes every food directory contains the same CSV files -- TODO confirm).
tables = sorted(
    table.split(".csv")[0]
    for table in os.listdir(os.path.join(root, foods[0]))
    if table.endswith(".csv")
)

### Merging CSV files

In [3]:
# For every food, read columns "0" (used as index) and "1" from each table,
# stack the tables, and transpose so the labels become column headers.
food_rows = []
for food in tqdm(foods):
    food_dir = "{root}/{food}".format(root=root, food=food)
    table_frames = [
        pd.read_csv(
            "{path}/{table}.csv".format(path=food_dir, table=table),
            usecols=["0", "1"],
            index_col="0",
        )
        for table in tables
    ]
    food_rows.append(pd.concat(table_frames).transpose())

# Stack the per-food frames and label the rows with the food names.
crea = pd.concat(food_rows)
crea.index = foods
crea.index.name = "name"

HBox(children=(IntProgress(value=0, max=790), HTML(value='')))




In [4]:
# Remove every column whose header contains "(%)" or "÷".
unwanted_markers = ["(%)", "÷"]
marked_columns = [
    column
    for column in crea.columns
    if any(marker in column for marker in unwanted_markers)
]
crea = crea.drop(columns=marked_columns)

### Renaming columns

In [5]:
# Normalise headers: lowercase, strip ":" from both ends, rewrite a
# parenthesised part "(x)" as " | x", then collapse the double space the
# substitution can leave behind.
# The pattern is a raw string so that "\(" reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
# Note: "[\w\W]+" is greedy, so the match runs from the first "(" to the
# last ")" in the header.
crea.columns = [
    re.sub(r"\(([\w\W]+)\)", r" | \1", c.lower().strip(":")).replace("  ", " ")
    for c in crea.columns
]


### Dropping columns that are almost entirely NaN

In [6]:
# Fraction of non-missing values in each column. DataFrame.mean() is used
# instead of np.mean(DataFrame): the latter forwards axis=None, which on
# pandas >= 2.0 reduces over both axes and returns a single scalar, breaking
# the per-column mask below.
filled_share = crea.notna().mean()
# Columns with fewer than 10% of their values present; computed once and
# reused for both the message and the drop.
sparse_columns = crea.columns[(filled_share < 0.1).values]
print("Dropping {columns}".format(columns = sparse_columns))
crea = crea.drop(columns=sparse_columns)

Dropping Index(['note'], dtype='object')


### Adding missing units

In [7]:
# Reference page whose tables are used to work out which nutrient labels are
# expressed in mg and which in g.
url = "http://nut.entecra.it/646/tabelle_di_composizione_degli_alimenti.html?idalimento=104020&quant=100"
# A timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status stops an HTTP error page from being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html5lib")
# Parse the page's tables once and reuse the result for both lists.
page_tables = pd.read_html(str(soup))
# Labels from the first column of table 3 (header row skipped), excluding
# percentage rows and the two non-numeric entries.
mg_columns = [
    c.lower().strip(":") for c in page_tables[3][0][1:]
    if "%" not in c and c.lower().strip(":") not in ["aminoacido limitante", "indice chimico"]
]
# Labels from the first column of table 4 (header row skipped), excluding
# rows containing "%" or "/".
g_columns = [
    c.lower().strip(":") for c in page_tables[4][0][1:]
    if all([s not in c for s in ["%", "/"]])
]

In [8]:
# Append a unit suffix to headers listed in g_columns or mg_columns;
# g_columns is checked first, and any other header is kept unchanged.
labelled_columns = []
for column in crea.columns:
    if column in g_columns:
        labelled_columns.append("{c} | g".format(c=column))
    elif column in mg_columns:
        labelled_columns.append("{c} | mg".format(c=column))
    else:
        labelled_columns.append(column)
crea.columns = labelled_columns

In [9]:
# Manual override of a single cell (the reason for the value 3100 is not
# recorded in this notebook -- TODO document the source).
# .loc is used instead of chained indexing (crea[col][row] = ...), which may
# write to a temporary copy and leave the frame unchanged.
crea.loc["cervello di bovino", "colesterolo | mg"] = 3100

In [10]:
# Multipliers applied to values of columns whose header contains the unit.
factors = {
    "mg":1e-3,
    "mcg":1e-6,
    "µg":1e-6
}

# Text columns that are left untouched by the numeric conversion.
string_columns = ["aminoacido limitante", "categoria", "nome scientifico"]
for c in tqdm(crea.columns):
    if c in string_columns:
        continue
    # The factor depends only on the header, so it is resolved once per column.
    # The first key of `factors` found in the header wins; None means the
    # values are converted to float but not rescaled.
    factor = next((f for unit, f in factors.items() if unit in c), None)
    for i in crea.index:
        # .at reads and writes the frame directly; chained indexing
        # (crea[c][i] = ...) may assign to a temporary copy instead.
        value = crea.at[i, c]
        if pd.isna(value):
            continue
        if value=="tr":
            # "tr" is replaced by a fixed 0.05 (presumably "trace" -- confirm).
            value = 0.05
        elif isinstance(value, str) and ">" in value:
            # Lower-bound entries such as ">5" keep only the numeric part.
            value = float(value.replace(">", ""))
        else:
            value = float(value)
        if factor is not None:
            value*=factor
        crea.at[i, c] = value

HBox(children=(IntProgress(value=0, max=69), HTML(value='')))




In [11]:
# Build the header mapping explicitly: every occurrence of a key of `factors`
# in a header is replaced by "g". If several keys occur in one header, the
# replacement for the last matching key is the one kept.
unit_renames = {}
for column in crea.columns:
    for unit in factors:
        if unit in column:
            unit_renames[column] = column.replace(unit, "g")
crea = crea.rename(columns=unit_renames)

In [13]:
# Write the final table to disk.
output_path = "../csv/crea.csv"
crea.to_csv(output_path)