In [75]:
from tinycrawler import TinyCrawler, Log, Statistics
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import requests
from requests import Response
from urllib.parse import urlparse

import os
import json
import bs4
import pandas as pd


def html_sanitization(html: str) -> str:
    """Return sanitized html."""
    return html.replace("⌊", "")


def get_product_name(response: Response) -> str:
    """Return product name from given Response object."""
    name = bs4.BeautifulSoup(response.text).find("h1").get_text()
    print(name)
    return name


def parse_tables(html: str, path: str, strainer: SoupStrainer):
    """Parse table at given strained html object saving them as csv at given path."""
    for table in BeautifulSoup(html, "lxml", parse_only=strainer)("table"):
        df = pd.read_html(html_sanitization(str(table)))[0]
        table_name = df[0][0].lower().strip().replace(" ","_")
        df = df.drop(0)
        df = df.set_index(0)
        df.to_csv("{path}/{table_name}.csv".format(path=path, table_name=table_name))


def parse(response: Response):
    path = "{root}/{product}".format(
        root=urlparse(response.url).netloc, product=get_product_name(response))
    if not os.path.exists(path):
        os.makedirs(path)
    parse_tables(
        response.text, path,
        SoupStrainer(
            "table"))


In [76]:
url = "https://www.cibo360.it/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&CODE=000090"
parse(requests.get(url))

 Orzo perlato (Hordeum vulgare)


In [77]:
text = requests.get("https://www.cibo360.it/cgi-bin/db/datafind1.cgi").text

In [80]:
bs4.BeautifulSoup(text)("a")

[<a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=407010">Liquori da dessert</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=009110">Margarina -2/3 di grassi animali, 1/3 di grassi vegetali</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=106270">Pollo, ala con pelle cruda</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=005506">Melanzane, cotte [saltate in padella senza aggiunta di grassi e di sale]</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=122410">Salmone, affumicato</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=106913">Tacchino, fuso, senza pelle, cotto [in forno senza aggiunta di grassi e di sale e scolato dal grasso prodotto con la cottura]</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=005600">Peperoni crudi</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=208520">Crostata con marmellata di albicocche, tipo industriale</a>,
 <a href="/cgi-bin/db/post_vn1.cgi?ID_UTENTE=&amp;CODE=119010">

In [79]:
text

'<html>\n\t\t\t<head>\n\t\t\t<title>Cibo 360 - Database alimenti - Ricerca per alimento</title>\n\n\t\t\t<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n\t\t\t<link rel="stylesheet" href="/css/common.css" type="text/css">\n\t\t\t<link rel="stylesheet" href="/css/calcolo/calcolo.css" type="text/css">\t\t\t\n\t\t\t<link rel="stylesheet" href="/css/header.css" type="text/css">\n\t\t\t<link rel="stylesheet" href="/css/calcolo/database.css" type="text/css">\n\t\t\t<script language="JavaScript">\n\t\t\t<!--\n\t\t\tfunction MM_openBrWindow(theURL,winName,features) { //v2.0\n\t\t\t  window.open(theURL,winName,features);\n\t\t\t}\n\t\t\t//-->\n\t\t\t</script>\n\t\t</head>\n\t<BODY bgColor=#ffffff leftMargin=0 topMargin=0 MARGINWIDTH="0" MARGINHEIGHT="0" >\n\n\t<MAP name=menu_sec>\n\t  <AREA shape=RECT alt="Contacalorie" coords=4,45,95,67\n\thref="/cgi-bin/login1.cgi?LOGIN=CONTA&ID_UTENTE=">\n\t  <AREA shape=RECT alt="Calcola Ricetta" coords=120,43,182,66\n\thref="/cgi-