Imports

In [1]:
from bs4 import BeautifulSoup
from requests import Session
from tqdm import tqdm
import pandas as pd

In [4]:
import settings
# Listing page of the product catalogue; get_page() requests it with
# pagination query parameters.
BASE_URL = "https://www.ventos.com/en/products/product-catalogue"

In [5]:
def get_page(count: int, offset: int = 0):
    """Fetch one slice of the product catalogue listing.

    Args:
        count: Number of catalogue items to request (sent as ``limit1``).
        offset: Index of the first item to request (sent as ``limitstart1``).

    Returns:
        The response object produced by the shared session.

    Raises:
        Whatever ``raise_for_status`` raises when the server answers with an
        error status.
    """
    query = {"limit1": count, "limitstart1": offset}
    response = sess.get(BASE_URL, params=query)
    # Echo the final URL and status so progress is visible in the output.
    print(response.request.url)
    print(response.status_code)
    response.raise_for_status()
    return response

In [6]:
def get_detail_page(url: str)->dict:
    """Scrape the label/value pairs from a product's detail page.

    Args:
        url: Address of the detail page. A falsy value (``None`` or ``""``)
            is accepted and yields an empty result without issuing a request.

    Returns:
        A dict mapping each stripped label text to its stripped field text,
        collected from the ``evsa-textes-specs`` and
        ``evsa-textes-especificacions`` sections. The dict is empty when the
        URL is missing, the response is not OK, or the expected markup is
        absent.
    """
    chem = {}
    if not url:
        # Callers may hand over None when no detail link was found;
        # requesting it would only raise an invalid-URL error.
        return chem

    resp = sess.get(url)
    if not resp.ok:
        print(f"Requested URL {url}, response was not OK: {resp.status_code}")
        return chem

    soup = BeautifulSoup(resp.content, features="html.parser")
    info_body = soup.find("div", class_="evsa-textes")
    if info_body is None:
        return chem

    # Both sections use the same row markup. The technical block is read
    # first, so a duplicate label is overwritten by the specification block.
    for section_class in ("evsa-textes-specs", "evsa-textes-especificacions"):
        section = info_body.find("div", class_=section_class)
        if section is None:
            continue
        for row in section.find_all("div", class_="row"):
            label = row.find("div", class_="evsa-ls-label")
            field = row.find("div", class_="evsa-ls-field")
            if label is None or field is None:
                # Layout rows without a label/field pair carry no data.
                continue
            chem[label.text.strip()] = field.text.strip()
    return chem

Session setup

In [7]:
# Shared HTTP session; get_page() and get_detail_page() send all their
# requests through it.
sess = Session()

Fetch pages

In [None]:
# Walk the catalogue listing page by page and collect one dict per product.
chems = []
# NOTE(review): starting at 2000 skips the earlier catalogue entries —
# presumably a resume point from a previous run; confirm before a full scrape.
offset = 2000
steps = 300
while True:
    print(f"Requesting {steps} items from catalog for Offset {offset}")
    page = get_page(steps, offset)
    soup = BeautifulSoup(page.content, features="html.parser")
    offset += steps
    chem_list = soup.find_all("li", class_="evsa-ls-element")
    # Fewer than two list elements is treated as the end of the catalogue.
    if len(chem_list) < 2:
        break
    for chem_body in chem_list:
        chem = {settings.name: ""}
        name_tag = chem_body.find(class_="evsa-ls-name")
        if name_tag:
            # The name is read from the link inside the heading; fall back to
            # the heading's own text when that link is missing.
            name_link = name_tag.a
            chem[settings.name] = (name_link if name_link is not None else name_tag).text
        else:
            print("Could not find HTML tag 'evsa-ls-name'")

        fields = chem_body.find(class_="evsa-ls-fields")
        if fields:
            for field in fields.find_all(class_="evsa-ls-label"):
                # Key is the lower-cased label text up to the first colon.
                key = field.text.strip().lower().split(":")[0]
                # The value is whatever node directly follows the label.
                value_node = field.next_sibling
                chem[key] = value_node.text if value_node is not None else ""

        tech_info = chem_body.find("a", string="TECHNICAL DATA")
        tech_url = tech_info.get("href") if tech_info else None
        chem[settings.tech] = tech_url
        # The second-to-last path segment of the link is stored as the id.
        chem[settings.manufactur_id] = tech_url.split("/")[-2] if tech_url else None
        chem[settings.kosher] = bool(chem_body.find("a", string="KOSHER"))
        chems.append(chem)

In [None]:
# Enrich every catalogue entry with the fields scraped from its detail page.
for chem in tqdm(chems):
    detail_url = chem[settings.tech]
    if not detail_url:
        # No detail link was recorded for this entry, so there is nothing to
        # fetch; requesting None would raise an invalid-URL error.
        continue
    chem.update(get_detail_page(detail_url))

In [14]:
# Build the result table, order it by manufacturer id and export it as a
# semicolon-separated CSV file.
data_frame = pd.DataFrame(chems)
# sort_values returns a new frame instead of sorting in place; rebind the
# name so the exported file is actually sorted.
data_frame = data_frame.sort_values(by=['manufactur_id'])
data_frame.to_csv('./out/chems.csv', index = False, sep=';')