In [1]:
import collections
import os
import re
import time
import xml.etree.ElementTree
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# HTTP request headers that make the scraper look like a desktop Safari browser.
st_accept = "text/html"
st_useragent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/15.4 Safari/605.1.15"
)
headers = {"Accept": st_accept, "User-Agent": st_useragent}

# Site root (with a trailing slash) and the product category pages, given
# relative to that root, that the crawl starts from.
NIIKM_BASE_LINK = "https://www.niikm.ru/"
NIIKM_LINKS = [
    "products/azot/",
    "products/argon/",
    "products/acetylene/",
    "products/hydrogen/",
    "products/helium/",
    "products/carbon_dioxide/",
    "products/oxygen/",
    "products/krypton/",
    "products/xenon/",
    "products/methane/",
    "products/neon/",
    "products/spbt/",
]

# Path prefix for temporary dump files.
PANDAS_CRUTCH_BASE = "/content/temp"

# Pause between consecutive HTTP requests, in seconds (passed to time.sleep).
REQUEST_DELAY = 3

In [None]:
def parse_url(url, recursive_call, timeout=30):
    """Download one page and dump its product data into a text file.

    The output file is created in the current working directory and named
    after the page title (``<title>.txt``). For a product page the file
    receives the title, the paragraphs of the product description and the
    list of basic properties; every product subtype linked from the page is
    then parsed recursively into a file of its own. For any other page
    (treated as an article) only the title line is written.

    Progress and problems are reported with ``print``; the function never
    raises for a failed request, it just skips the page.

    Args:
        url: Absolute URL of the page to parse.
        recursive_call: True when the page is a product subtype reached from
            another product page. In that case the index of the properties
            tab is shifted by one (see ``_write_product_properties``).
        timeout: Seconds to wait for the server before giving up on the
            request.

    Returns:
        None.
    """
    print("Parsing html from \"" + url + "\".")

    try:
        # ``headers`` has to be passed by keyword: the second positional
        # argument of requests.get() is ``params``, i.e. the query string.
        req = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        print("    ERROR: The request failed: " + str(error) + ".")
        return

    if req.status_code != 200:
        print("    ERROR: The request status code is " + str(req.status_code) + ".")
        return

    soup = bs(req.text, 'lxml')

    page_title = soup.find("div", attrs={"class": "title-page"})
    if page_title is None:
        print("    ERROR: Could not find page title.")
        return
    title = page_title.text.strip().replace("\n", "")

    # Required for distinguishing articles and product pages: a product page
    # has a description block, a product card, or both.
    product_description = soup.find(
        "div",
        attrs={"class": "product-card-dbl__col product-card-dbl__description-product"},
    )
    product_card = soup.find("div", attrs={"class": "product-card"})
    is_product_page = product_description is not None or product_card is not None

    # A "/" in the title would be interpreted as a directory separator.
    with open(title.replace("/", "_") + ".txt", 'w', encoding="utf-8") as f:
        f.write(title + "\n")

        if is_product_page:
            print("    INFO: The page is a product page.")
            _write_product_description(f, product_description)
            _write_product_properties(f, soup, product_card, recursive_call)
            _parse_product_subtypes(soup, timeout)
        else:
            # TODO: add code for articles.
            print("    INFO: The page is an article page. Skipping.")

    print("    Done.")


def _write_product_description(out, product_description):
    """Write the text of every direct <p> child of the description block."""
    if product_description is None:
        print("    INFO: Could not find the product description.")
        return

    print("    Parsing the product description...")
    for child in product_description.children:
        if child.name == "p":
            out.write(child.text.strip().replace("Мы предлагаем:", "") + "\n")
    print("        Done.")


def _write_product_properties(out, soup, product_card, recursive_call):
    """Write the "name: value" pairs found in the properties tab of the card."""
    if product_card is None:
        print("    INFO: Could not find the product card.")
        return

    print("    Parsing the product card...")

    # The tab buttons and the tab contents are matched by position: locate
    # the button whose caption mentions the properties.
    properties_tab_index = None
    for index, button in enumerate(product_card.find_all("button")):
        if "свойства" in button.text:
            properties_tab_index = index
            break

    if properties_tab_index is None:
        print("        INFO: Could not find the tab with the properties.")
    else:
        # And another crutch. For some reason find_all() sometimes returns
        # the parent tag as a part of the result set.
        if recursive_call:
            properties_tab_index += 1

        tabs_wrap = soup.find("div", attrs={"class": "product-card__tabs-content-wrap"})
        tabs = tabs_wrap.find_all("div") if tabs_wrap is not None else []

        if properties_tab_index >= len(tabs):
            print("        INFO: Could not find the content of the properties tab.")
        else:
            out.write("Основные свойства:\n")
            for child in tabs[properties_tab_index].children:
                if child.name == "ul":
                    for item in child.children:
                        if item.name != "li":
                            continue
                        spans = item.find_all("span")
                        # A property needs both a name and a value span.
                        if len(spans) >= 2:
                            out.write(spans[0].text + ": " + spans[1].text + "\n")
                elif child.name == "p":
                    out.write(child.text + ":\n")
    print("        Done.")


def _parse_product_subtypes(soup, timeout):
    """Run parse_url on every product subtype linked from the page."""
    product_assortment = soup.find("div", attrs={"class": "product-card-dbl__product-range-wrap"})
    if product_assortment is None:
        print("    INFO: The page does not contain references to the product subtypes.")
        return

    print("    Starting the recursive traversal of the product subtypes...\n")
    for child in product_assortment:
        if child.name != "a":
            continue
        href = child.get("href")
        if not href:
            continue

        # Pause only before a real request, not for every text node that
        # happens to sit between the links.
        time.sleep(REQUEST_DELAY)
        # urljoin() resolves both "/path" and "path" against the site root
        # and leaves absolute URLs untouched.
        parse_url(urljoin(NIIKM_BASE_LINK, href), True, timeout=timeout)
    print("\n        Done.")

# Crawl every product category in turn, pausing after each one so that the
# requests are spaced REQUEST_DELAY seconds apart.
for category_link in NIIKM_LINKS:
    category_url = NIIKM_BASE_LINK + category_link
    parse_url(category_url, False)
    time.sleep(REQUEST_DELAY)
