In [10]:
import requests
import lxml
import json
import re
import time
from bs4 import BeautifulSoup

In [11]:
# Listing URL of catalog section 4577 on shop.relod.ru; the query string asks
# for sorting by PROPERTY_RATING in descending order.
base_url = 'https://shop.relod.ru/catalog-products/4577/?sort=PROPERTY_RATING&order=desc'

In [12]:
def log_progress(sequence, every=10):
    """Yield the items of `sequence` while showing a progress widget.

    A text label ("Page: i / total") is refreshed on every item; the progress
    bar itself is refreshed only on every `every`-th item and once more when
    the sequence is exhausted, so that it always ends up full.

    Args:
        sequence: a sized iterable (``len(sequence)`` must work).
        every: refresh the bar once per this many items; values below 1 are
            treated as 1.

    Yields:
        The items of `sequence`, in order.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    total = len(sequence)
    step = max(1, every)  # guards against a zero/negative refresh period

    progress = IntProgress(min=0, max=total, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    for index, record in enumerate(sequence):
        if index % step == 0:
            progress.value = index + 1
        label.value = '{name}: {index} / {len_}'.format(
                        name='Page',
                        index=index + 1,
                        len_ = total
                    )
        yield record

    # Without this the bar stays at the last refreshed position when
    # `every` does not divide the sequence length.
    progress.value = total

In [13]:
class BadPageError(Exception):
    """Raised when a page cannot be downloaded or parsed.

    Args:
        message: human-readable description of the failure.
        errors: optional extra details. They are stored on ``self.errors`` and,
            when provided, printed at construction time.
    """

    def __init__(self, message, errors=None):
        super().__init__(message)
        self.errors = errors
        # `errors` is optional so that callers may raise with a message only.
        if errors is not None:
            print('Printing Errors:')
            print(errors)

In [14]:
def get_page(url, n_attempts=5, t_sleep=1, timeout=30):
    """Download `url` with retries.

    Args:
        url: address to fetch with a GET request.
        n_attempts: maximum number of attempts.
        t_sleep: pause in seconds between two attempts.
        timeout: per-request timeout in seconds passed to ``requests.get``,
            so that a stalled connection cannot block forever.

    Returns:
        The response object of the first attempt that answered with status
        code 200, or ``None`` if no attempt did.
    """
    # Local import: the notebook imports `sys` only in a much later cell.
    import sys

    for attempt in range(n_attempts):
        try:
            page = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print('Error: ', e, file=sys.stderr)
        else:
            if page.status_code == 200:
                return page
        # No point in sleeping once the last attempt has failed.
        if attempt < n_attempts - 1:
            time.sleep(t_sleep)
    return None

## 1. Сохранение ссылок

Простой способ не работает, так как даже в рамках одной обкачки книги могут прыгать между страницами.

In [15]:
def urls_getter(url, n_pages=250):
    """Collect product links by walking the paginated listing (simple way).

    The notebook notes that this approach does not work reliably, because
    books may move between pages while the listing is being crawled.

    Args:
        url: listing URL; ``&PAGEN_1=<page number>`` is appended to it.
        n_pages: number of listing pages to visit, starting from page 1.

    Returns:
        A list of absolute product URLs, or ``None`` as soon as one listing
        page could not be downloaded.

    Raises:
        AttributeError: if a page lacks the expected catalog container.
    """
    urls_list = []
    for page_no in log_progress(range(1, n_pages + 1), 1):
        page = get_page(url + f'&PAGEN_1={page_no}')
        if page is None:
            return None

        soup = BeautifulSoup(page.content, 'html.parser')
        section = soup.find('div', class_='catalog-section bx-blue')
        row = section.find('div', class_='row')
        links = row.find_all('a', class_='bxr-item-image-wrap')
        # hrefs are site-relative, so prefix them with the shop origin
        urls_list.extend('https://shop.relod.ru' + link.attrs['href'] for link in links)
    return urls_list

In [16]:
def urls_getter_fast(url): ## Function for downloading links (fast way)
    """Collect product links through the catalog's AJAX endpoint.

    For page numbers 1..250 a POST request is sent to the site's
    ``catalog.section/ajax.php`` endpoint, the returned HTML is parsed, and the
    ``href`` of every ``a.bxr-item-image-wrap`` link found in the first ``row``
    div of the catalog section is collected.

    NOTE(review): the `url` parameter is never used in the body.
    NOTE(review): the response status code is not checked before parsing.

    Returns:
        A list of absolute product URLs, or ``None`` if a request raised
        ``requests.exceptions.RequestException``.
    """
    urls_list = []
    for i in log_progress(range(250), 1):
        # Form fields of the POST request; only PAGEN_1 (1-based page number)
        # changes between iterations. `template` and `parameters` are opaque
        # strings reproduced verbatim -- presumably captured from the site's
        # own "show more" request (looks like base64 data plus a hex
        # signature); TODO confirm and verify they are still accepted.
        data = {'siteId': 's1', 'action': 'showMore', 'PAGEN_1':i + 1,
                'template': '.default.5066805247fd90f4681c9458759019c47aeeee73c2035d99874d41cd9e7eb382',
                'parameters': 'YToxNzM6e3M6MTg6IkNPTVBPTkVOVF9URU1QTEFURSI7czo4OiIuZGVmYXVsdCI7czoxMToiSUJMT0NLX1RZUEUiO3M6NzoiY2F0YWxvZyI7czo5OiJJQkxPQ0tfSUQiO3M6MToiMiI7czoxMDoiU0VDVElPTl9JRCI7czo0OiI0NTc3IjtzOjEyOiJTRUNUSU9OX0NPREUiO047czoxMToiRklMVEVSX05BTUUiO3M6MTY6IkNBVEFMT0dfUFJPRFVDVFMiO3M6MTk6IklOQ0xVREVfU1VCU0VDVElPTlMiO3M6MToiQSI7czoxOToiU0hPV19BTExfV09fU0VDVElPTiI7czoxOiJOIjtzOjEzOiJDVVNUT01fRklMVEVSIjtzOjA6IiI7czoxODoiSElERV9OT1RfQVZBSUxBQkxFIjtzOjE6Ik4iO3M6MjU6IkhJREVfTk9UX0FWQUlMQUJMRV9PRkZFUlMiO3M6MToiWSI7czoxODoiRUxFTUVOVF9TT1JUX0ZJRUxEIjtzOjE1OiJQUk9QRVJUWV9SQVRJTkciO3M6MTg6IkVMRU1FTlRfU09SVF9PUkRFUiI7czo0OiJkZXNjIjtzOjE5OiJFTEVNRU5UX1NPUlRfRklFTEQyIjtzOjQ6Im5hbWUiO3M6MTk6IkVMRU1FTlRfU09SVF9PUkRFUjIiO3M6NDoiZGVzYyI7czoxNzoiT0ZGRVJTX1NPUlRfRklFTEQiO3M6MjoiaWQiO3M6MTc6Ik9GRkVSU19TT1JUX09SREVSIjtzOjM6ImFzYyI7czoxODoiT0ZGRVJTX1NPUlRfRklFTEQyIjtzOjU6InNob3dzIjtzOjE4OiJPRkZFUlNfU09SVF9PUkRFUjIiO3M6MzoiYXNjIjtzOjE4OiJQQUdFX0VMRU1FTlRfQ09VTlQiO3M6MjoiMjAiO3M6MTg6IkxJTkVfRUxFTUVOVF9DT1VOVCI7czoxOiI0IjtzOjIwOiJQUk9QRVJUWV9DT0RFX01PQklMRSI7YToyOntpOjA7czo4OiJBVFJfSVNCTiI7aToxO3M6MTM6IkFUUl9QVUJMSVNIRVIiO31zOjE3OiJPRkZFUlNfRklFTERfQ09ERSI7YTo2OntpOjA7czo0OiJOQU1FIjtpOjE7czoxMjoiUFJFVklFV19URVhUIjtpOjI7czoxNToiUFJFVklFV19QSUNUVVJFIjtpOjM7czoxMToiREVUQUlMX1RFWFQiO2k6NDtzOjE0OiJERVRBSUxfUElDVFVSRSI7aTo1O3M6MDoiIjt9czoyMDoiT0ZGRVJTX1BST1BFUlRZX0NPREUiO2E6Mjp7aTowO3M6NjoiTk9USUNFIjtpOjE7czowOiIiO31zOjEyOiJPRkZFUlNfTElNSVQiO3M6MToiMCI7czoyMDoiUFJPRFVDVF9ESVNQTEFZX01PREUiO3M6MToiWSI7czoxMzoiQUREX1BJQ1RfUFJPUCI7czoxNzoiQVRSX0VYVFJBX1BJQ1RVUkUiO3M6MTk6Ik9GRkVSX0FERF9QSUNUX1BST1AiO3M6MToiLSI7czoxNjoiT0ZGRVJfVFJFRV9QUk9QUyI7YToxOntpOjA7czo2OiJOT1RJQ0UiO31zOjIwOiJQUk9EVUNUX1NVQlNDUklQVElPTiI7czoxOiJZIjtzOjIxOiJTSE9XX0RJU0NPVU5UX1BFUkNFTlQiO3M6MToiWSI7czoxNDoiU0hPV19PTERfUFJJQ0UiO3M6MToiWSI7czoxNzoiU0hPV19NQVhfUVVBTlRJVFkiO3M6MToiQSI7czoyMToiQlhSX1NIT1dfTUFYX1FVQU5USVRZIjtzOjE6IkEiO3M6MTY6IlNIT1dfQ0xPU0VfUE9QVVAiO3M6MToiWSI7czoxMjoiTUVTU19CVE5fQlVZIjtzOjEyOiLQmtGD0L/QuNGC0
YwiO3M6MjI6Ik1FU1NfQlROX0FERF9UT19CQVNLRVQiO3M6MTc6ItCSINC60L7RgNC30LjQvdGDIjtzOjE4OiJNRVNTX0JUTl9TVUJTQ1JJQkUiO3M6NDQ6ItCj0LLQtdC00L7QvNC40YLRjCDQviDQv9C+0YHRgtGD0L/Qu9C10L3QuNC4IjtzOjE1OiJNRVNTX0JUTl9ERVRBSUwiO3M6MzQ6ItCf0L7QtNGA0L7QsdC90LXQtSDQviDRgtC+0LLQsNGA0LUiO3M6MTg6Ik1FU1NfTk9UX0FWQUlMQUJMRSI7czoyNDoi0J3QtdGCINCyINC90LDQu9C40YfQuNC4IjtzOjExOiJTRUNUSU9OX1VSTCI7czozMToiL2NhdGFsb2ctcHJvZHVjdHMvI1NFQ1RJT05fSUQjLyI7czoxMDoiREVUQUlMX1VSTCI7czozMzoiL2NhdGFsb2ctcHJvZHVjdHMvI0VMRU1FTlRfQ09ERSMvIjtzOjE5OiJTRUNUSU9OX0lEX1ZBUklBQkxFIjtzOjEwOiJTRUNUSU9OX0lEIjtzOjEwOiJDQUNIRV9UWVBFIjtzOjE6IkEiO3M6MTA6IkNBQ0hFX1RJTUUiO3M6ODoiMzYwMDAwMDAiO3M6MTI6IkNBQ0hFX0ZJTFRFUiI7czoxOiJZIjtzOjEyOiJDQUNIRV9HUk9VUFMiO3M6MToiWSI7czo5OiJTRVRfVElUTEUiO3M6MToiWSI7czoxMzoiQlJPV1NFUl9USVRMRSI7czoxOiItIjtzOjEzOiJNRVRBX0tFWVdPUkRTIjtzOjE6Ii0iO3M6MTY6Ik1FVEFfREVTQ1JJUFRJT04iO3M6MToiLSI7czoxNzoiU0VUX0xBU1RfTU9ESUZJRUQiO3M6MToiWSI7czoyNDoiVVNFX01BSU5fRUxFTUVOVF9TRUNUSU9OIjtzOjE6IlkiO3M6MTg6IkFERF9TRUNUSU9OU19DSEFJTiI7czoxOiJOIjtzOjE1OiJBQ1RJT05fVkFSSUFCTEUiO3M6NjoiYWN0aW9uIjtzOjE5OiJQUk9EVUNUX0lEX1ZBUklBQkxFIjtzOjI6ImlkIjtzOjEwOiJQUklDRV9DT0RFIjthOjE6e2k6MDtzOjEwOiJCQVNFX1BSSUNFIjt9czoxNToiVVNFX1BSSUNFX0NPVU5UIjtzOjE6Ik4iO3M6MTY6IlNIT1dfUFJJQ0VfQ09VTlQiO3M6MDoiIjtzOjE3OiJQUklDRV9WQVRfSU5DTFVERSI7czoxOiJZIjtzOjE2OiJDT05WRVJUX0NVUlJFTkNZIjtzOjE6IlkiO3M6MTE6IkNVUlJFTkNZX0lEIjtzOjM6IlJVQiI7czoxMDoiQkFTS0VUX1VSTCI7czoyMDoiL3BlcnNvbmFsL2Jhc2tldC5waHAiO3M6MjA6IlVTRV9QUk9EVUNUX1FVQU5USVRZIjtzOjE6IlkiO3M6MjU6IlBST0RVQ1RfUVVBTlRJVFlfVkFSSUFCTEUiO3M6ODoicXVhbnRpdHkiO3M6MjQ6IkFERF9QUk9QRVJUSUVTX1RPX0JBU0tFVCI7czoxOiJZIjtzOjIyOiJQUk9EVUNUX1BST1BTX1ZBUklBQkxFIjtzOjQ6InByb3AiO3M6MjY6IlBBUlRJQUxfUFJPRFVDVF9QUk9QRVJUSUVTIjtzOjE6IlkiO3M6MTg6IlBST0RVQ1RfUFJPUEVSVElFUyI7YTo0OntpOjA7czoxMDoiQVRSX0FVVEhPUiI7aToxO3M6MTI6IkFUUl9MQU5HVUFHRSI7aToyO3M6NzoiQVRSX0FHRSI7aTozO3M6MTA6IkFUUl9PUklHSU4iO31zOjIyOiJPRkZFUlNfQ0FSVF9QUk9QRVJUSUVTIjthOjE6e2k6MDtzOjY6Ik5PVElDRSI7fXM6MjA6IkFERF9UT19CQVNLRVRfQUNUSU9OIjtOO3M6MTU6IkRJU1BMQVlfQ
09NUEFSRSI7czoxOiJOIjtzOjE0OiJQQUdFUl9URU1QTEFURSI7czo4OiIuZGVmYXVsdCI7czoxNzoiRElTUExBWV9UT1BfUEFHRVIiO3M6MToiWSI7czoyMDoiRElTUExBWV9CT1RUT01fUEFHRVIiO3M6MToiWSI7czoxMToiUEFHRVJfVElUTEUiO3M6MTI6ItCi0L7QstCw0YDRiyI7czoxNzoiUEFHRVJfU0hPV19BTFdBWVMiO3M6MToiTiI7czoyMDoiUEFHRVJfREVTQ19OVU1CRVJJTkciO3M6MToiTiI7czozMToiUEFHRVJfREVTQ19OVU1CRVJJTkdfQ0FDSEVfVElNRSI7czo1OiIzNjAwMCI7czoxNDoiUEFHRVJfU0hPV19BTEwiO3M6MToiTiI7czoyMjoiUEFHRVJfQkFTRV9MSU5LX0VOQUJMRSI7czoxOiJOIjtzOjE1OiJQQUdFUl9CQVNFX0xJTksiO047czoxNzoiUEFHRVJfUEFSQU1TX05BTUUiO047czo5OiJMQVpZX0xPQUQiO3M6MToiWSI7czoxODoiTUVTU19CVE5fTEFaWV9MT0FEIjtzOjIzOiLQn9C+0LrQsNC30LDRgtGMINC10YnRkSI7czoxNDoiTE9BRF9PTl9TQ1JPTEwiO3M6MToiTiI7czoxNDoiU0VUX1NUQVRVU180MDQiO3M6MToiWSI7czo4OiJTSE9XXzQwNCI7czoxOiJZIjtzOjg6IkZJTEVfNDA0IjtzOjg6Ii80MDQucGhwIjtzOjExOiJNRVNTQUdFXzQwNCI7czowOiIiO3M6MTU6IkNPTVBBVElCTEVfTU9ERSI7czoxOiJZIjtzOjI4OiJESVNBQkxFX0lOSVRfSlNfSU5fQ09NUE9ORU5UIjtzOjE6Ik4iO3M6MjI6IlVTRV9FTkhBTkNFRF9FQ09NTUVSQ0UiO3M6MToiWSI7czoxMjoiRU5MQVJHRV9QUk9QIjtzOjA6IiI7czoxMjoiQ09NUEFSRV9QQVRIIjtzOjUwOiIvY2F0YWxvZy1wcm9kdWN0cy9jb21wYXJlLnBocD9hY3Rpb249I0FDVElPTl9DT0RFIyI7czoxNjoiTUVTU19CVE5fQ09NUEFSRSI7czoxNjoi0KHRgNCw0LLQvdC40YLRjCI7czoxMjoiQ09NUEFSRV9OQU1FIjtzOjIwOiJDQVRBTE9HX0NPTVBBUkVfTElTVCI7czoyMjoiTUVTU19TSE9XX01BWF9RVUFOVElUWSI7czoxNDoi0J3QsNC70LjRh9C40LUiO3M6MjQ6IlJFTEFUSVZFX1FVQU5USVRZX0ZBQ1RPUiI7czoyOiIxMCI7czoyNzoiTUVTU19SRUxBVElWRV9RVUFOVElUWV9NQU5ZIjtzOjEwOiLQvNC90L7Qs9C+IjtzOjI2OiJNRVNTX1JFTEFUSVZFX1FVQU5USVRZX0ZFVyI7czo4OiLQvNCw0LvQviI7czoxNzoiUVVBTlRJVFlfSU5fU1RPQ0siO3M6MTc6ItCSINC90LDQu9C40YfQuNC4IjtzOjE4OiJRVUFOVElUWV9PVVRfU1RPQ0siO3M6MTc6ItCf0L7QtCDQt9Cw0LrQsNC3IjtzOjI0OiJISURFX1NFQ1RJT05fREVTQ1JJUFRJT04iO3M6MToiWSI7czoxMjoiVEhJU19VTklDX0lEIjtzOjk6IjFfc2VjdGlvbiI7czo2OiJSRUdJT04iO3M6MDoiIjtzOjIwOiJCWFJfQUpBWF9SRUdJT05fSU5GTyI7czowOiIiO3M6Mjg6IkJYUkVBRFlfRUxFTUVOVF9BRERDTEFTU19CSUciO3M6MDoiIjtzOjMwOiJCWFJFQURZX0VMRU1FTlRfQUREQ0xBU1NfU01BTEwiO3M6MDoiIjtzOjMzOiJCWFJFQURZX0VMRU1FTlRfQUREQ0xBU1NfU1RBTkRBUlQiO3M6MDoiIjtzO
jI0OiJCWFJFQURZX0VMRU1FTlRfRFJBV19CSUciO3M6MTk6ImVjb21tZXJjZS5tMi5iaWcudjEiO3M6MjY6IkJYUkVBRFlfRUxFTUVOVF9EUkFXX1NNQUxMIjtzOjIxOiJlY29tbWVyY2UubTIuc21hbGwudjEiO3M6Mjk6IkJYUkVBRFlfRUxFTUVOVF9EUkFXX1NUQU5EQVJUIjtzOjE1OiJlY29tbWVyY2UubTIudjEiO3M6MzA6IkJYUkVBRFlfRUxFTUVOVF9FWFRfUEFSQU1TX0JJRyI7czoxMjoiYXJyRXh0UGFyYW1zIjtzOjMyOiJCWFJFQURZX0VMRU1FTlRfRVhUX1BBUkFNU19TTUFMTCI7czoxMjoiYXJyRXh0UGFyYW1zIjtzOjM1OiJCWFJFQURZX0VMRU1FTlRfRVhUX1BBUkFNU19TVEFOREFSVCI7czoxMjoiYXJyRXh0UGFyYW1zIjtzOjI4OiJCWFJFQURZX0xJU1RfTUFSS0VSX1RZUEVfQklHIjtzOjE1OiJyaWJib24udmVydGljYWwiO3M6MzA6IkJYUkVBRFlfTElTVF9NQVJLRVJfVFlQRV9TTUFMTCI7czozOiJub3QiO3M6MzM6IkJYUkVBRFlfTElTVF9NQVJLRVJfVFlQRV9TVEFOREFSVCI7czoxNToicmliYm9uLnZlcnRpY2FsIjtzOjMzOiJCWFJFQURZX0xJU1RfT1dOX01BUktFUl9VU0VfU01BTEwiO3M6MToiTiI7czozNjoiQlhSRUFEWV9MSVNUX09XTl9NQVJLRVJfVVNFX1NUQU5EQVJUIjtzOjE6Ik4iO3M6MjI6IkJYUkVBRFlfVVNFUl9UWVBFU19CSUciO3M6MToiTiI7czoyNDoiQlhSRUFEWV9VU0VSX1RZUEVTX1NNQUxMIjtzOjE6Ik4iO3M6Mjc6IkJYUkVBRFlfVVNFUl9UWVBFU19TVEFOREFSVCI7czoxOiJOIjtzOjI4OiJCWFJFQURZX1VTRV9FTEVNRU5UQ0xBU1NfQklHIjtzOjE6IlkiO3M6MzA6IkJYUkVBRFlfVVNFX0VMRU1FTlRDTEFTU19TTUFMTCI7czoxOiJZIjtzOjMzOiJCWFJFQURZX1VTRV9FTEVNRU5UQ0xBU1NfU1RBTkRBUlQiO3M6MToiWSI7czoyNjoiQlhSRUFEWV9WRVJUSUNBTF9BTElHTl9CSUciO3M6MToiWSI7czoyODoiQlhSRUFEWV9WRVJUSUNBTF9BTElHTl9TTUFMTCI7czoxOiJZIjtzOjMxOiJCWFJFQURZX1ZFUlRJQ0FMX0FMSUdOX1NUQU5EQVJUIjtzOjE6IlkiO3M6MjI6IkJYUl9JTUdfTUFYX0hFSUdIVF9CSUciO3M6MzoiMTgwIjtzOjI0OiJCWFJfSU1HX01BWF9IRUlHSFRfU01BTEwiO3M6MjoiOTAiO3M6Mjc6IkJYUl9JTUdfTUFYX0hFSUdIVF9TVEFOREFSVCI7czozOiIxODAiO3M6MjE6IkJYUl9JTUdfTUFYX1dJRFRIX0JJRyI7czozOiIxODAiO3M6MjM6IkJYUl9JTUdfTUFYX1dJRFRIX1NNQUxMIjtzOjI6IjkwIjtzOjI2OiJCWFJfSU1HX01BWF9XSURUSF9TVEFOREFSVCI7czozOiIxODAiO3M6Mjg6IkJYUl9QUk9EVUNUX0JMT0NLU19PUkRFUl9CSUciO3M6NTE6InBpY3R1cmUscmF0aW5nLG5hbWUsYXJ0aWNsZSxwcmV2aWV3dGV4dCxhY3Rpb250aW1lciI7czozMzoiQlhSX1BST0RVQ1RfQkxPQ0tTX09SREVSX1NUQU5EQVJUIjtzOjM5OiJwaWN0dXJlLG5hbWUsYWN0aW9udGltZXIsYXJ0aWNsZSxyYXRpbmciO3M6MjU6IkJYUl9TSE9XX0FDVElPTl9USU1FUl9CSUciO3M6MToiTiI7c
zoyNzoiQlhSX1NIT1dfQUNUSU9OX1RJTUVSX1NNQUxMIjtzOjE6Ik4iO3M6MzA6IkJYUl9TSE9XX0FDVElPTl9USU1FUl9TVEFOREFSVCI7czoxOiJOIjtzOjIwOiJCWFJfU0hPV19BUlRJQ0xFX0JJRyI7czoxOiJZIjtzOjIyOiJCWFJfU0hPV19BUlRJQ0xFX1NNQUxMIjtzOjE6IlkiO3M6MjU6IkJYUl9TSE9XX0FSVElDTEVfU1RBTkRBUlQiO3M6MToiWSI7czoyNToiQlhSX1NIT1dfUFJFVklFV19URVhUX0JJRyI7czoxOiJZIjtzOjE5OiJCWFJfU0hPV19SQVRJTkdfQklHIjtzOjM6ImF2ZyI7czoyMToiQlhSX1NIT1dfUkFUSU5HX1NNQUxMIjtzOjM6ImF2ZyI7czoyNDoiQlhSX1NIT1dfUkFUSU5HX1NUQU5EQVJUIjtzOjM6ImF2ZyI7czoxOToiQlhSX1NIT1dfU0xJREVSX0JJRyI7czoxOiJOIjtzOjIxOiJCWFJfU0hPV19TTElERVJfU01BTEwiO3M6MToiWSI7czoyNDoiQlhSX1NIT1dfU0xJREVSX1NUQU5EQVJUIjtzOjE6Ik4iO3M6Mjc6IkJYUl9TS1VfUFJPUFNfU0hPV19UWVBFX0JJRyI7czo2OiJzcXVhcmUiO3M6Mjk6IkJYUl9TS1VfUFJPUFNfU0hPV19UWVBFX1NNQUxMIjtzOjY6InNxdWFyZSI7czozMjoiQlhSX1NLVV9QUk9QU19TSE9XX1RZUEVfU1RBTkRBUlQiO3M6Njoic3F1YXJlIjtzOjIzOiJCWFJfU0xJREVSX0lOVEVSVkFMX0JJRyI7czo0OiIzMDAwIjtzOjI4OiJCWFJfU0xJREVSX0lOVEVSVkFMX1NUQU5EQVJUIjtzOjQ6IjMwMDAiO3M6Mjg6IkJYUl9TTElERVJfUFJPR1JFU1NfU1RBTkRBUlQiO3M6MToiTiI7czoyODoiQlhSX1RJTEVfU0hPV19QUk9QRVJUSUVTX0JJRyI7czoxOiJZIjtzOjMwOiJCWFJfVElMRV9TSE9XX1BST1BFUlRJRVNfU01BTEwiO3M6MToiTiI7czozMzoiQlhSX1RJTEVfU0hPV19QUk9QRVJUSUVTX1NUQU5EQVJUIjtzOjE6IlkiO3M6MjE6IkJYUl9VU0VfRkFTVF9WSUVXX0JJRyI7czoxOiJOIjtzOjIzOiJCWFJfVVNFX0ZBU1RfVklFV19TTUFMTCI7czoxOiJOIjtzOjI2OiJCWFJfVVNFX0ZBU1RfVklFV19TVEFOREFSVCI7czoxOiJOIjtzOjIyOiJNRVNTX0JUTl9GQVNUX1ZJRVdfQklHIjtzOjMxOiLQkdGL0YHRgtGA0YvQuSDQv9GA0L7RgdC80L7RgtGAIjtzOjI0OiJNRVNTX0JUTl9GQVNUX1ZJRVdfU01BTEwiO3M6MzE6ItCR0YvRgdGC0YDRi9C5INC/0YDQvtGB0LzQvtGC0YAiO3M6Mjc6Ik1FU1NfQlROX0ZBU1RfVklFV19TVEFOREFSVCI7czozMToi0JHRi9GB0YLRgNGL0Lkg0L/RgNC+0YHQvNC+0YLRgCI7czoyMDoiUFJPRFVDVF9ST1dfVkFSSUFOVFMiO3M6MTY2OiJbeydWQVJJQU5UJzonMycsJ0JJR19EQVRBJzpmYWxzZX0seydWQVJJQU5UJzonMycsJ0JJR19EQVRBJzpmYWxzZX0seydWQVJJQU5UJzonMycsJ0JJR19EQVRBJzpmYWxzZX0seydWQVJJQU5UJzonMycsJ0JJR19EQVRBJzpmYWxzZX0seydWQVJJQU5UJzonMycsJ0JJR19EQVRBJzpmYWxzZX1dIjtzOjEzOiJQUk9QRVJUWV9DT0RFIjthOjI6e2k6MDtzOjg6IkFUUl9JU0JOIjtpOjE7czoxMzoiQVRSX1BVQkxJU0hFU
iI7fXM6MTc6IkNVUlJFTlRfQkFTRV9QQUdFIjtzOjY1OiIvY2F0YWxvZy1wcm9kdWN0cy80NTc3Lz9zb3J0PVBST1BFUlRZX1JBVElORyZvcmRlcj1kZXNjJlBBR0VOXzE9MiI7czoxMToiUEFSRU5UX05BTUUiO3M6MjM6ImJ4cmVhZHkubWFya2V0MjpjYXRhbG9nIjtzOjIwOiJQQVJFTlRfVEVNUExBVEVfTkFNRSI7czoxMzoicmVsb2QtY2F0YWxvZyI7czoyMDoiUEFSRU5UX1RFTVBMQVRFX1BBR0UiO3M6Nzoic2VjdGlvbiI7czoxMzoiR0xPQkFMX0ZJTFRFUiI7YTowOnt9fQ==.28da89bd5427820782e2cf9d1b054c9dbe204262cb62c52317e723b800c1a37e'
                }
        try:
            page = requests.post('https://shop.relod.ru/bitrix/components/bxready.market2/catalog.section/ajax.php',
                                 data = data)
        except requests.exceptions.RequestException as e:
            # A single failed request aborts the whole crawl.
            print('Error: ', e)
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        # Links with class 'bxr-item-image-wrap' inside the first 'row' div of the catalog section.
        desc = soup.find('div', class_= 'catalog-section bx-blue').find('div', 
                                                                class_ = 'row').find_all('a', 
                                                                                         class_ = 'bxr-item-image-wrap')
        # hrefs are site-relative, so the shop origin is prepended
        urls_list += list(map(lambda x: 'https://shop.relod.ru' + x.attrs['href'], desc))
    return urls_list

In [8]:
# Collect the product links through the AJAX endpoint.
urls = urls_getter_fast(base_url)

VBox(children=(HTML(value=''), IntProgress(value=0, max=250)))

In [9]:
# Sanity check: the crawl must not contain the same link twice.
assert len(urls) == len(set(urls))

In [10]:
# Report the number of distinct links collected.
print('Число ссылок', len(set(urls)))

Число ссылок 5000


In [17]:
import pickle

In [13]:
# Save the collected links to 'urls.pickle' so they can be reloaded without re-running the crawl.
with open('urls.pickle', 'wb') as f:
    pickle.dump(urls, f)

In [18]:
# Reload the links from 'urls.pickle'.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load a file that this notebook produced itself.
with open('urls.pickle', 'rb') as f:
    urls = pickle.load(f)

## 2. Получение информации о книгах

In [19]:
def process_page(url):
    """Download one book page and parse it into a flat dictionary.

    Args:
        url: absolute URL of the book page.

    Returns:
        A dict with the page URL, title, marker labels, illustration URLs,
        rating, vote count, every row of the properties table, description,
        availability and price. Pages carrying the 'ПТВ' marker additionally
        get the discounted price when it can be found in the page scripts.

    Raises:
        BadPageError: if the page could not be downloaded or a parsed field
            is None.
        AttributeError: if one of the mandatory page elements is missing.
    """
    page = get_page(url)

    if page is None:
        raise BadPageError('Bad Page: cant get page', None)

    page_dict = {'url': url}

    soup = BeautifulSoup(page.content, 'html.parser')
    # main_block - the central part of the page holding every field we need
    main_block = soup.find('div', class_='row bxr-page-content')
    main_block = main_block.find('div', class_='row').find('div', class_='bxr-container-catalog-element')
    main_block = main_block.find('div', class_='bxr-cloud-all')

    page_dict['Название'] = main_block.find('h1', itemprop='name').text

    # Markers are optional: tolerate both an empty and a missing marker block.
    marker_block = main_block.find('div', class_='bxr-ribbon-marker-vertical')
    page_dict['Метки'] = re.findall(r'\w+', marker_block.text) if marker_block else []

    images = main_block.find_all('img', class_="bxr-zoom-img lazy", itemprop='image')
    # data-src values are protocol-relative, hence the 'https:' prefix
    page_dict['Иллюстрации'] = ['https:' + img.attrs['data-src'] for img in images]

    ratings = main_block.find('div', class_='bxr-rating-detail')  # rating block

    ratValue = ratings.find('meta', itemprop='ratingValue')
    ratCount = ratings.find('meta', itemprop='ratingCount')

    # Missing rating metadata is stored as 0.
    page_dict['Оценка'] = int(float(ratValue.attrs['content'])) if ratValue else 0
    page_dict['Число голосов'] = int(float(ratCount.attrs['content'])) if ratCount else 0

    table = main_block.find('table', class_='bxr-props-table')  # properties table
    names = table.find_all('td', class_='bxr-props-name')
    datas = table.find_all('td', class_='bxr-props-data')

    for name_cell, data_cell in zip(names, datas):
        name = name_cell.text.strip()
        value = re.sub(r'\s+', ' ', data_cell.text.strip())
        if name == 'Издатель':
            # drop the "(publisher's site)" link caption that follows the name
            value = value.replace('(сайт издательства)', '').strip()
        page_dict[name] = value

    page_dict['Описание'] = re.sub(r'\s+', ' ', main_block.find('div', class_='bxr-detail-tab-content').text.strip())

    page_dict['Наличие'] = main_block.find('div', itemprop='availability').text

    page_dict['Цена'] = float(main_block.find('meta', itemprop='price').attrs['content'])

    if 'ПТВ' in page_dict['Метки']:
        # The discounted price is only available inside an inline script.
        price_pattern = re.compile(r'PRICE"\s*:\s*(\d+(?:\.\d+)?)')
        for script in soup.find_all('script'):
            if script.contents and 'var ptvSettings' in script.contents[0]:
                match = price_pattern.search(script.contents[0])
                if match:
                    page_dict['Цена (скидка)'] = float(match.group(1))
                break  # only the first settings script is inspected

    for key in page_dict:  # make sure no field ended up as None
        if page_dict[key] is None:
            raise BadPageError(f'Bad Parse: {key} field is None', None)

    return page_dict

In [20]:
import gzip
import json
import codecs
import sys

from tqdm import tqdm
from multiprocessing.dummy import Pool, Queue

In [21]:
queue = Queue()   # queue of links to book pages

In [22]:
# Enqueue every collected link so that the worker threads can pull from the queue.
for url in urls:
    queue.put(url)

In [23]:
# Show how many links are waiting in the queue.
queue.qsize()

5000

In [None]:
def process_page_wrapper(i):
    """Worker body: drain the shared URL queue into one gzip'ed JSON-lines file.

    Every URL taken from the module-level `queue` is parsed with
    `process_page`; the resulting record is written as one JSON line to
    ``data/part_<i>.jsonl.gz``. A page that fails is reported on stderr and
    stored as an empty record. The module-level `pbar` is advanced under
    `lock` after every URL.

    Args:
        i: worker number, used only to name the output file.
    """
    import os
    # The standard-library module; this import does not touch the notebook's
    # `queue` variable, which stays the shared Queue instance.
    from queue import Empty

    os.makedirs('data', exist_ok=True)  # safe when several workers race here

    with gzip.open('data/part_{:05d}.jsonl.gz'.format(i), mode='wb') as raw_file:
        f_json = codecs.getwriter('utf8')(raw_file)

        while True:
            # empty() followed by a blocking get() can hang forever when
            # another worker takes the last item in between, so fetch
            # without blocking instead.
            try:
                url = queue.get_nowait()
            except Empty:
                break

            try:
                record = process_page(url)
            except Exception as e:
                with lock:
                    print(url, e, file=sys.stderr)
                record = dict()

            record_str = json.dumps(record, ensure_ascii=False)
            print(record_str, file=f_json)

            # the counter has to be updated atomically
            with lock:
                pbar.update(1)


# Number of worker threads; each worker gets its own number as the argument
# of process_page_wrapper. Kept in a constant instead of reading the pool's
# private `_processes` attribute.
N_WORKERS = 8

with Pool(processes=N_WORKERS) as pool, tqdm(total=queue.qsize()) as pbar:
    # `lock` and `pbar` are read as globals by process_page_wrapper
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(N_WORKERS))

100%|█████████████████████████████████████████████████████████████████████████████▉| 4996/5000 [29:25<00:01,  3.22it/s]

## 3. Объединение результатов

In [1]:
# Scratch check: a tuple is used as a dictionary key.
a = {}

a[(1,2,3)] = 1

In [64]:
import pandas as pd
import os

from itertools import chain         # рекомендуется использовать
from contextlib import ExitStack    # рекомендуется использовать

from typing import Generator, Dict, Any

In [82]:
def records_reader(dirname: str) -> Generator[Dict[str, Any], None, None]:
    """Yield every JSON record stored in the ``*.jsonl.gz`` files of `dirname`.

    Files are read one after another in sorted name order, so only a single
    file is open at any time. Files with another extension are ignored and
    blank lines are skipped.

    Args:
        dirname: directory holding the gzip'ed JSON-lines part files.

    Yields:
        One decoded record (a dict) per non-blank line.
    """
    filenames = sorted(name for name in os.listdir(dirname) if name.endswith('.jsonl.gz'))

    for filename in filenames:
        with gzip.open(os.path.join(dirname, filename), mode='rt', encoding='utf8') as part:
            for line in part:
                if line.strip():
                    yield json.loads(line)


books_raw = pd.DataFrame(records_reader('data'))

# Position of every link in the original crawl order. A dictionary lookup
# replaces a linear `urls.index(...)` scan per row; setdefault keeps the first
# position of a link, exactly as list.index would.
url_position = {}
for position, link in enumerate(urls):
    url_position.setdefault(link, position)

df = (
    books_raw
    .dropna(subset=['url'])  # pages that failed were stored as empty records
    .assign(index=lambda frame: frame['url'].map(url_position))  # add the link's ordinal number
    .sort_values(by=['index'])  # restore the original order
    .drop('index', axis=1)  # the helper column is no longer needed
)
df.to_csv('hw_3.csv', index=False)

In [83]:
df.head()

Unnamed: 0,url,Название,Метки,Иллюстрации,Оценка,Число голосов,ISBN,Издатель,Автор,Язык,...,Размер (мм),Описание,Наличие,Цена,Серия,Носитель,Читательская аудитория,Цена (скидка),Тип продукта,Издание
0,https://shop.relod.ru/catalog-products/the_cas...,The Casual Vacancy,[],[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,4,2,9780751552867,Sphere,Rowling J.K.,English,...,198(д) х 153(ш) х 38(в),Когда Барри Фэйрбразер умирает в начале сороко...,В наличии,892.0,,,,,,
614,https://shop.relod.ru/catalog-products/zorro/,Zorro,"[Sale, ПТВ]",[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,3,6,9780007201983,Harper Collins,Allende Isabel,English,...,198(д) х 129(ш) х 27(в),Bestselling author Isabel Allende's first adul...,В наличии,182.0,,,General (US: Trade),145.6,,
1234,https://shop.relod.ru/catalog-products/wives_a...,Wives and Daughters,[Sale],[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,5,1,9780199538263,Oxford University Press,Gaskell Elizabeth,English,...,195(д) х 128(ш) х 36(в),"Wives and Daughters, Elizabeth Gaskell's last ...",Под заказ,139.0,OWC (Oxford World Classics),,,,,
4388,https://shop.relod.ru/catalog-products/the_sil...,The Silmarillion,[],[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,5,1,9780261102736,Harper Collins,Tolkien J.R.R.,English,...,178(д) х 111(ш) х 29(в),Designed to take fans of The Hobbit and The Lo...,Под заказ,678.0,,,,,,
1857,https://shop.relod.ru/catalog-products/the_ess...,The Essential Tales of Chekhov,[],[https://opt-1458870.ssl.1c-bitrix-cdn.ru/uplo...,5,1,9781862073005,Granta,Chekhov Anton,English,...,198(д) х 129(ш) х 26(в),"In this collection of 20 short stories, the ed...",В наличии,969.0,,,,,,


In [7]:
def f():
    """Demonstrate try/except/else control flow.

    Prints 'a'; when that raises no exception the `else` branch prints 'c' and
    returns, so 'd' is printed only after the `except` branch has run.

    Returns:
        None
    """
    try:
        print('a')
    except Exception:  # a bare `except:` would also swallow KeyboardInterrupt
        print('b')
    else:
        print('c')
        return None

    print('d')

In [8]:
# Run the try/except/else demo.
f()

a
c
