In [15]:
from bs4 import BeautifulSoup
import requests
import unicodedata
import pandas as pd
from multiprocessing.pool import ThreadPool as Pool
from threading import Lock

In [16]:
def extract_book_list(author_id):
    book_ids = []
    author_url = f"https://www.moscowbooks.ru/catalog/author/{author_id}"
    author_html = requests.get(author_url).text
    soup = BeautifulSoup(author_html, 'lxml')
    pages = soup.find('a', title='перейти на последнюю страницу')
    if pages is None:
        pages = 1
    else:
        pages = int(pages['data-ajaxpage'])
    for i in range(1,pages+1):
        page_url = f"https://www.moscowbooks.ru/catalog/author/{author_id}/?page={i}"
        page_html = requests.get(page_url).text
        soup = BeautifulSoup(page_html, 'lxml')
        page_book_ids = soup.find_all('a', {'data-prices':True})
        for j in page_book_ids:
            book_ids.append(j['data-productid'])
    return book_ids

In [17]:
def extract_book_info(book_id):
    book_url = f'https://www.moscowbooks.ru/book/{book_id}'
    book_html = requests.get(book_url).text
    soup = BeautifulSoup(book_html, 'lxml')
    
    book_info = {"Код товара": book_id}
    
    author_name = soup.find('div', class_='page-header__author').find_all('a', class_="author-name")
    for i in range(len(author_name)):
        author_name[i] = author_name[i].text
    book_info["Автор"] = ', '.join(author_name)
    
    book_name = soup.find('span', class_="link-gray-light").text
    book_name = unicodedata.normalize("NFKD", book_name)
    book_info["Название"] = book_name
    
    book_info["Обложка"] ='https://www.moscowbooks.ru' + soup.find('img', class_="book__img book__img_default gallery__img")['src']
    
    book_info["Рейтинг"] = int(soup.find('div', class_="book___rating-stars")['data-rate'])
    
    book_stickers = soup.find('div', class_='book__stickers')
    if not book_stickers:
        book_stickers = None
    else:
        book_stickers = book_stickers.find_all('div', class_='label')
        for i in range(len(book_stickers)):
            book_stickers[i] = book_stickers[i].text
        book_stickers = ', '.join(book_stickers)
        book_info["Стикеры"] = book_stickers
    
    book_instock = (soup.find('div', class_="book__shop-details").text != '\n')
    book_info["Наличие"] = book_instock
    
    if book_instock:
        book_price = soup.find('div', class_="book__price").text
        book_price = int(book_price.split()[0])
        book_info['Цена'] = book_price
    
    book_description = soup.find('div', class_='book__description')
    if not (book_description is None):
        if not (book_description.a is None):
            book_description.a.extract()
        if not (book_description.b is None):
            book_description.b.extract()
            for br in book_description.find_all("br"):
                br.replace_with("\n")
            book_description = book_description.text.strip('\n')
            book_info['Описание'] = book_description
    
    book_table = soup.find_all('dl', class_='book__details-item')
    if book_table:
        book_dict = {node.find('dt', class_='book__details-name').text.strip():node.find('dt', class_='book__details-value').text.strip() for node in book_table}
        book_info.update(book_dict)
    
    return book_info

In [18]:
author_ids = {
  "Достоевский Ф. М.": 9150,
  "Роллинс Дж.": 59396,
  "Фицджеральд Ф. С.": 28727,
  "Глуховский Д. А.": 53427,
  "Стругацкий А. Н.": 26268,
  "Лукьяненко С. В.": 16626,
  "Фрай М.": 28927,
  "Хантер Э.": 37969,
  "Роулинг Дж. К.": 104832
}

In [19]:
book_ids = []
for i,j in a.items():
    book_ids.extend(extract_book_list(j))

In [20]:
mutex = Lock()
n_processed = 0

def func_wrapper(book_ids):
    res = extract_book_info(book_ids)
    with mutex:
        # в этом блоке можно безопасно менять общие объекты для процессов
        global n_processed
        n_processed += 1
        if n_processed % 10 == 0:
            print(f"\r{n_processed} books are processed...", end='', flush=True)
    return res

with Pool(processes=20) as pool:
    result = pool.map(func_wrapper, book_ids)
df = pd.DataFrame(result)
df.sort_values(by=['Код товара'], inplace=True)
df.head()

240 books are processed...

Unnamed: 0,ISBN:,Автор,Артикул:,Бумага:,В продаже с:,Вес:,Возраст:,Год издания:,Год производства:,Издательство:,...,Стикеры,Страниц:,Тип обложки:,Тираж:,Тиснение:,Формат:,Футляр:,Цена,Язык оригинала:,Язык текста:
85,978-5-17-114038-0,Фицджеральд Ф. С.,ASE000000000842319,,27.07.2019,260 гр.,,2019,,АСТ,...,,320,Мягкая обложка,2000 экз.,,60х84 1/16,,224,,английский
2,978-5-17-115705-0,Достоевский Ф. М.,ASE000000000843846,,30.07.2019,240 гр.,,2019,,АСТ,...,,384,Мягкая обложка,5000 экз.,,76х100 1/32,,144,,русский
120,978-5-905909-26-9,"Стругацкий А. Н., Стругацкий Б. Н.",978-5-905909-26-9,,31.07.2019,490 гр.,,2019,,Издательство Сидорович,...,,384,Твердый переплет (ткань),350 экз.,,84х108 1/32,,3,,русский
121,978-5-905909-27-6,"Стругацкий А. Н., Стругацкий Б. Н.",978-5-905909-27-6,,31.07.2019,760 гр.,,2019,,Издательство Сидорович,...,,624,Твердый переплет (ткань),350 экз.,,84х108 1/32,,3,,русский
158,978-5-17-117177-3,Лукьяненко С. В.,ASE000000000845549,,15.08.2019,655 гр.,,2019,,АСТ,...,,672,Твердый переплет,2000 экз.,,60х90 1/16,,566,,русский


In [21]:
with open('data/hw_3.csv', mode='w', encoding='utf-8') as f_csv:
    df.to_csv(f_csv, index=False)