In [1]:
import logging
# Configure the root logger: records of level WARNING and above are emitted
# with a timestamp, the level name and a tab-separated message.
# logging.info(...) calls elsewhere in this notebook are below this threshold.
# Uncomment the `filename` argument to send the records to a file.
logging.basicConfig(
#         filename='file1.log',
        level=logging.WARNING,
        format='%(asctime)s %(levelname)s:\t%(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

In [2]:
import re
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm_notebook
import sys
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
from lxml import etree, html as lhtml
import pandas as pd

In [3]:
# Input file for the first stage: one author id per line.
authors_txt = 'hw3-data-retrieval/authors.txt'
# Output of the first stage and input of the second: one book URL per line.
book_links_txt = 'book_links.txt'
# Path of the plain-text log that process_page appends to; it is reassigned
# to 'log_file_2.txt' further down, before the second stage runs.
log_file_txt = 'log_file_1.txt'
# Base URL that relative links found on the site are appended to.
site_name = 'https://www.respublica.ru'

# Number of GET attempts made for a URL before it is given up on.
attempts_number = 2

In [26]:
def process_one_author_books(site_name, author_id, f_write):
    """Write the URL of every book listed for one author to ``f_write``.

    The first listing page of the author is downloaded to read the total
    number of books, the remaining listing pages are downloaded concurrently
    in a thread pool, and every book link found is written to ``f_write`` as
    ``site_name + href``, one URL per line.

    Args:
        site_name: Base URL of the site, without a trailing slash.
        author_id: Identifier appended to ``site_name + '/authors/'``.
        f_write: Writable text file object that receives the URLs.

    Returns:
        None. Download failures are logged and the affected page is skipped.

    Notes:
        The retry count is read from the module-level ``attempts_number``.
    """
    books_per_page = 22   # listing page size the pagination arithmetic relies on
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    def fetch(curr_address):
        """GET ``curr_address`` with retries; return the response or None."""
        for i in range(1, attempts_number+1):
            try:
                r_get = requests.get(curr_address, timeout=request_timeout)
            except requests.RequestException:
                # Connection errors and timeouts count as a failed attempt
                # instead of aborting the whole scraping run.
                r_get = None
            if r_get is not None and r_get.ok:
                return r_get
            logging.error('bad get request ({}/{}):\t{}'.format(i, attempts_number, curr_address))
            time.sleep(0.1)
        return None

    def count_pages(author_soup):
        """Return the number of listing pages announced by the page counter."""
        total_books_el = author_soup.find('div', class_="rd-listing-count")
        found = None
        if total_books_el is not None:
            found = re.search(r'из (\d+)', total_books_el.text)
        if found is None:
            # No readable counter: fall back to the page that is already loaded.
            return 1
        total_books = int(found.group(1))
        return -(-total_books // books_per_page)  # ceiling division

    def extract_links(author_soup, curr_address):
        """Return the absolute book URLs found on one listing page."""
        books_descs = author_soup.find_all('a', itemprop='name')
        # find_all returns an empty list (never None) when nothing matches.
        if not books_descs:
            logging.error('no books found:\t{}'.format(curr_address))
            return []
        logging.info('success:\t\t{}'.format(curr_address))
        return [site_name + book['href'] for book in books_descs]

    def process_fork_page(curr_address):
        """Download one follow-up listing page and return its book URLs."""
        r_get = fetch(curr_address)
        if r_get is None:
            return []
        return extract_links(BeautifulSoup(r_get.text, 'lxml'), curr_address)

    address_author_id = site_name + '/authors/' + author_id
    r_get = fetch(address_author_id)
    if r_get is None:
        return

    author_soup = BeautifulSoup(r_get.text, 'lxml')
    total_pages = count_pages(author_soup)

    links = extract_links(author_soup, address_author_id)
    pages = [address_author_id + '?page=' + str(curr_page)
             for curr_page in range(2, total_pages+1)]
    if pages:
        # A pool needs at least one worker, so it is only created when there
        # are follow-up pages to download.
        with ThreadPool(processes=len(pages)) as pool:
            for page_links in pool.map(process_fork_page, pages):
                links.extend(page_links)

    # All writing happens here, in the calling thread, so lines produced by
    # different worker threads cannot interleave in the output file.
    for link in links:
        print(link, file=f_write)

In [27]:
def count_lines(filename):
    """Return the number of lines in the text file ``filename``.

    The file is read line by line rather than loaded in one piece. A final
    line without a trailing newline is counted as well, so the result equals
    the number of iterations of a line-by-line read of the same file.
    """
    with open(filename) as file:
        return sum(1 for _ in file)
    
# Number of lines in the authors file; used as the total of the progress bar.
total_authors = count_lines(authors_txt)

In [29]:
# Stage 1: collect the book links of every author listed in authors_txt.
pbar = tqdm_notebook(total=total_authors)

with open(authors_txt, 'r') as f_read, open(book_links_txt, 'w') as f_write:
    for line in f_read:
        # strip() rather than line[:-1]: slicing would drop the last character
        # of a final line that has no trailing newline.
        author_id = line.strip()
        if author_id:  # blank lines carry no author id and are skipped
            logging.info('start process:\t\t{}'.format(author_id))
            process_one_author_books(site_name, author_id, f_write)
        pbar.update(1)
pbar.close()
logging.info('completed')

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))

# Этап 2: сбор данных о книгах по собранным ссылкам

In [6]:
def process_field(field_name, tree, card):
    """Extract one field of a book page and store it in ``card``.

    Args:
        field_name: Name of the field to extract. Unknown names are ignored.
        tree: Parsed page; any object with an ``xpath(expr)`` method that
            returns a list of results.
        card: Dict updated in place. Most fields are stored under
            ``field_name``; 'Характеристики' stores one key per parameter
            found on the page, and 'Рейтинг' stores 'Число отзывов',
            'Число оценок' and 'Оценка'.

    Returns:
        None. A field that is absent from the page, or whose text does not
        have the expected form, leaves ``card`` untouched instead of raising.
    """
    def first_or_none(xpath_expr):
        """Return the first result of ``xpath_expr`` on the page, or None."""
        found = tree.xpath(xpath_expr)
        return found[0] if len(found) > 0 else None

    # Fields whose value is simply the first xpath hit.
    plain_fields = {
        'Название': '//h1[@class="rd-page-product__title"]/text()',
        'Автор': '//a[@itemprop="brand"]/text()',
        'ID': '//span[@itemprop="sku"]/text()',
        'Изображение': '//link[@rel="image_src"]/attribute::href',
        'Цена': '//span[@class="num"]/text()',
        'Описание': '//div[@class="rd-page-product__desc-body"]/text()',
    }

    if field_name in plain_fields:
        value = first_or_none(plain_fields[field_name])
        if value is not None:
            card[field_name] = value
    elif field_name == 'Категория':
        texts = tree.xpath('//div[@class="rd-page-breadcrumbs rd-page-product__breadcrumbs"]//text()')
        # Text nodes that hold only whitespace or separators do not match the
        # pattern and are skipped rather than crashing on a None match.
        matches = (re.search(r'\w+.*\w', text) for text in texts)
        crumbs = [found.group(0) for found in matches if found is not None]
        if crumbs:
            card[field_name] = '; '.join(crumbs)
    elif field_name == 'Превью':
        href = first_or_none('//a[@class="download-pdf"]/attribute::href')
        if href is not None:
            card[field_name] = site_name + href
    elif field_name == 'В наличии':
        buy_text = first_or_none('//span[@class="rd-page-product__buy-text"]/text()')
        if buy_text is not None:
            card[field_name] = (buy_text == 'Купить')
    elif field_name == 'Характеристики':
        containers = tree.xpath('//div[@class="rd-page-product__desc-params"]')
        if len(containers) == 0:
            return
        texts = containers[0].xpath('p[@class="rd-page-product__desc-param"]//text()')
        # The text nodes are read in groups of three: the first is taken as
        # the parameter name and the third as its value.
        for name, value in zip(texts[0::3], texts[2::3]):
            card[name] = value
    elif field_name == 'Цена (старая)':
        old_price = first_or_none('//span[@class="prev"]/text()')
        if old_price is None:
            return
        found = re.search(r'\w+\w', old_price)
        if found is not None:
            card[field_name] = found.group(0)
    elif field_name == 'Рейтинг':
        rating_nodes = tree.xpath('//span[@itemprop="aggregateRating"]')
        if len(rating_nodes) == 0:
            return
        rating_fields = [
            ('Число отзывов', 'meta[@itemprop="reviewCount"]/attribute::content'),
            ('Число оценок', 'meta[@itemprop="ratingCount"]/attribute::content'),
            ('Оценка', 'meta[@itemprop="ratingValue"]/attribute::content'),
        ]
        for name, xpath_expr in rating_fields:
            values = rating_nodes[0].xpath(xpath_expr)
            if len(values) > 0:  # a missing meta tag is skipped
                card[name] = values[0]

In [7]:
def process_page(url):
    """Download one book page and return its scraped fields.

    Every failed attempt and the final success are appended as one line to
    the file named by the module-level ``log_file_txt``. The number of
    attempts is read from the module-level ``attempts_number``.

    Args:
        url: Address of the book page.

    Returns:
        A dict filled by ``process_field`` for each known field, or None when
        no attempt produced a successful response.
    """
    request_timeout = 30  # seconds; keeps a stalled connection from blocking a worker

    def write_log(status, message):
        """Append one timestamped line to the log file."""
        # strftime gives the same text as str(now)[:-7] but stays correct
        # when the microsecond part happens to be zero.
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        with open(log_file_txt, 'a') as f:
            f.write('{}\t:\t{}\t:\t{}\n'.format(stamp, status, message))

    r_get = None
    for i in range(1, attempts_number+1):
        try:
            r_get = requests.get(url, timeout=request_timeout)
        except requests.RequestException:
            # Connection errors and timeouts count as a failed attempt.
            r_get = None
        if r_get is not None and r_get.ok:
            break  # stop retrying as soon as one request succeeds
        write_log('FAIL', 'Bad get request ({}/{}):\t{}'.format(i, attempts_number, url))
        time.sleep(0.1)
    if r_get is None or not r_get.ok:
        return None

    tree = lhtml.fromstring(r_get.text)
    card = dict()
    field_list = ['Категория', 'Название', 'Автор', 'ID',
                  'Превью', 'Изображение', 'Цена', 'В наличии',
                  'Описание', 'Характеристики', 'Цена (старая)', 'Рейтинг']
    for field in field_list:
        process_field(field, tree, card)

    write_log('SUCCESS', url)
    return card

In [8]:
# Log file of the second stage: process_page appends one line per outcome here.
log_file_txt = 'log_file_2.txt'
# Number of worker processes used to download book pages in parallel.
max_proc = 10

In [9]:
# Stage 2: download every collected book page in parallel and save the table.
with open(book_links_txt, 'r') as f_read:
    # strip() rather than line[:-1]: slicing would drop the last character of
    # a final URL that has no trailing newline. Blank lines are skipped.
    urls = [line.strip() for line in f_read if line.strip()]

total_books = len(urls)
pbar = tqdm_notebook(total=total_books)

result = []
# A single pool serves the whole run instead of a new pool per batch of URLs;
# imap yields the cards in input order, so the bar advances one page at a time.
with Pool(processes=max_proc) as pool:
    for card in pool.imap(process_page, urls):
        result.append(card)
        pbar.update(1)
pbar.close()

# process_page returns None for pages that could not be downloaded; drop those.
df = pd.DataFrame([card for card in result if card])
if 'ID' in df.columns:  # absent when no page yielded an ID
    df = df.sort_values(by=['ID'])
# Passing the path lets pandas open the file itself with the right newline handling.
df.to_csv('hw_3.csv', index=False, encoding='utf-8')

HBox(children=(IntProgress(value=0, max=2451), HTML(value='')))

In [10]:
# Preview the first rows of the scraped book table.
df.head()

Unnamed: 0,ID,ISBN,Автор,В наличии,"Вес, г",Вид бумаги,Возраст,Возрастные ограничения,Герои,Год издания,...,Тематика,Тип,Упаковка,Формат,Цена,Цена (старая),Число отзывов,Число оценок,Эпоха,Язык
2050,33,978-5-389-03203-3,Уильям Шекспир,False,,,,,,2015,...,,,,11 х 18,100,140.0,,,,Русский
2017,56,978-5-389-03202-6,Уильям Шекспир,False,,,,,,2011,...,,,,11 х 18,100,140.0,,,,Русский
1883,21678,,Agatha Christie,False,,,,,,2001,...,,,,10 х 17,610,,,,,Английский
1880,21679,,Agatha Christie,False,,,,,,2011,...,,,,11 х 17,610,,,,,Английский
1863,21680,,Agatha Christie,False,,,,,,2011,...,,,,11 х 17,510,610.0,,,,Английский


In [11]:
# Per-column summary statistics of the scraped book table.
df.describe()

Unnamed: 0,ID,ISBN,Автор,В наличии,"Вес, г",Вид бумаги,Возраст,Возрастные ограничения,Герои,Год издания,...,Тематика,Тип,Упаковка,Формат,Цена,Цена (старая),Число отзывов,Число оценок,Эпоха,Язык
count,2451,2138,2451,2451,767,3,384,745,3,2229,...,1,208,139,2432,2451,462,89,89,1,2233
unique,2450,2137,44,2,405,2,14,7,3,19,...,1,8,4,215,313,108,4,4,1,5
top,515553,978-5-04-100917-5,Стивен Кинг,False,250,Мелованная,0+,16+,Sandman,2015,...,Животные,Принт,Подложка,11 х 18,240,290,1,1,Средневековая,Русский
freq,2,2,198,1991,10,2,104,382,1,367,...,1,192,83,480,75,37,74,74,1,2143


In [12]:
# List every column collected across all book pages.
df.columns

Index(['ID', 'ISBN', 'Автор', 'В наличии', 'Вес, г', 'Вид бумаги', 'Возраст',
       'Возрастные ограничения', 'Герои', 'Год издания', 'Жанр',
       'Издательство', 'Изображение', 'Иллюстратор', 'Иллюстрации',
       'Категория', 'Количество страниц', 'Материал', 'Название', 'Назначение',
       'Направление', 'Обложка', 'Описание', 'Оценка', 'Пол', 'Превью',
       'Раздел', 'Размер, см', 'Рисунок', 'Серия', 'Страна-производитель',
       'Тематика', 'Тип', 'Упаковка', 'Формат', 'Цена', 'Цена (старая)',
       'Число отзывов', 'Число оценок', 'Эпоха', 'Язык'],
      dtype='object')

In [13]:
# Distinct values found in the 'Назначение' column.
df['Назначение'].unique()

array([nan, 'Творческое развитие', 'Развитие мышления', 'Универсальные'],
      dtype=object)

In [14]:
# Rows whose 'Пол' column equals 'Унисекс'.
df[df['Пол'] == 'Унисекс']
# (append .unique() to a single column selection to list its distinct values)

Unnamed: 0,ID,ISBN,Автор,В наличии,"Вес, г",Вид бумаги,Возраст,Возрастные ограничения,Герои,Год издания,...,Тематика,Тип,Упаковка,Формат,Цена,Цена (старая),Число отзывов,Число оценок,Эпоха,Язык
1126,376854,978-5-00057-127-9,Тору Кумон,False,,,2+,,,2015,...,,Задания,,,500,,,,,Русский
1125,376855,978-5-00057-129-3,Тору Кумон,False,,,1+,,,2015,...,Животные,Обучающие,,,440,,,,,Русский
1158,379231,978-5-00057-131-6,Тору Кумон,True,,,5+,,,2015,...,,Головоломки,,,530,,,,,Русский
