In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
from lxml import html
import requests
import gc

In [2]:
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
headers = {'User-Agent': user_agent}

ERROR = 'ERROR'
site = 'https://www.amazon'

In [3]:
def get_id(domen, item, max_tries=3):
    url = f'{site}.{domen}/s?k={item}&s=review-rank'
    ok_response_code = '200'
    
    def _get_response(rs, cnt_=0):
        cnt = cnt_
        while ok_response_code not in str(rs):
            cnt += 1
            if cnt > max_tries:
                return rs
            try:
                rs = requests.get(url, headers)
            except Exception as e:
                print(f"Can't connect to {item} by exception {e}")
                continue
            return _get_response(rs, cnt)
        return rs
    try:
        page = requests.get(url, headers = headers)
    except Exception as e:
        print(f"Can't connect to {item} by exception {e}")
        return ERROR
        
    page = _get_response(page, cnt_=0)
    
    if ok_response_code not in str(page):
        return ERROR
    
    parser = html.fromstring(page.content)
    
    xpath_div  = './/div[@class="s-result-item s-asin sg-col-0-of-12 sg-col-16-of-20 sg-col s-widget-spacing-small sg-col-12-of-16"]/@data-asin'
    ids = parser.xpath(xpath_div)
    
    if len(ids) == 0:
        xpath_div  = './/div[@class="sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20"]/@data-asin'
        ids = parser.xpath(xpath_div)
    
    if len(ids) == 0:
        xpath_div  = './/div[@class="sg-col-4-of-24 sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20"]/@data-asin'
        ids = parser.xpath(xpath_div)
    
    try:
        return ids[0]
    except Exception as e:
        print(f"Can't find by exception: e")
        
    return ERROR

In [4]:
books_init_df = pd.read_csv('books_2.csv', usecols=['title', 'authors'], error_bad_lines=False)

books_init_df['title_']  = books_init_df['title'].copy()
# Убираем записи в скобках, чтобы поиск был без ошибки
books_init_df.loc[(books_init_df['title'].str.contains('Potter')) | (books_init_df['title'].str.contains('Hitchhiker')), 'title_'] = books_init_df\
.loc[(books_init_df['title'].str.contains('Potter')) | (books_init_df['title'].str.contains('Hitchhiker')), 'title'].str.replace(r' \(.*$', '')
books_init_df['title_author'] = books_init_df['title_'].copy()
# Добавляем к книгам автора, чтобы поиск был более точным
books_init_df.loc[(~books_init_df['title_'].str.contains('Potter')) & (~books_init_df['title_'].str.contains('Hitchhiker')), 'title_author'] = books_init_df['title_'] + ' ' + books_init_df['authors']
#  Убираем 'Harry Potter and', чтобы поиск был без ошибки
books_init_df['title_author'] = books_init_df['title_'].replace('Harry Potter and ','', regex=True)
# Добавляем слово 'book', чтобы искал только книги, а не фильмы
books = ('book ' + books_init_df['title_author']).unique()

# books = books_init_df['title_'].unique()

len(books)

10351

In [5]:
def get_items(items, domen):
    items_dict = {}
    for item in tqdm(items):
        cur_id = get_id(domen, item=item)
        if cur_id != ERROR:
            items_dict[item] = {}
            items_dict[item]['id'] = cur_id
    return items_dict

In [6]:
def get_review(domen, item_id, n_pages=10):
    res_dict = {}
    for n_page in range(1, n_pages+1):
        url = f'https://www.amazon.{domen}/product-reviews/{item_id}?pageNumber={n_page}&sortBy=recent'
        try:
            page = requests.get(url, headers = headers)
        except Exception as e:
            print(f"Can't connect to by EXCEPTION: {e}")
            return ERROR
        parser = html.fromstring(page.content)

        xpath_reviews = '//div[@data-hook="review"]'
        reviews = parser.xpath(xpath_reviews)
        
        xpath_rating  = './/span[@class="a-icon-alt"]//text()'
    
        xpath_title   = './/a[@data-hook="review-title"]//span//text()'
        xpath_style   = './/a[@data-hook="book-style"]//span//text()'
        xpath_author_id  = './/span[@class="a-profile-id"]//text()'
        xpath_author  = './/span[@class="a-profile-name"]//text()'
        xpath_date    = './/span[@data-hook="unixReviewTime"]//text()'
        xpath_body    = './/span[@data-hook="review-body"]//text()'
        xpath_verified = './/span[@data-hook="verified-statement"]//text()'

        page_dict = {}
        i = 0
        for review in reviews:
            rating = review.xpath(xpath_rating)
            title = review.xpath(xpath_title)
            style = review.xpath(xpath_style)
                
            author = review.xpath(xpath_author)
            author_id = review.xpath(xpath_author_id)
            date = review.xpath(xpath_date)
            body = review.xpath(xpath_body)
            verified_ = review.xpath(xpath_verified)
            verified = True if len(helpful_) != 0 else False
            review_dict = {'asin' : item_id,
                           'reviewerID' : author_id,
                           'reviewerName': author,
                           'overall': rating,
                           'unixReviewTime': date,
                           'summary': title,
                           'style': style,
                           'reviewText': body,
                           'verified': verified}
            
            res_dict[len(res_dict) + i] = review_dict
            i += 1
    return res_dict

In [None]:
get_items(books, 'com').to_csv('new_book_ids.csv')

In [None]:
%%time
cur_items = pd.read_csv('new_book_ids.csv')
cur_items = cur_items.rename({'Unnamed: 0': 'item'}, axis=1)
res_df = pd.DataFrame(columns=['asin', 'reviewerID', 'reviewerName', 'style','overall', 'unixReviewTime', 'summary', 'reviewText', 'verified'])
total_memory = 0

for domen in ['com']:    
    thr = cur_items.shape[0] / 8
    for begin, end in [(1056 * 4, 1056 * 5), (1056 * 5, 1056 * 6), (1056 * 6, 1056 * 7), (1056 * 7, cur_items.shape[0])]:
        res_df = pd.DataFrame(columns=['asin', 'reviewerID', 'reviewerName','style','overall', 'unixReviewTime', 'summary', 'reviewText', 'verified'])
        for index, row in tqdm(cur_items.iloc[begin:end].iterrows(),  total=cur_items.iloc[begin:end].shape[0]):
            item_id = row['asin']
            book_dict = get_review(domen, item_id, n_pages=15)
            if book_dict == ERROR:
                continue

            cur_df = pd.DataFrame.from_dict(book_dict, orient='index')

            res_df = res_df.append(cur_df)
            del cur_df
        
        res_df.to_json(f'new_{domen}_books_{begin}_{end}.json')

        del res_df
        gc.collect()

  0%|                                                                                         | 0/1056 [00:00<?, ?it/s]

DOMEN: co.jp


 15%|███████████▍                                                                 | 157/1056 [37:48<2:03:12,  8.22s/it]

Can't connect to book The Dark City (Eliot Ness  #1) by EXCEPTION: ('Connection aborted.', ConnectionResetError(10054, 'Удаленный хост принудительно разорвал существующее подключение', None, 10054, None))


 26%|███████████████████▋                                                       | 277/1056 [1:06:22<2:48:22, 12.97s/it]

Can't connect to book Diary of a Spider by EXCEPTION: ('Connection aborted.', ConnectionResetError(10054, 'Удаленный хост принудительно разорвал существующее подключение', None, 10054, None))


 86%|██████████████████████████████████████████████████████████████████▍          | 911/1056 [3:25:21<30:56, 12.80s/it]