Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/, modified by us.

To do:
* Alter the script so that we instead (or also?) input the link to the page listing all of the product's reviews.
    * This is turning out to be problematic - the page with all the reviews still only shows 10 at a time, so it doesn't appear to be more efficient unless we can tell the program to click through to the next page of reviews and continue that way.

In [7]:
from lxml import html
from json import dump,loads
from requests import get
import json
from re import sub
from dateutil import parser as dateparser
from time import sleep

In [32]:
def _parse_rating_histogram(rating_rows):
    '''
    Build a {label: value} dict from the rows of the ratings histogram table.

    Each row is queried for the text of the links inside its cells; the
    first text node is used as the key and the second as the value.
    '''
    ratings = {}
    for row in rating_rows:
        cells = row.xpath('./td//a//text()')
        # A usable row needs both a label and a value; rows with fewer text
        # nodes are skipped instead of raising IndexError.
        if len(cells) < 2:
            continue
        label, value = cells[0], cells[1]
        if label:
            ratings[label] = value
    return ratings


def _parse_review(review):
    '''
    Extract the text and star rating from a single review element.

    Returns a dict with the keys 'review_text' and 'review_rating'.
    '''
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
    XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
    XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

    raw_rating = review.xpath(XPATH_RATING)
    raw_collapsed_text = review.xpath(XPATH_REVIEW_TEXT_1)
    raw_hidden_json = review.xpath(XPATH_REVIEW_TEXT_2)
    raw_fallback_text = review.xpath(XPATH_REVIEW_TEXT_3)

    # Cleaning data: drop the rating suffix and collapse runs of whitespace
    review_rating = ''.join(raw_rating).replace('out of 5 stars', '')
    review_text = ' '.join(' '.join(raw_collapsed_text).split())

    # Grabbing hidden comments if present. The attribute holds JSON whose
    # 'rest' entry is the remainder of the review as an HTML fragment.
    if raw_hidden_json:
        hidden_html = loads(raw_hidden_json[0])['rest']
        # Strip the tags. `sub` is the name the file imports from `re`;
        # the module name `re` itself is not imported.
        full_review_text = review_text + sub('<.*?>', '', hidden_html)
    else:
        full_review_text = review_text

    # No collapsed text at all: fall back to the third text location
    if not raw_collapsed_text:
        full_review_text = ' '.join(' '.join(raw_fallback_text).split())

    return {'review_text': full_review_text, 'review_rating': review_rating}


def ParseReviews(amazon_url):
    '''
    Given a url to an amazon product, download the page and extract the
    product name, the ratings histogram and the reviews shown on the page.

    The page is requested up to 5 times; a new attempt is made only when
    the response code is not 200.

    Returns a dict:
      * on success: {'name': str, 'url': amazon_url, 'ratings': dict,
        'reviews': list of {'review_text': str, 'review_rating': str}}
      * on a 404 response: {"url": amazon_url, "error": "page not found"}
      * when no attempt returns 200:
        {"error": "failed to process the page", "url": amazon_url}

    Exceptions raised by the request itself are not caught here.
    '''
    # Add some recent user agent to prevent amazon from blocking the request
    # Find some chrome user agent strings  here https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    # The trailing space after 537.36 matters: the two literals are joined.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

    XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
    XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
    XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
    XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'

    # try getting the data 5 times. Will only retry getting data if response code isn't 200
    for _ in range(5):
        # The response is the whole page (html, css, javascript, response code)
        # NOTE(review): verify=False turns off TLS certificate verification;
        # kept as in the original, but confirm it is really needed.
        response = get(amazon_url, headers=headers, verify=False, timeout=30)
        if response.status_code == 404:
            return {"url": amazon_url, "error": "page not found"}
        if response.status_code != 200:  # checks whether to retry getting the page.
            continue

        # Removing the null bytes from the response.
        cleaned_response = response.text.replace('\x00', '')

        # get html in tree structure that can be parsed with XPath
        tree = html.fromstring(cleaned_response)

        product_name = ''.join(tree.xpath(XPATH_PRODUCT_NAME)).strip()
        rating_rows = tree.xpath(XPATH_AGGREGATE_RATING)

        # Prefer the reviews-summary section; fall back to individual
        # review containers when it is absent.
        reviews = tree.xpath(XPATH_REVIEW_SECTION_1)
        if not reviews:
            reviews = tree.xpath(XPATH_REVIEW_SECTION_2)

        return {
            'name': product_name,
            'url': amazon_url,
            'ratings': _parse_rating_histogram(rating_rows),
            'reviews': [_parse_review(review) for review in reviews],
        }

    return {"error": "failed to process the page", "url": amazon_url}
            

In [37]:
def scrapeAmazonReviews():
    '''
    Scrapes the user-inputted website for reviews and writes the result as
    JSON into a file in the current directory.
    Assumes this will be an amazon product page.

    The file name is the first 10 characters of the product name followed
    by '_product_reviews.json', with '...' inserted after the prefix when
    the name was truncated. If the page could not be scraped, the error is
    printed and no file is written.
    '''
    link = input('Website link:')

    extracted_data = ParseReviews(link)

    # On failure ParseReviews returns a dict with an 'error' key and no
    # 'name' key, so indexing 'name' directly would raise KeyError.
    if 'error' in extracted_data:
        print('Could not scrape {}: {}'.format(link, extracted_data['error']))
        return

    product_name = extracted_data.get('name', '')
    # Path separators in the product name would make the file unopenable.
    prefix = sub(r'[\\/]', '_', product_name[:10])
    ellipsis = '...' if len(product_name) > 10 else ''
    file_name = prefix + ellipsis + '_product_reviews.json'

    # `with` closes the file even if dump() raises.
    with open(file_name, 'w') as f:
        dump(extracted_data, f, indent=4)

In [38]:
# Start the interactive scraper only when this file is run as a script,
# not when it is imported as a module.
if "__main__" == __name__:
    scrapeAmazonReviews()

Website link:https://www.amazon.com/Neutrogena-Fullreach-Sunscreen-Lightweight-Non-comedogenic/dp/B076H5P1S4?ref=FRESH_ENG_NewtoFresh&fpw=fresh&pf_rd_p=ae415af6-2a6e-44d7-a239-c91943a0d7d0&pf_rd_r=JWA309CKCYNXM4CRTM3A


