In [13]:
# Libraries
import json
import os
import random
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [14]:
# Scraper configuration -- evaluated once, before any request is sent.

# Folder holding the exported tables, and the products table loaded from it.
data_dir = os.path.join("..", "Datasets", "db")
product_t = pd.read_csv(os.path.join(data_dir, "Products.csv"))

# HTTP headers sent with every request made by the scraper.
headers = {"User-Agent": "<user-agent>"}

# Distinct product names; each one is used as a search query.
product_list = product_t["product_name"].unique().tolist()

# Accumulator filled by the scraper: one tuple per scraped review,
# laid out in the order given by product_details_cols.
product_details = []
product_details_cols = [
    "product",
    "product_similarity",
    "price",
    "rating",
    "availability",
    "review_count",
    "amazon_product_name",
    "review_date",
    "review_score",
    "review_text",
]

# Base URL that relative product / review links are appended to.
amazon_url = "https://www.amazon.com"

In [4]:
## Scraping Functions - Product Level
# function to get product title
def get_product_title(soup):
    """Return the product title found on a product page.

    Looks up the ``<span id="productTitle">`` element in ``soup`` and returns
    its ``.string`` with surrounding whitespace removed.

    Returns:
        The stripped title, or ``""`` when the lookup fails with an
        ``AttributeError`` (element not found, or its ``.string`` is ``None``).
    """
    try:
        return soup.find("span", attrs={"id": "productTitle"}).string.strip()
    except AttributeError:
        return ""

# Function to extract Product Price
def get_product_price(soup):
    """Return the price text shown on a product page.

    Finds the first ``<span class="a-price">`` element, takes the first
    ``span`` selected inside it, and returns that span's ``.string`` stripped
    of surrounding whitespace.

    Returns:
        The stripped price text, or ``""`` if any step of the lookup raises
        ``AttributeError`` (a missing element or a ``None`` string).
    """
    try:
        price_block = soup.find("span", attrs={"class": "a-price"})
        inner_span = price_block.select_one("span")
        return inner_span.string.strip()
    except AttributeError:
        return ""

# Function to extract Product Rating
def get_product_rating(soup):
    """Return the rating text shown on a product page.

    Two lookups are tried in order and the first one that yields a string wins:

    1. an ``<i>`` element whose class attribute is exactly
       ``a-icon a-icon-star a-star-4-5`` (presumably the 4.5-star icon only --
       other ratings rely on the fallback below; confirm against live markup);
    2. the first ``<span class="a-icon-alt">`` element.

    Returns:
        The stripped ``.string`` of the first matching element, or ``""`` when
        neither lookup produces one.

    Only ``AttributeError`` (missing element / ``None`` string) is treated as
    "not found". Anything else -- notably ``KeyboardInterrupt`` -- propagates,
    so a long scraping run can still be interrupted.
    """
    lookups = (
        ("i", {"class": "a-icon a-icon-star a-star-4-5"}),
        ("span", {"class": "a-icon-alt"}),
    )
    for tag_name, tag_attrs in lookups:
        try:
            return soup.find(tag_name, attrs=tag_attrs).string.strip()
        except AttributeError:
            # Element absent or without a single string: try the next lookup.
            continue
    return ""

# Function to extract Number of User Reviews
def get_product_review_count(soup):
    """Return the review-count text shown on a product page.

    Reads the ``.string`` of the ``<span id="acrCustomerReviewText">`` element
    and strips surrounding whitespace. The value is returned as text exactly
    as it appears in the page; no number parsing is done here.

    Returns:
        The stripped text, or ``""`` when the lookup raises ``AttributeError``.
    """
    try:
        count_tag = soup.find("span", attrs={"id": "acrCustomerReviewText"})
        return count_tag.string.strip()
    except AttributeError:
        return ""

# Function to extract Availability Status
def get_product_availability(soup):
    """Return the availability text shown on a product page.

    Locates ``<div id="availability">``, then the first ``span`` inside it,
    and returns that span's ``.string`` without surrounding whitespace.

    Returns:
        The stripped availability text, or ``""`` when any step of the lookup
        raises ``AttributeError``.
    """
    try:
        return soup.find("div", attrs={"id": "availability"}).find("span").string.strip()
    except AttributeError:
        return ""

# Function to get link that shows all reviews related to the product  
def get_product_reviews_link(soup):
    """Return the href of the "all reviews" link on a product page.

    Takes the first ``<div id="reviews-medley-footer">`` element and reads the
    ``href`` of the first ``<a>`` inside it.

    Returns:
        The href string, or ``""`` when the footer, the anchor, or the ``href``
        attribute is missing. The result is always a ``str`` so callers can
        safely call ``.strip()`` / ``.replace()`` on it.
    """
    try:
        review_footer = soup.find_all("div", attrs={"id": "reviews-medley-footer"})[0]
        all_reviews_link = review_footer.find("a").get("href")
    except (AttributeError, IndexError):
        # IndexError: no footer on the page; AttributeError: footer has no <a>.
        return ""
    # .get("href") yields None for an anchor without href; normalise to "".
    return all_reviews_link or ""



## Scraping Functions - Review Level
# Function to get date info from product review
def get_review_date(soup):
    """Return the date portion of a review's header line.

    Reads the text of the review's ``<span class="review-date">`` element and
    returns everything after the last standalone word ``on`` (for example
    ``"Reviewed in Hong Kong on March 1, 2022"`` gives ``"March 1, 2022"``).

    The separator is matched as a whole word, so an "on" that merely occurs
    inside another word ("Hong", "London") is not mistaken for it.

    Returns:
        The date text with single spaces between its words, or ``""`` when the
        element is missing or its text contains no standalone ``on``.
    """
    try:
        words = soup.find("span", attrs={"class": "review-date"}).text.split()
    except AttributeError:
        return ""
    if "on" not in words:
        return ""
    # Position of the LAST "on": the date is whatever follows it.
    last_on = len(words) - 1 - words[::-1].index("on")
    return " ".join(words[last_on + 1:])

# Function to get review score from product review
def get_review_score(soup):
    """Return the star score of a single review as a 3-character string.

    Selects the first ``<a>`` whose ``title`` attribute contains
    ``"of 5 stars"`` and returns the first three characters of that title
    (e.g. ``"4.0 out of 5 stars"`` gives ``"4.0"``).

    Returns:
        The leading three characters of the title, or ``""`` when no such
        anchor exists or it carries no usable title.

    Only the lookup failures themselves are swallowed; other exceptions
    (including ``KeyboardInterrupt``) propagate to the caller.
    """
    try:
        return soup.select_one('a[title*="of 5 stars"]').get("title")[:3]
    except (AttributeError, TypeError):
        # AttributeError: nothing selected; TypeError: title attribute is None.
        return ""

# Function to get review text content from product review
def get_review_text(soup):
    """Return the body text of a single review.

    Finds the review's ``<span class="review-text-content">`` element, selects
    ``"span > span"`` within it, and returns the ``.text`` of that match. The
    text is returned as-is (no whitespace stripping).

    Returns:
        The review text, or ``""`` when either element is missing.

    Only ``AttributeError`` (a lookup that returned ``None``) is swallowed;
    other exceptions, including ``KeyboardInterrupt``, propagate.
    """
    try:
        review_body = soup.find("span", attrs={"class": "review-text-content"})
        return review_body.select_one("span > span").text
    except AttributeError:
        return ""
    

In [5]:
## helper function
# score how closely a scraped product-page title matches the product name that was searched for
def verify_sim(main_txt, sub_txt):
    """Score how well ``sub_txt`` matches ``main_txt`` by word overlap.

    Both inputs are converted with ``str()`` and lower-cased, and the pair is
    printed as ``"<main> VS. <sub>"``. Every whitespace-separated word of
    ``sub_txt`` that occurs in ``main_txt`` adds ``1 / (number of words in
    main_txt)`` to a running total; the result is
    ``0.6 * total + 0.4 / (number of words in main_txt)``.

    Notes:
        * The membership test is a substring test against the whole
          lower-cased ``main_txt``, not a whole-word comparison.
        * Repeated words in ``sub_txt`` are counted each time, so the score is
          not capped at 1.

    Args:
        main_txt: Reference text (the product name that was searched for).
        sub_txt: Candidate text (the title found on the scraped page).

    Returns:
        The similarity score as a float; ``0.0`` when ``main_txt`` contains no
        words (previously this case raised ``ZeroDivisionError``).
    """
    main_txt, sub_txt = str(main_txt).lower(), str(sub_txt).lower()
    print("{} VS. {}".format(main_txt, sub_txt))

    main_txt_len = len(main_txt.split())
    if main_txt_len == 0:
        # Nothing to compare against; avoid dividing by zero below.
        return 0.0

    word_weight = 1 / main_txt_len
    sim_score = 0
    # Accumulate term by term (rather than count * weight) so the floating
    # point result is bit-identical to the historical scores.
    for word_i in sub_txt.split():
        if word_i in main_txt:
            sim_score += word_weight

    return 0.6 * sim_score + 0.4 * word_weight

In [6]:
## Main Functions for Running the Web Scraper
def get_product_review_data(start_url, url_suffix, product_data, max_pages=2, request_timeout=30):
    """Scrape review pages for one product and append rows to ``product_details``.

    Starting from ``start_url + url_suffix``, up to ``max_pages`` review pages
    are requested. For every review block found, one tuple combining the
    product-level fields in ``product_data`` with the review's date, score and
    text is appended to the module-level ``product_details`` list, in the
    column order given by ``product_details_cols``.

    Paging stops early when the request fails, when the page has no
    ``<div id="cm_cr-review_list">`` element, or when that element contains no
    text.

    Args:
        start_url: Base URL the suffix is appended to.
        url_suffix: Path and query of the first review page. It is expected to
            contain ``cm_cr_getr_d_paging_btm_next_1`` and ``&pageNumber=1``,
            which are rewritten to advance to the next page.
        product_data: Dict with the keys ``product_name``, ``similarity_score``,
            ``product_price``, ``product_rating``, ``product_availability``,
            ``product_review_number`` and ``amazon_product_name``.
        max_pages: Maximum number of review pages to fetch. Defaults to 2,
            which is the limit that used to be hard-coded.
        request_timeout: Seconds to wait for each HTTP response before giving
            up on the product's reviews.

    Returns:
        None. Results are delivered through ``product_details``.
    """
    # product_details is only mutated in place (append), so no ``global``
    # declaration is required.
    for page_number in range(1, max_pages + 1):
        # Random pause before every request.
        time.sleep(random.randint(3, 5))
        try:
            product_reviews_response = requests.get(start_url + url_suffix,
                                                    headers=headers,
                                                    timeout=request_timeout)
        except requests.RequestException:
            # Network failure or timeout: keep what was collected so far.
            break

        pr_soup = BeautifulSoup(product_reviews_response.content, "html.parser")
        pr_review_section = pr_soup.find("div", attrs={"id": "cm_cr-review_list"})
        # No review list, or an empty one, means there is nothing (more) to read.
        if pr_review_section is None or not pr_review_section.text.strip():
            break

        pr_rs_reviews = pr_review_section.find_all("div", attrs={"class": "a-section review aok-relative"})
        for pr_rs_review in pr_rs_reviews:
            product_details.append((
                product_data["product_name"],
                product_data["similarity_score"],
                product_data["product_price"],
                product_data["product_rating"],
                product_data["product_availability"],
                product_data["product_review_number"],
                product_data["amazon_product_name"],
                get_review_date(pr_rs_review),
                get_review_score(pr_rs_review),
                get_review_text(pr_rs_review),
            ))

        # Point the suffix at the next page by bumping both page markers.
        next_page_number = page_number + 1
        url_suffix = url_suffix.replace("cm_cr_getr_d_paging_btm_next_" + str(page_number),
                                        "cm_cr_getr_d_paging_btm_next_" + str(next_page_number))
        url_suffix = url_suffix.replace("&pageNumber=" + str(page_number),
                                        "&pageNumber=" + str(next_page_number))
        
    

def get_product_data(main_url, soup, product_name, request_timeout=30):
    """Visit a sample of search results and scrape each product plus its reviews.

    From the parsed search-results page ``soup``, the first 1-4 result links
    (the count is drawn at random) are followed. For each product page the
    title, similarity to ``product_name``, price, rating, review count and
    availability are extracted. Pages without an "all reviews" link are
    skipped; for the rest ``get_product_review_data`` is called to collect the
    reviews.

    Args:
        main_url: Base URL that relative hrefs are appended to.
        soup: Parsed search-results page.
        product_name: Product name that was searched for.
        request_timeout: Seconds to wait for each product-page response.

    Returns:
        None. Scraped rows are accumulated by ``get_product_review_data``.
    """
    # Sample the first 1-4 search results.
    result_anchors = soup.find_all("a", attrs={"class": "a-link-normal s-no-outline"})
    sampled_anchors = result_anchors[:random.randint(1, 4)]
    # Anchors without an href cannot be followed; drop them.
    product_links = [href for href in (anchor.get("href") for anchor in sampled_anchors) if href]

    # NOTE: an empty result is final. Re-parsing the same ``soup`` can never
    # produce new links, so the former sleep-and-retry loop (100 rounds of
    # 10-30 s) only delayed the run. A real retry would have to re-request the
    # search page, which this function does not have the URL for.

    for pl_i in product_links:
        time.sleep(random.randint(3, 5))
        try:
            product_page_response = requests.get(main_url + pl_i, headers=headers,
                                                 timeout=request_timeout)
        except requests.RequestException:
            # Network failure or timeout: move on to the next result.
            continue
        pp_soup = BeautifulSoup(product_page_response.content, "html.parser")

        # Product-level fields, stored as strings.
        pp_title = get_product_title(pp_soup)
        pp_sim_score = str(verify_sim(product_name, pp_title))
        pp_price = str(get_product_price(pp_soup))
        pp_rating = str(get_product_rating(pp_soup))
        pp_review_num = str(get_product_review_count(pp_soup))
        pp_availability = get_product_availability(pp_soup)

        pp_reviews_href = get_product_reviews_link(pp_soup)
        if not pp_reviews_href or not pp_reviews_href.strip():
            # No "all reviews" link: nothing to collect for this product.
            continue
        # Rewrite the link into its paged form, starting at page 1.
        pp_reviews_href = pp_reviews_href.replace("ref=cm_cr_dp_d_show_all_btm?", "ref=cm_cr_getr_d_paging_btm_next_1?") + "&pageNumber=1"

        product_dict = {"product_name": product_name,
                        "amazon_product_name": pp_title,
                        "similarity_score": pp_sim_score,
                        "product_price": pp_price,
                        "product_rating": pp_rating,
                        "product_review_number": pp_review_num,
                        "product_availability": pp_availability}
        get_product_review_data(main_url, pp_reviews_href, product_dict)
    

In [11]:
#### RUN ####
def run_amazon_scraper(request_timeout=30):
    """Search for every product in ``product_list`` and scrape the results.

    For each product name a separator line and the name are printed, the
    search-results page is requested, and the parsed page is handed to
    ``get_product_data``.

    Entries that are not non-empty strings (e.g. a missing value read from the
    CSV) and products whose search request fails are skipped, so one bad entry
    does not abort the whole run.

    Args:
        request_timeout: Seconds to wait for each search response.
    """
    for product_i in product_list:
        # 1 - product search
        print("-" * 100)
        print(product_i)

        if not isinstance(product_i, str) or not product_i.strip():
            # Not a usable search term.
            continue

        # quote_plus turns spaces into "+" and also escapes characters such
        # as "&" or "#" that would otherwise corrupt the query string.
        product_search_url = (amazon_url + "/s?k={}&ref=nb_sb_noss_1").format(quote_plus(product_i))
        try:
            response_i = requests.get(product_search_url, headers=headers,
                                      timeout=request_timeout)
        except requests.RequestException as request_error:
            print("Search request failed, skipping: {}".format(request_error))
            continue

        links_soup = BeautifulSoup(response_i.content, "html.parser")
        get_product_data(amazon_url, links_soup, product_i)

run_amazon_scraper()

In [10]:
# Persist every scraped review row (one per review, columns as configured).
pd.DataFrame(
    data=product_details,
    columns=product_details_cols,
).to_csv(path_or_buf="../Datasets/product_amazon_data.csv")