In [1]:
!pip install selenium



In [2]:
import csv
import os
import random
import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [3]:
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override it without editing the notebook; the default is unchanged.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# Launch Chrome and open Amazon's site directory, the starting page for the
# category navigation in the following cells.
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.amazon.com/gp/site-directory/")

In [4]:
# Open the Luggage category: locate the first link whose text contains
# 'Luggage' on the site directory page and click it.
LUGGAGE_LINK_XPATH = "//a[contains(text(), 'Luggage')]"
luggage_link = driver.find_element_by_xpath(LUGGAGE_LINK_XPATH)
luggage_link.click()

In [5]:
# Open one sub-category tile on the Luggage landing page.
# Tile positions as recorded when this notebook was written:
#   0 = Luggage, 1 = Backpacks, 2 = Duffels, 3 = Messenger Bags
SUBCATEGORY_INDEX = 0

all_sub_categories = driver.find_elements_by_xpath("//div[starts-with(@id, 'categoryTiles')]/div/div/div")
if SUBCATEGORY_INDEX >= len(all_sub_categories):
    # Same exception type as a plain out-of-range index, but with a message that
    # says what was being looked for.
    raise IndexError(
        "sub-category tile %d not found (page has %d tiles)"
        % (SUBCATEGORY_INDEX, len(all_sub_categories))
    )
all_sub_categories[SUBCATEGORY_INDEX].click()


In [6]:
# Get brand and product name
def get_brand(current_driver):
    """Return the brand name shown in the product page's byline link.

    Args:
        current_driver: Selenium driver (or compatible object) exposing
            ``find_element_by_xpath``.

    Returns:
        The link text with surrounding whitespace and all commas removed, or
        ``''`` when the byline link cannot be looked up.
    """
    try:
        brand_name = current_driver.find_element_by_xpath(
            "//div[contains(@data-feature-name, 'brandByline') or contains(@data-feature-name, 'bylineInfo')]//a"
        ).text
    except Exception:
        # Was a bare `except:`; narrowed so Ctrl-C / SystemExit are no longer
        # swallowed while a lookup is in progress.
        brand_name = ''
    return brand_name.strip().replace(',', '')

def get_product(current_driver):
    """Return the product title from the current page.

    Args:
        current_driver: Selenium driver (or compatible object) exposing
            ``find_element_by_xpath``.

    Returns:
        Text of the ``productTitle`` / ``btAsinTitle`` span with surrounding
        whitespace and all commas removed, or ``'None Found'`` when the span
        cannot be looked up.
    """
    try:
        product_name = current_driver.find_element_by_xpath(
            "//span[@id='productTitle' or @id='btAsinTitle']"
        ).text
    except Exception:
        # Was a bare `except:`; narrowed so Ctrl-C / SystemExit still propagate.
        product_name = 'None Found'
    return product_name.strip().replace(',', '')

In [7]:
def get_price(current_driver):
    """Return the listed price from the current product page.

    Args:
        current_driver: Selenium driver (or compatible object) exposing
            ``find_element_by_xpath``.

    Returns:
        Text of the span whose id contains ``priceblock_ourprice`` with ``,``
        and ``$`` removed (e.g. ``'1234.50'``), or ``'NA'`` when the span
        cannot be looked up.
    """
    try:
        # Fixed: the lookup previously went through the global `driver`,
        # ignoring the `current_driver` argument.
        price_text = current_driver.find_element_by_xpath(
            "//span[contains(@id, 'priceblock_ourprice')]"
        ).text
        product_price = price_text.replace(',', '').replace('$', '')
    except Exception:
        # Was a bare `except:`; narrowed so Ctrl-C / SystemExit still propagate.
        product_price = 'NA'
    return product_price

In [8]:
# Get all swatch collections
def get_all_swatches(current_driver):
    """Return the style/colour names of every swatch on the current page.

    Args:
        current_driver: Selenium driver (or compatible object) exposing
            ``find_elements_by_xpath``.

    Returns:
        One string per ``<li>`` under a ``<ul>`` whose class contains
        ``swatches``: the element's ``title`` attribute with its first 16
        characters dropped, whitespace stripped and commas removed. A swatch
        without a ``title`` attribute yields ``''``. Empty list when no
        swatches are present.
    """
    all_swatches = current_driver.find_elements_by_xpath("//ul[(contains(@class, 'swatches'))]/li")
    swatch_names = []
    for swatch in all_swatches:
        # get_attribute returns None when the attribute is absent; previously
        # that raised TypeError on the slice below.
        title = swatch.get_attribute('title') or ''
        # NOTE(review): the 16-character offset presumably skips a
        # "Click to select " prefix in the title - confirm against the page.
        swatch_names.append(title[16:].strip().replace(',', ''))
    return swatch_names

In [9]:
def get_no_reviews_rating(current_driver):
    """Return the review-count text and the star-rating text of the current page.

    Args:
        current_driver: Selenium driver (or compatible object) exposing
            ``find_element_by_xpath``.

    Returns:
        A ``(review_count, star_rating)`` tuple of strings. ``review_count`` is
        the text of ``#acrCustomerReviewText`` with commas removed;
        ``star_rating`` is the ``title`` attribute of ``#acrPopover``. Either
        value is ``'NA'`` when it cannot be read.
    """
    try:
        no_of_reviews = current_driver.find_element_by_xpath("//span[@id='acrCustomerReviewText']").text
    except Exception:
        # Was a bare `except:`; narrowed so Ctrl-C / SystemExit still propagate.
        no_of_reviews = 'NA'

    try:
        star_rating = current_driver.find_element_by_xpath("//span[@id='acrPopover']").get_attribute('title')
    except Exception:
        star_rating = "NA"
    if star_rating is None:
        # A missing `title` attribute comes back as None; returning it would
        # hand callers a non-string where they expect text.
        star_rating = 'NA'

    return (no_of_reviews.replace(',', ''), star_rating)

In [10]:
# if prodDetails table is found
def get_prod_details(current_driver):
    """Return the rows of the ``prodDetails`` table(s) as (title, value) pairs.

    Args:
        current_driver: Selenium driver (or compatible object) exposing
            ``find_elements_by_xpath``.

    Returns:
        A list with one ``(row_title, row_value)`` tuple per ``<tr>`` found
        under ``#prodDetails``. ``row_title`` is the row's ``<th>`` text
        (``''`` if it has none); ``row_value`` is the text of the row's last
        ``<td>`` (``''`` if it has none). Newlines and commas are removed from
        both. Empty list when the section is absent.
    """
    all_descriptions = current_driver.find_elements_by_xpath("//div[@id='prodDetails']//table//tr")
    prod_details = []
    for description_row in all_descriptions:
        try:
            row_title = description_row.find_element_by_xpath(".//th").text.replace('\n', '').replace(',', '')
        except Exception:
            # Was a bare `except:`; narrowed so Ctrl-C / SystemExit still propagate.
            row_title = ''
        value_cells = description_row.find_elements_by_xpath(".//td")
        if value_cells:
            row_value = value_cells[-1].text.replace('\n', '').replace(',', '')
        else:
            # Rows without any <td> (e.g. header-only rows) used to raise
            # IndexError on `[-1]`.
            row_value = ''
        prod_details.append((row_title, row_value))
    return prod_details

In [11]:
# if prodDetails doesn't exist, check 'detail-bullets' table
# all_descriptions_rows = driver.find_elements_by_xpath("//div[@id='detail-bullets' or @id='detailBullets']//div[@class='content']//li")
def get_prod_details_alt(current_driver):
    """Return the detail-bullet entries of the current page, split on colons.

    Each ``<li>`` under ``#detail-bullets`` or ``#detailBullets`` contributes
    one list: its text with newlines and commas removed, split on every ``:``.
    Returns an empty list when no such items exist.
    """
    bullet_items = current_driver.find_elements_by_xpath(
        "//div[@id='detail-bullets' or @id='detailBullets']//li"
    )
    return [
        item.text.replace('\n', '').replace(',', '').split(":")
        for item in bullet_items
    ]

In [12]:
def get_all_stars(current_driver, min_rows=5):
    """Return the per-star entries of the review histogram on the current page.

    Args:
        current_driver: Selenium driver (or compatible object) exposing
            ``find_elements_by_xpath``.
        min_rows: Minimum length of the returned list; missing entries are
            filled with ``'N/A'`` at the end. Defaults to 5 so callers that
            read five star columns can index the result safely.

    Returns:
        One string per ``<tr>`` of ``#histogramTable``'s first ``<tbody>``, in
        page order: the ``aria-label`` of the row's ``td/a`` link with commas
        removed, or ``'N/A'`` when the row has no such link or label. When the
        table is absent the result is ``min_rows`` copies of ``'N/A'``.
    """
    # find_elements (plural) returns an empty list instead of raising when the
    # histogram is not on the page, which previously aborted the caller.
    table_bodies = current_driver.find_elements_by_xpath("//table[@id='histogramTable']/tbody")
    all_stars = table_bodies[0].find_elements_by_xpath("./tr") if table_bodies else []

    star_ratings = []
    for star in all_stars:
        try:
            star_data = star.find_element_by_xpath("./td/a").get_attribute('aria-label').replace(',', '')
        except Exception:
            # Covers a missing link and a missing aria-label (None has no
            # .replace). Was a bare `except:`; narrowed so Ctrl-C propagates.
            star_data = 'N/A'
        star_ratings.append(star_data)

    # Multiplying by a non-positive count yields [], so longer lists are untouched.
    star_ratings.extend(['N/A'] * (min_rows - len(star_ratings)))
    return star_ratings
        

In [13]:
# Scrape product pages from the current search-results listing into a CSV file.
#
# For each results page: click every result in turn, collect its fields, go
# back, and append one CSV row; then follow the "next page" link. Stops after
# MAX_PRODUCTS results, on the last page, or on the first unexpected error.

MAX_PRODUCTS = 5000   # upper bound on the number of search results to visit
STAR_COLUMNS = 5      # number of star-breakdown columns in the CSV
CSV_HEADER = ['Brand', 'Name', 'Price', 'Styles', 'Review Count', 'Star Rating', 'Product Details', 'Product Details Alt', '5 Stars', '4 Stars', '3 Stars', '2 Stars', '1 Stars', 'Url']


def _find_search_results(current_driver):
    """Return the result <li> elements of the current listing page (tries both known layouts)."""
    results = current_driver.find_elements_by_xpath("//div[@id='search-results']//div[@id='mainResults']//li")
    if len(results) < 1:
        results = current_driver.find_elements_by_xpath("//ul[contains(@id, 's-results')]//li[contains(@id, 'result')]")
    return results


def _scrape_current_product(current_driver):
    """Build one CSV row (list of cells, in CSV_HEADER order) from the product page currently loaded."""
    prod_brand = get_brand(current_driver)
    prod_name = get_product(current_driver)
    prod_price = get_price(current_driver)
    prod_styles = get_all_swatches(current_driver)  # list of str
    prod_review_count, prod_star_rating = get_no_reviews_rating(current_driver)
    prod_details = get_prod_details(current_driver)  # list of (title, value) tuples
    prod_details_alt = get_prod_details_alt(current_driver)  # list of lists
    try:
        prod_star_breakdown = get_all_stars(current_driver)  # list of str
    except Exception:
        # Best effort, like the other fields: a failed histogram lookup should
        # cost the star columns of one row, not end the whole run.
        prod_star_breakdown = []
    # Always exactly STAR_COLUMNS cells; previously fewer than five entries
    # raised IndexError when the row was assembled.
    star_cells = (list(prod_star_breakdown) + ['N/A'] * STAR_COLUMNS)[:STAR_COLUMNS]

    prod_styles_concat = '; '.join(prod_styles)
    prod_details_concat = '; '.join(': '.join(detail) for detail in prod_details).strip() or 'NA'
    prod_details_alt_concat = '; '.join(': '.join(detail) for detail in prod_details_alt).strip() or 'NA'
    try:
        # Drop the query string so only the canonical product URL is stored.
        prod_url = current_driver.current_url.split('?')[0].strip()
    except Exception:
        prod_url = 'NA'

    return [prod_brand, prod_name, prod_price, prod_styles_concat, prod_review_count, prod_star_rating, prod_details_concat, prod_details_alt_concat] + star_cells + [prod_url]


# Timestamp without spaces or colons so the name is valid on every filesystem.
file_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_amazon_scraped.csv'
total_scraped = 0

# `with` closes (and flushes) the file however the cell ends, including a
# KeyboardInterrupt, which the `except Exception` below does not catch.
# newline='' is what the csv module expects; rows are terminated with '\n'.
with open(file_name, 'a', newline='', encoding='utf-8') as f:
    # csv.writer quotes any cell containing a comma, quote or newline, so a
    # stray delimiter in a field can no longer shift the columns.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(CSV_HEADER)

    while total_scraped < MAX_PRODUCTS:
        try:
            result_count = len(_find_search_results(driver))
            print('- Number of results on this page: ', result_count)
            for i in range(result_count):
                # Look the results up again on every iteration: the page has
                # been left and revisited since the previous lookup.
                search_results = _find_search_results(driver)
                search_results[i].click()
                row = _scrape_current_product(driver)
                print(row[1])  # product name, as progress output
                driver.back()
                writer.writerow(row)
                # Randomised 1-4 s pause between product pages.
                time.sleep(1 + random.random() * 3)

            total_scraped += result_count
            print("****Total completed: ", total_scraped)
            time.sleep(5)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            next_links = driver.find_elements_by_xpath("//a[@id='pagnNextLink']")
            if not next_links:
                # Previously surfaced as an opaque "list index out of range".
                print('No next-page link found.')
                print('ended at: ', total_scraped)
                break
            next_links[-1].click()
            time.sleep(5)
        except Exception as e:
            print(e)
            print('ended at: ', total_scraped)
            break


- Number of results on this page:  48
AmazonBasics Hardside Spinner Luggage Black
Rockland Melbourne 20-Inch Expandable Abs Carry On Luggage
Samsonite Winfield 2 Hardside 28" Luggage
Rockland Luggage 2 Piece Set
Samsonite Winfield 2 Fashion Hardside 3 Piece Set
Samsonite Omni PC Hardside 3 Piece Set 20 24 28 Spinner
Samsonite Omni PC Hardside 20-Inch Spinner
AmazonBasics Softside Spinner Luggage
Kenneth Cole Reaction Out of Bounds 20" 4 Wheel Upright
American Tourister Luggage Fieldbrook II 3 Piece Set
Travelpro Crew 11 21" Expandable Spinner Carry-on Suiter Suitcase


KeyboardInterrupt: 

In [14]:
# Make sure the CSV file handle is closed, e.g. after the scraping cell was
# interrupted with Ctrl-C. Closing an already-closed file is a no-op, so this
# cell is always safe to run.
f.close()

In [17]:
# Step the browser back one page in its history.
# NOTE(review): presumably a manual recovery step after the interrupted run
# (returning from a product page to the results list) - confirm, or delete this cell.
driver.back()