# Amazon Data Collection

###### This project is meant to serve as an exercise in web scraping and database creation. We begin with the retrieval of jean reviews from Amazon listings to create a database for future analysis.

###### After retrieving the data, we will perform data transformation to ensure each column can be used in a Tableau visualization.

In [6]:
# standard library imports
import csv
from datetime import datetime
import re


# third-party imports
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

## Startup the webdriver

###### Normally an API is used for data acquisition from an external webpage. Unfortunately, Amazon does not offer a webstore API for free. Instead, we will be taking advantage of ChromeDriver, an executable used by Selenium's WebDriver to automate user processes on a browser. ChromeDriver will allow us to open each Amazon webpage and inspect and analyze its HTML, a slow but admissible solution. Please refer to [this](https://chromedriver.chromium.org/getting-started) article to learn more about ChromeDriver.

###### You can download ChromeDriver [here](https://sites.google.com/chromium.org/driver/downloads?authuser=0).

In [11]:
# create an object to initialize Chromedriver
driver = webdriver.Chrome() # ensure you have downloaded the correct version of ChromeDriver in directory

###### To ensure versatility, the function below takes a search term and generates an Amazon url with search results.

In [12]:
def get_url(search_term):
    """Generate an Amazon search-results url from a search term.

    Args:
        search_term: Free-text query, e.g. 'ultrawide monitor'.

    Returns:
        The search url with the query percent-encoded in the ``k`` parameter.
    """
    # imported here so the cell stays runnable on its own
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_2' # blank Amazon search url
    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&', '#' or '+' that would otherwise corrupt the query string
    encoded_term = quote_plus(search_term)
    return template.format(encoded_term) # .format replaces the brackets '{}' with the search term

In [6]:
url = get_url('monitor')
print(url)

https://www.amazon.com/s?k=monitor&ref=nb_sb_noss_2


In [7]:
driver.get(url) # load the webpage for the provided url

## Extract the collection

In [8]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [9]:
# find all div tags that have the component type of a search result
results = soup.find_all('div', {'data-component-type': 's-search-result'})

In [10]:
len(results)

22

In [11]:
item = results[0]

###### This is the first item in our search result. It has a lot more HTML than we'd expect because of the extra sponsored content inserted.


In [12]:
item

<div class="s-result-item s-asin sg-col-0-of-12 sg-col-16-of-20 AdHolder sg-col sg-col-12-of-16" data-asin="B08LP22STD" data-component-id="1" data-component-type="s-search-result" data-index="1" data-uuid="15e882de-2343-48e7-9dc6-d5c36ba671e0"><div class="sg-col-inner">
<span cel_widget_id="MAIN-SEARCH_RESULTS-1" class="celwidget slot=MAIN template=SEARCH_RESULTS widgetId=search-results_1" data-csa-c-id="lgq9go-oww6e-vzl30o-of7833">
<div class="rush-component" data-component-id="2" data-component-props='{"percentageShownToFire":"50","batchable":true,"requiredElementSelector":".s-image","url":"https://www.amazon.com/gp/sponsored-products/logging/log-action.html?qualifier=1629302623&amp;id=1914416886450696&amp;widgetName=sp_atf&amp;adId=200052549831381&amp;eventType=1&amp;adIndex=0"}' data-component-type="s-impression-logger">
<div class="rush-component" data-component-id="3" data-component-type="sp-sponsored-result">
<div class="s-expand-height s-include-content-margin s-latency-cf-sect

In [13]:
# we can retrieve the atag under the h2 tag
atag = item.h2.a

In [14]:
# We retrieve the item's description from the text of the hyperlink; strip() removes the surrounding whitespace
description = atag.text.strip()

In [15]:
# Because the href is a relative link, we must prepend the Amazon domain to make it a full url
url = 'https://www.amazon.com' + atag.get('href')

In [16]:
#We begin to obtain the item's price by first finding the span tag with the a-price class
price_parent = item.find('span', 'a-price')

In [17]:
#Now that we have the price parent, we can find the span tag within it that has the a-offscreen class
price = price_parent.find('span', 'a-offscreen').text

In [18]:
# item.i is the first <i> tag inside the result; its text is the star rating shown below
rating = item.i.text

In [19]:
rating

'4.2 out of 5 stars'

In [20]:
# take the text of the first span with the a-size-base class
# NOTE(review): presumably this first match is the review count for every item — confirm on other results
review_count = item.find('span', 'a-size-base').text

## Generalize the pattern

In [21]:
def extract_record(item):
    """Extract and return data from a single search result.

    Args:
        item: The parsed tag of one search result.

    Returns:
        A tuple ``(description, price, rating, review_count, url)``, or
        ``None`` when the result has no title link or no price. ``rating``
        and ``review_count`` are each '' when that piece of data is missing.
    """
    # description and url
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        # no link under an h2 tag: skip this result instead of aborting the whole scrape
        return None
    if not href:
        return None
    # the href is relative, so prepend the Amazon domain
    url = 'https://www.amazon.com' + href

    # Error handling: not every item has a price or rating, and a missing tag
    # shows up as an AttributeError when we reach for its .text
    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        # items without a price are not kept
        return None

    # rating and review count are looked up independently so that a missing
    # review count no longer discards a rating that was found (and vice versa)
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''

    try:
        review_count = item.find('span', 'a-size-base').text
    except AttributeError:
        review_count = ''

    return (description, price, rating, review_count, url)

## Error handling

In [22]:
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for item in results:
    record = extract_record(item)
    # extract_record returns None for items it cannot use (e.g. no price),
    # so this way we only append records that are not empty
    if record:
        # reuse the record we already have instead of parsing the item a second time
        records.append(record)

In [26]:
records[0]

('Trio Max Portable Monitor for Laptop, Mobile Pixels 14.1" Full HD IPS Display, Dual or Triple Laptop Monitor Screen, USB A/Type-C Plug and Play Monitor for 13”-17” Laptops(1x 14.1 Monitor)',
 '$309.00',
 '4.2 out of 5 stars',
 '442',
 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A01603532Z48PVZG5XPUU&url=%2FTrio-Portable-Monitor-14-1-Display%2Fdp%2FB08LP22STD%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3Dmonitor%26qid%3D1629302623%26sr%3D8-1-spons%26psc%3D1%26smid%3DAC2D8CHU2486J&qualifier=1629302623&id=1914416886450696&widgetName=sp_atf')

In [24]:
# show only the price column (index 1) of every collected record
for price in (entry[1] for entry in records):
    print(price)

$309.00
$1,224.00
$129.99
$268.00
$249.99
$296.40
$349.99
$129.97
$104.97
$26.45
$218.00
$636.99
$534.98
$554.99
$199.99
$496.99
$169.99
$196.99
$835.00
$306.32
$19.99
$20.99


## Getting the next page

In [None]:
def get_url(search_term):
    """Generate a paged url template from a search term.

    Args:
        search_term: Free-text query, e.g. 'ultrawide monitor'.

    Returns:
        The search url with the query percent-encoded and one remaining '{}'
        placeholder for the page number; fill it with ``url.format(page)``.
    """
    # imported here so the cell stays runnable on its own
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_2'

    # add term query to url; quote_plus encodes spaces as '+' and escapes
    # '&', '#', '{' and '}', so the term can neither corrupt the query string
    # nor collide with the page placeholder added below
    url = template.format(quote_plus(search_term))

    # add page query to url (left unfilled for the caller)
    url += '&page={}'

    return url

## Putting it all together

In [29]:
def get_url(search_term):
    """Generate a paged Amazon search url template from a search term.

    Args:
        search_term: Free-text query, e.g. 'ultrawide monitor'.

    Returns:
        A url string whose ``k`` parameter holds the percent-encoded query and
        whose ``page`` parameter is still a '{}' placeholder, to be filled in
        with ``url.format(page_number)``.
    """
    # function-scope import so this cell does not depend on the import cell
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_2'

    # add term query to url. quote_plus converts spaces to '+' and escapes
    # reserved characters; escaping '{' and '}' matters here because the
    # result is passed through str.format again for the page number
    encoded_term = quote_plus(search_term)
    url = template.format(encoded_term)

    # add page query to url
    url += '&page={}'

    return url

def extract_record(item):
    """Extract and return data from a single search result.

    Args:
        item: The parsed tag of one search result.

    Returns:
        ``(description, price, rating, review_count, url)`` as a tuple, or
        ``None`` if the result lacks a title link or a price. A missing
        rating or review count is reported as '' without affecting the other.
    """
    # description and url
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        # a result without a link under its h2 tag is skipped rather than
        # letting the AttributeError stop the whole run
        return None
    if not href:
        return None
    # hrefs in the results are relative, so the domain is prepended
    url = 'https://www.amazon.com' + href

    # Error handling: a tag that is not present comes back as None, and
    # reading .text from it raises AttributeError
    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        # no price: the record is dropped
        return None

    # rating and review count each get their own fallback, so that one
    # missing value does not blank out the other
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''

    try:
        review_count = item.find('span', 'a-size-base').text
    except AttributeError:
        review_count = ''

    result = (description, price, rating, review_count, url)

    return result




In [30]:
def main(search_term, pages=20, output_path='results.csv', driver_path=None):
    """Run main program routine: scrape search results and save them to csv.

    Args:
        search_term: Text to search Amazon for.
        pages: Number of result pages to visit, starting from page 1.
        output_path: Path of the csv file to write.
        driver_path: Optional path to the ChromeDriver executable. When
            omitted, Chrome is started without an explicit path, as in the
            webdriver startup cell at the top of this notebook.
    """
    # startup web driver
    if driver_path is None:
        driver = webdriver.Chrome()
    else:
        # passed positionally, the same way the previously hard-coded path was
        # NOTE(review): newer Selenium releases may expect a Service object
        # instead — confirm against the installed version
        driver = webdriver.Chrome(driver_path)

    records = []
    url = get_url(search_term)

    try:
        for page in range(1, pages + 1):
            # fill the page placeholder left in the url by get_url
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                # only keep records that are not empty
                if record:
                    records.append(record)
    finally:
        # quit() ends the whole browser session and the driver process, and it
        # runs even when a page load raises (e.g. the window was closed by hand)
        driver.quit()

    # save data to csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        # notice we use "writerows" for multiple rows, this uses a list,
        # as opposed to "writerow" for a single row
        writer.writerows(records)
        

In [31]:
main('monitor')

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=92.0.4515.131)


In [None]:
# get review text
# get review title
# get review brand, also size and color if possible