In [4]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

## Start up the webdriver

In [5]:
# Launch a Selenium-controlled Chrome session; later cells reuse `driver`.
driver = webdriver.Chrome()

In [6]:
# Open the Amazon home page in the controlled browser.
url = 'https://www.amazon.com/'
driver.get(url)

In [7]:
def get_url(search_term):
    """Build the Amazon search-results URL for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with the term percent-encoded into the ``k`` query
        parameter (spaces become ``+``).
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&crid=1YOA34FVDWY3Y'
    # quote_plus still maps spaces to '+', and additionally escapes characters
    # such as '&', '#', '=' or '+' that would otherwise corrupt the query string.
    return template.format(quote_plus(search_term))

In [15]:
url = get_url('ultrawide monitor')
print(url)

# Navigate the browser to the search results. Without this, the page parsed
# in the next section would still be the Amazon home page and no search
# results would be found on a fresh Restart & Run All.
driver.get(url)

https://www.amazon.com/s?k=ultrawide+monitor&crid=1YOA34FVDWY3Y


## Extract the collection

In [18]:
# Parse the HTML currently loaded in the browser using the 'html.parser' backend.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [19]:
# Collect every <div> whose data-component-type attribute is "s-search-result";
# the bare len(...) expression displays how many were found.
results = soup.find_all('div', {'data-component-type': 's-search-result'})
len(results)

22

## Prototype the record

In [20]:
# Use the first search result as the sample for prototyping the field extraction.
item = results[0]


In [21]:
# The first <a> inside the result's first <h2>; its text and href are read below.
atag = item.h2.a

In [22]:
# Link text with surrounding whitespace removed is used as the description.
description = atag.text.strip()

In [23]:
# The href is site-relative and already starts with '/', so the base must not
# end with a slash (otherwise the result contains 'amazon.com//...').
url = 'https://www.amazon.com' + atag.get('href')

In [24]:
# First <span> with CSS class "a-price" inside the result (None if absent).
price_parent = item.find('span', 'a-price')

In [25]:
# The price text is read from the nested <span class="a-offscreen">.
price = price_parent.find('span', 'a-offscreen').text

In [26]:
# Text of the first <i> tag in the result; the sample record shown later
# reads '4.3 out of 5 stars'.
rating = item.i.text

In [27]:
# Text of the first <span> with class "a-size-base" inside the result.
# NOTE(review): in the sample record printed further down this value is '4.3',
# which looks like the rating rather than a review count — confirm the selector
# against the page markup.
review_count = item.find('span', {'class': 'a-size-base'}).text

## Generalize the pattern

In [28]:
def extract_record(item):
    """Extract and return data from a single search result.

    Args:
        item: A BeautifulSoup tag for one search result.

    Returns:
        A tuple ``(description, price, rating, review_count, url)``, or
        ``None`` when the result has no title link (or the link has no
        href) or no price. ``rating`` and ``review_count`` are optional and
        each falls back to ``''`` independently when its tag is missing.
    """
    # description and url -- a result without a usable title link is skipped
    # rather than raising and aborting the caller's loop.
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        return None
    if not href:
        return None
    url = 'https://www.amazon.com' + href

    # price -- required; results without one are skipped.
    try:
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # rating and review count -- looked up separately so that one missing
    # field does not blank out the other.
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''

    # NOTE(review): in the sample output recorded in this notebook this field
    # came back as '4.3', so the selector may be matching the rating badge
    # rather than a review count -- confirm against the page markup.
    count_tag = item.find('span', {'class': 'a-size-base'})
    review_count = count_tag.text if count_tag is not None else ''

    return (description, price, rating, review_count, url)

In [29]:
results = soup.find_all('div', {'data-component-type': 's-search-result'})

# Keep only the results extract_record could parse (it returns None otherwise).
# A comprehension keeps its loop variable local, so the prototype `item`
# defined in an earlier cell is not silently rebound to the last result.
records = [record for record in map(extract_record, results) if record]

In [30]:
# Inspect the first extracted record: (description, price, rating, review_count, url).
records[0]

('INNOCN 40C1R Ultrawide Monitor 40" WQHD 3440 x 1440p 144Hz Monitor AMD FreeSync Premium HDR400 21:9 Computer Monitor 95% DCI-P3 500Nits IPS USB Type-C HDMI Tilt/Height Adjustable Monitor, Mountable',
 '$399.99',
 '4.3 out of 5 stars',
 '4.3',
 'https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo3NzY1NzMxODkzMjE0NTkzOjE2OTIzMTIzODI6c3BfYXRmOjIwMDA1NDYyMjEyMTA5ODo6MDo6&url=%2FINNOCN-Ultrawide-Monitor-FreeSync-Premium%2Fdp%2FB09N3G9T16%2Fref%3Dsr_1_1_sspa%3Fcrid%3DTI8IOVYFNE3P%26keywords%3Dultrawide%2Bmonitor%26qid%3D1692312382%26sprefix%3Dultrawide%2Bmonitor%252Caps%252C84%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1')

In [31]:
# Print the price field (index 1 of each record tuple) for every record.
for row in records:
    print(row[1])

$399.99
$340.99
$184.01
$349.00
$219.99
$847.62
$149.99
$219.97
$999.99
$392.00
$369.99
$499.99
$299.97
$189.23
$1,496.99
$319.99
$129.99
$1,154.99
$486.24
$199.00
$399.99
$299.97


## Getting the next page

In [32]:
def get_url(search_term):
    """Build a paginated Amazon search URL template for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        A URL string with the encoded term filled in and exactly one ``{}``
        placeholder left for the page number; call ``.format(page)`` on it
        to get the URL of a specific results page.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&crid=1YOA34FVDWY3Y'

    # add term query to url; quote_plus maps spaces to '+' and percent-encodes
    # everything else that is unsafe in a query string, including literal
    # '{' / '}' which would otherwise break the caller's later .format(page).
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    url += '&page={}'

    return url

## Putting it all together

In [4]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """Return the search URL template for ``search_term``.

    The returned string has the percent-encoded term in the ``k`` query
    parameter and a single ``{}`` placeholder for the page number, to be
    filled with ``.format(page)``.
    """
    # Local import so the function does not rely on extra top-level imports.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&crid=1YOA34FVDWY3Y'
    # quote_plus keeps the "space -> '+'" behaviour and also escapes '&', '#',
    # '{', '}' etc., so the term can neither corrupt the query string nor
    # collide with the page placeholder when .format(page) is applied later.
    url = template.format(quote_plus(search_term))
    url += '&page={}'
    return url

def extract_record(item):
    """Extract and return data from a single record.

    Args:
        item: A BeautifulSoup tag for one search result.

    Returns:
        A tuple ``(description, price, rating, review_count, url)``.
        Returns ``None`` (after printing a message) when a required field --
        the title link, its href, or the price -- is missing. ``rating`` and
        ``review_count`` are optional and default to ``''`` so that products
        without reviews are still recorded.
    """
    # Required fields: without these the record is not useful.
    try:
        atag = item.h2.a
        description = atag.text.strip()
        url = 'https://www.amazon.com' + atag.get('href')

        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except (AttributeError, TypeError) as e:
        # AttributeError: a tag lookup returned None.
        # TypeError: the link has no href, so str + None failed. Catching it
        # here keeps one malformed result from aborting the whole scrape.
        print("Error while extracting record:", e)
        return None

    # Optional fields: looked up separately so a missing one neither drops
    # the record nor blanks out the other.
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''

    count_tag = item.find('span', {'class': 'a-size-base'})
    review_count = count_tag.text if count_tag is not None else ''

    return (description, price, rating, review_count, url)

def main(search_term, max_pages=20, output_path='amazon_creatine_081723.csv'):
    """Run main program routine: scrape search results and save them to CSV.

    Args:
        search_term: Query passed to ``get_url``.
        max_pages: Number of result pages to fetch, starting at page 1.
            Defaults to 20, the previously hard-coded value.
        output_path: Destination CSV file. Defaults to the previously
            hard-coded file name.

    Any exception raised while scraping or writing is reported with a
    printed message rather than propagated. The browser session is shut
    down whether or not the run succeeds.
    """
    driver = None
    try:
        driver = webdriver.Chrome()
        records = []
        url = get_url(search_term)

        for page in range(1, max_pages + 1):
            print(f"Fetching page {page} ...")
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            # extract_record returns None for results it cannot parse.
            records.extend(
                record for record in map(extract_record, results) if record
            )

        # Save the data to csv file
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
            writer.writerows(records)
        print("Data saved successfully.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window); doing it here avoids leaking a browser when a
        # page load or the CSV write fails part-way through.
        if driver is not None:
            driver.quit()

In [5]:
# Run the full scrape for the search term "creatine"; progress and any
# per-record extraction errors are printed below.
main("creatine")

Fetching page 1 ...
Fetching page 2 ...
Fetching page 3 ...
Fetching page 4 ...
Fetching page 5 ...
Error while extracting record: 'NoneType' object has no attribute 'text'
Error while extracting record: 'NoneType' object has no attribute 'text'
Error while extracting record: 'NoneType' object has no attribute 'text'
Error while extracting record: 'NoneType' object has no attribute 'text'
Fetching page 6 ...
Error while extracting record: 'NoneType' object has no attribute 'text'
Error while extracting record: 'NoneType' object has no attribute 'text'
Fetching page 7 ...
Error while extracting record: 'NoneType' object has no attribute 'text'
Fetching page 8 ...
Fetching page 9 ...
Fetching page 10 ...
Fetching page 11 ...
Fetching page 12 ...
Fetching page 13 ...
Error while extracting record: 'NoneType' object has no attribute 'text'
Fetching page 14 ...
Fetching page 15 ...
Fetching page 16 ...
Fetching page 17 ...
Error while extracting record: 'NoneType' object has no attribute 't