In [1]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

## Start up the webdriver

In [2]:
# Launch a Selenium-controlled Chrome session; the cells below load and read
# pages through `driver`.
driver = webdriver.Chrome()

In [3]:
# Open the Amazon home page in the browser session.
url = 'https://www.amazon.com/'
driver.get(url)

In [4]:
def get_url(search_term):
    """Build the Amazon search URL for ``search_term``.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. ``'ultrawide monitor'``.

    Returns
    -------
    str
        The search URL with the query URL-encoded: spaces become ``+`` and
        reserved characters (``&``, ``#``, ``+``, ...) are percent-encoded so
        they cannot alter the query string.
    """
    # function-scope import keeps this cell runnable on its own
    from urllib.parse import quote_plus

    # NOTE(review): the crid value looks copied from a browser session and is
    # presumably optional -- confirm before relying on it.
    template = 'https://www.amazon.com/s?k={}&crid=1YOA34FVDWY3Y'
    return template.format(quote_plus(search_term))

In [5]:
# Build the search URL, show it, and load it in the browser so that
# driver.page_source holds the search-results page when it is parsed below.
url = get_url('ultrawide monitor')
print(url)
driver.get(url)

https://www.amazon.com/s?k=ultrawide+monitor&crid=1YOA34FVDWY3Y


## Extract the collection

In [39]:
# Parse the HTML of the page currently loaded in the browser.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [40]:
# Collect every <div> whose data-component-type attribute is 's-search-result',
# then display how many were found.
results = soup.find_all('div', {'data-component-type': 's-search-result'})
len(results)

22

## Prototype the record

In [41]:
# Prototype the field extraction against the first search result.
item = results[0]


In [42]:
# The link inside the result's <h2> heading; its text and href are read next.
atag = item.h2.a

In [43]:
# Link text with surrounding whitespace removed, used as the description.
description = atag.text.strip()

In [44]:
# The href is site-relative and already starts with '/', so the base must not
# end with one (otherwise the result contains 'amazon.com//').
url = 'https://www.amazon.com' + atag.get('href')

In [45]:
# First <span> carrying the CSS class 'a-price' (a string passed as the second
# argument to find() is matched against the class attribute).
price_parent = item.find('span', 'a-price')

In [46]:
# Price text comes from the nested 'a-offscreen' span; in the sample run the
# extracted prices look like '$89.97'.
price = price_parent.find('span', 'a-offscreen').text

In [47]:
# Text of the first <i> tag in the result; in the sample run this is the star
# rating, e.g. '4.6 out of 5 stars'.
rating = item.i.text

In [51]:
# Text of the first <span> with class 'a-size-base'; in the sample run this is
# the review count.
# NOTE(review): 'a-size-base' looks like a generic styling class, so this may
# pick up other text on some results -- confirm against more items.
review_count = item.find('span', {'class': 'a-size-base'}).text

'(18.5K+)'

## Generalize the pattern

In [52]:
def extract_record(item):
    """Extract and return data from a single record.

    Parameters
    ----------
    item
        One search-result element (as returned by ``soup.find_all``).

    Returns
    -------
    tuple or None
        ``(description, price, rating, review_count, url)``. ``None`` is
        returned when the result has no title link or no price, so callers
        can skip it. ``rating`` and ``review_count`` each fall back to ``''``
        independently when their element is missing.
    """
    # description and url -- a result without a heading link cannot be
    # recorded, so skip it instead of letting AttributeError abort the scrape
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        return None
    if not href:
        return None
    url = 'https://www.amazon.com' + href

    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # rating and review count are looked up separately so that a missing
    # review count does not discard a rating that is present
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''

    try:
        review_count = item.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        review_count = ''

    return (description, price, rating, review_count, url)

In [53]:
# Run the extractor over every search-result container on the parsed page and
# keep only the items that produced a record.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for item in results:
    record = extract_record(item)
    if not record:
        # extract_record signals "skip this item" with a falsy return
        continue
    records.append(record)

In [54]:
# Inspect the first extracted record.
records[0]

('Sceptre Curved 24-inch Gaming Monitor 1080p R1500 98% sRGB HDMI x2 VGA Build-in Speakers, VESA Wall Mount Machine Black (C248W-1920RN Series)',
 '$89.97',
 '4.6 out of 5 stars',
 '(18.5K+)',
 'https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo3MDQ4NDU2MDA1ODU3NTkzOjE2OTIyOTczNTQ6c3BfYXRmOjIwMDE2NTE0Njc5Njg5ODo6MDo6&url=%2FSceptre-Curved-Monitor-Speakers-C248W-1920RN%2Fdp%2FB07KXSR99Y%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dultrawide%2Bmonitor%26qid%3D1692297354%26sprefix%3D%252Caps%252C127%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1')

In [55]:
# Print the price field (tuple position 1) of every extracted record.
for _description, price_text, *_rest in records:
    print(price_text)

$89.97
$249.99
$184.01
$349.00
$219.99
$219.99
$199.98
$218.98
$847.62
$392.00
$999.99
$149.99
$369.99
$299.97
$189.23
$441.12
$499.99
$159.99
$399.99
$129.99
$500.70
$749.99


## Getting the next page

In [57]:
def get_url(search_term):
    """Build a paged Amazon search URL template for ``search_term``.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. ``'ultrawide monitor'``.

    Returns
    -------
    str
        The search URL ending in ``&page={}``; call ``.format(page_number)``
        on it to get the URL of a specific results page. The query is
        URL-encoded, so reserved characters cannot alter the query string and
        literal braces in the term cannot break the later ``.format`` call.
    """
    # function-scope import keeps this cell runnable on its own
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&crid=1YOA34FVDWY3Y'

    # add term query to url (spaces -> '+', everything unsafe percent-encoded)
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    url += '&page={}'

    return url

## Putting it all together

In [59]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """Build a paged Amazon search URL template for ``search_term``.

    Parameters
    ----------
    search_term : str
        Free-text query, e.g. ``'ultrawide monitor'``.

    Returns
    -------
    str
        The search URL ending in ``&page={}``; call ``.format(page_number)``
        on it to get the URL of a specific results page. The query is
        URL-encoded, so reserved characters cannot alter the query string and
        literal braces in the term cannot break the later ``.format`` call.
    """
    # function-scope import keeps this cell runnable on its own
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&crid=1YOA34FVDWY3Y'

    # add term query to url (spaces -> '+', everything unsafe percent-encoded)
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    url += '&page={}'

    return url

def extract_record(item):
    """Extract and return data from a single record.

    Parameters
    ----------
    item
        One search-result element (as returned by ``soup.find_all``).

    Returns
    -------
    tuple or None
        ``(description, price, rating, review_count, url)``. ``None`` is
        returned when the result has no title link or no price, so callers
        can skip it. ``rating`` and ``review_count`` each fall back to ``''``
        independently when their element is missing.
    """
    # description and url -- a result without a heading link cannot be
    # recorded, so skip it instead of letting AttributeError abort the scrape
    try:
        atag = item.h2.a
        description = atag.text.strip()
        href = atag.get('href')
    except AttributeError:
        return None
    if not href:
        return None
    url = 'https://www.amazon.com' + href

    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # rating and review count are looked up separately so that a missing
    # review count does not discard a rating that is present
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''

    try:
        review_count = item.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        review_count = ''

    return (description, price, rating, review_count, url)

def main(search_term):
    """Run main program routine.

    Opens a Chrome session, visits search-result pages 1-20 for
    ``search_term``, collects every record that ``extract_record`` returns,
    and writes them to ``amazonmultiple.csv`` in the working directory.

    Parameters
    ----------
    search_term : str
        Free-text query passed to ``get_url``.
    """
    # Startup the webdriver
    driver = webdriver.Chrome()

    # local accumulator, so results never leak in from an earlier cell
    records = []
    url = get_url(search_term)

    try:
        for page in range(1, 21):
            # get_url() leaves a '{}' placeholder for the page number
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # always end the browser session, even if a page fails to load or parse
        driver.quit()

    # Save the data to csv file
    with open('amazonmultiple.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)

In [60]:
# Run the scraper for this search term; this opens a Chrome window and visits
# result pages 1-20.
main("ultrawide monitor")