# Amazon Web Scraper by Lidor ES

In [61]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

## Create an instance of the web driver

In [63]:
# Start a Chrome browser session controlled through Selenium; later cells
# reuse this `driver` to load pages and read their HTML.
driver = webdriver.Chrome()

In [3]:
# Open the Amazon home page in the controlled browser.
url = 'https://www.amazon.com'
driver.get(url)

In [4]:
def get_url(search_term):
    """Generate a search url from search_term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The Amazon search URL with the query URL-encoded into the ``k``
        parameter (spaces become ``+``).
    """
    # Imported locally so this cell works on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # quote_plus also escapes characters such as '&', '#' and '+', which a
    # plain space-to-plus replacement would leave in place and thereby
    # corrupt the query string.
    return template.format(quote_plus(search_term))

In [64]:
# Build the search URL for a sample query, load it in the browser, and print
# it so the generated address can be checked by eye.
url = get_url('ultrawide monitor')
driver.get(url)
print(url)

https://www.amazon.com/s?k=ultrawide+monitor&page={}&ref=nb_sb_noss_1


## Extract the collection

In [7]:
# Parse the HTML of the page currently loaded in the browser, using Python's
# built-in 'html.parser' backend.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [8]:
# Collect every <div> whose data-component-type attribute is
# 's-search-result'; each one is treated below as a single product listing.
results = soup.find_all('div', {'data-component-type': 's-search-result'})

## Prototype the record

In [10]:
# Use the first search result as a sample while prototyping the extraction.
item = results[0]

In [11]:
# The first <a> inside the item's first <h2>; the following cells read the
# product title and link from it.
atag = item.h2.a

In [13]:
# Title text of the link with surrounding whitespace removed.
description = atag.text.strip()

In [15]:
# Prefix the link's href with the site origin (presumably the href is
# site-relative - verify against the page). NOTE: this rebinds the global
# `url`, which previously held the search URL.
url = 'https://www.amazon.com' + atag.get('href')

In [16]:
# First <span> carrying the CSS class 'a-price' (a plain string as the second
# argument to find() is matched against the class attribute).
price_parent = item.find('span', 'a-price')

In [19]:
# Price string taken from the nested <span class="a-offscreen">.
price = price_parent.find('span', 'a-offscreen').text

In [23]:
# Text of the first <i> tag in the item; in the sample record this reads
# like '4.6 out of 5 stars'.
rating = item.i.text

In [27]:
# Text of the first <span class="a-size-base">; used as the review count
# (assumes that span is the count for every listing - TODO confirm).
review_count = item.find('span', {'class': 'a-size-base'}).text

In [33]:
# Source URL of the <img class="s-image"> inside the item.
image_src = item.find('img', {'class': 's-image'})['src']

## Generalize the pattern

In [35]:
def extract_record(_item):
    """Build one record tuple from a single search-result tag.

    This first draft does no error handling: if any expected element is
    missing from ``_item`` the resulting exception propagates to the caller.

    Returns:
        ``(description, price, rating, review_count, url, image_src)``
    """
    # Title link: gives both the description and the product url
    link = _item.h2.a
    title = link.text.strip()
    product_url = 'https://www.amazon.com' + link.get('href')

    # Price text lives in a nested "a-offscreen" span
    price_text = _item.find('span', 'a-price').find('span', 'a-offscreen').text

    # Star rating and review count
    stars = _item.i.text
    reviews = _item.find('span', {'class': 'a-size-base'}).text

    # Image src url
    thumbnail = _item.find('img', {'class': 's-image'})['src']

    return (title, price_text, stars, reviews, product_url, thumbnail)

In [36]:
# Run the extractor over every search-result tag on the parsed page.
results = soup.find_all('div', {'data-component-type': 's-search-result'})
records = []

for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'

## Error Handling

In [37]:
def extract_record(_item):
    """Extract and return data from a single record.

    Args:
        _item: Tag for one search result, as returned by
            ``soup.find_all('div', {'data-component-type': 's-search-result'})``.

    Returns:
        A tuple ``(description, price, rating, review_count, url, image_src)``,
        or ``None`` when the result has no title link or no price, so callers
        can skip it. ``rating`` and ``review_count`` are both ``''`` when
        either one is missing; ``image_src`` is ``''`` when there is no image.
    """
    # Description and url: a result without a title link cannot be recorded,
    # so skip it instead of letting AttributeError/TypeError abort the scrape.
    heading = _item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        return None
    href = atag.get('href')
    if not href:
        return None
    description = atag.text.strip()
    url = 'https://www.amazon.com' + href

    # Price: results without a price are skipped.
    price_parent = _item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # Rating and review count are kept or blanked together: without a rating
    # the first "a-size-base" span may be unrelated text.
    rating_tag = _item.i
    count_tag = _item.find('span', {'class': 'a-size-base'})
    if rating_tag is None or count_tag is None:
        rating = ''
        review_count = ''
    else:
        rating = rating_tag.text
        review_count = count_tag.text

    # Image src url: optional, a missing image must not drop the record.
    image_tag = _item.find('img', {'class': 's-image'})
    image_src = image_tag.get('src', '') if image_tag is not None else ''

    return (description, price, rating, review_count, url, image_src)

In [39]:
# Re-run the extraction, keeping only results that produced a record
# (extract_record returns None for listings it cannot price).
results = soup.find_all('div', {'data-component-type': 's-search-result'})
records = []

for item in results:
    record = extract_record(item)
    if not record:
        continue
    records.append(record)

In [41]:
# Inspect the first extracted record as a sanity check.
records[0]

('LG 29WN600-W 29" 21:9 UltraWide WFHD IPS HDR10 Monitor with FreeSync, Silver',
 '$226.99',
 '4.6 out of 5 stars',
 '1,704',
 'https://www.amazon.com/LG-29WN600-W-29-21-UltraWide/dp/B0876DBCBX/ref=sr_1_1?dchild=1&keywords=ultrawide+monitor&qid=1621236220&sr=8-1',
 'https://m.media-amazon.com/images/I/91WlgTJfawL._AC_UY218_.jpg')

## Getting the next page

In [52]:
def get_url(search_term):
    """Generate a paged search url template from search_term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with the query URL-encoded and a single ``{}``
        placeholder left in the ``page`` parameter, to be filled later with
        ``url.format(page_number)``.
    """
    # Imported locally so this cell works on its own.
    from urllib.parse import quote_plus

    # '{{}}' survives this format() call as a literal '{}' page placeholder.
    template = 'https://www.amazon.com/s?k={}&page={{}}&ref=nb_sb_noss_1'
    # quote_plus percent-encodes braces and '&' as well, so the search term
    # can neither break the later .format(page) call nor the query string.
    return template.format(quote_plus(search_term))
get_url('ultrawide monitor')

'https://www.amazon.com/s?k=ultrawide+monitor&page={}&ref=nb_sb_noss_1'

# Putting it all together

In [81]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

def get_url(search_term):
    """Generate a paged search url template from search_term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with the query URL-encoded and one ``{}`` placeholder
        in the ``page`` parameter; fill it with ``url.format(page_number)``.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    # The doubled braces leave a literal '{}' behind for the page number.
    template = 'https://www.amazon.com/s?k={}&page={{}}&ref=nb_sb_noss_1'
    # Encoding the term escapes '{', '}', '&', '#', ... so that a term such as
    # 'a{b}' no longer raises in format() or splits the query string.
    return template.format(quote_plus(search_term))

def extract_record(_item):
    """Extract and return data from a single record.

    Args:
        _item: Tag for one search result (a ``div`` whose
            ``data-component-type`` is ``'s-search-result'``).

    Returns:
        A tuple ``(description, price, rating, review_count, url, image_src)``
        matching the CSV columns written by ``main``, or ``None`` when the
        result lacks a title link or a price and should be skipped.
        ``rating`` and ``review_count`` are both ``''`` if either is missing;
        ``image_src`` is ``''`` if the result has no image.
    """
    # Description and url. A result without a usable title link is skipped
    # rather than raising and aborting the whole run.
    heading = _item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        return None
    href = atag.get('href')
    if not href:
        return None
    description = atag.text.strip()
    url = 'https://www.amazon.com' + href

    # Price: unpriced results are skipped.
    price_parent = _item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # Rating and review count stand or fall together, because without a
    # rating the first "a-size-base" span may hold unrelated text.
    rating_tag = _item.i
    count_tag = _item.find('span', {'class': 'a-size-base'})
    if rating_tag is None or count_tag is None:
        rating = ''
        review_count = ''
    else:
        rating = rating_tag.text
        review_count = count_tag.text

    # Image src url is optional; a missing image leaves the column blank.
    image_tag = _item.find('img', {'class': 's-image'})
    image_src = image_tag.get('src', '') if image_tag is not None else ''

    return (description, price, rating, review_count, url, image_src)

def _last_page_number(soup):
    """Return the page number read from the disabled pagination items, or 0.

    Scans every ``<li class="a-disabled" aria-disabled="true">`` and keeps the
    last one whose text parses as an integer; non-numeric items are ignored.
    """
    last_page = 0
    for entry in soup.find_all('li', {'class': 'a-disabled', 'aria-disabled': 'true'}):
        try:
            last_page = int(entry.text)
        except ValueError:
            continue
    return last_page


def main(search_term, page_limit=21):
    """Run main program routine.

    Loads the search result pages for ``search_term`` one after another,
    extracts a record from every result, and writes all records to
    ``'AmazoneScraper - <search_term>.csv'`` (relative path).

    Args:
        search_term: Free-text query passed to ``get_url``.
        page_limit: Maximum number of result pages to visit (default 21,
            the previously hard-coded cap).
    """
    # Build the url first so a failure here cannot leave a browser running.
    url = get_url(search_term)
    records = []

    # Create an instance of the web driver
    driver = webdriver.Chrome()
    try:
        for page in range(1, page_limit + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
            # Stop once the next page would be past the last page number the
            # pagination bar reports (0 when no such number is found).
            if _last_page_number(soup) < page + 1:
                break
    finally:
        # quit() ends the whole browser session even if scraping raised;
        # close() would only close the current window.
        driver.quit()

    # Save data to a CSV file
    file_name = 'AmazoneScraper - {}.csv'.format(search_term)
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url', 'ImageUrl'])
        writer.writerows(records)

In [82]:
# Run the full scrape for a sample query; the results are written to
# 'AmazoneScraper - ultrawide monitor.csv'.
main('ultrawide monitor')