# Amazon web scraper

In [1]:
import csv
from bs4 import BeautifulSoup

In [2]:
from selenium import webdriver

# Starting up the web driver

In [3]:
# Launch a Selenium-controlled Chrome session; later cells reuse `driver`.
driver = webdriver.Chrome()

In [4]:
# Open the Amazon India home page in the browser window.
url = 'https://www.amazon.in'
driver.get(url)

In [5]:
def get_url(search_term):
    """Generate an Amazon India search-results URL for a search term.

    Args:
        search_term: Free-text query, e.g. 'ultrawide monitors'.

    Returns:
        The search URL as a string, with the term inserted into both the
        `k` and `sprefix` query parameters.
    """
    # Local import keeps this cell self-contained in the notebook.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&crid=20LK9C77DTUF2&sprefix={}%2Caps%2C225&ref=nb_sb_noss_1'
    # quote_plus turns spaces into '+' (as before) and also percent-encodes
    # reserved characters such as '&' or '#', which would otherwise corrupt
    # the query string.
    encoded_term = quote_plus(search_term)
    return template.format(encoded_term, encoded_term)

In [6]:
# Build the search-results URL for a sample query and print it for inspection.
# Note: this rebinds the global `url` set by the earlier home-page cell.
url = get_url('ultrawide monitors')
print(url)

https://www.amazon.in/s?k=ultrawide+monitors&crid=20LK9C77DTUF2&sprefix=ultrawide+monitors%2Caps%2C225&ref=nb_sb_noss_1


In [7]:
# Navigate the browser to the search-results URL currently held in `url`.
driver.get(url)

# Extracting the data

In [8]:
# Parse the HTML of the page currently loaded in the browser.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [9]:
# Collect every <div> whose data-component-type attribute is 's-search-result';
# each one is treated as a single search hit by the cells below.
results = soup.find_all('div', {'data-component-type' : 's-search-result'})
len(results)  # how many result containers were found on this page

22

# Prototype the record

In [10]:
# Use the first search result as the sample for prototyping the extraction.
item = results[0]


In [11]:
# The first <a> inside the result's first <h2>; used below as the product link.
atag = item.h2.a

In [12]:
# Link text with surrounding whitespace removed serves as the description.
description = atag.text.strip()

In [13]:
# Prefix the link's href with the site root to get an absolute product URL.
# Note: this rebinds the global `url` that previously held the search URL.
url = 'https://www.amazon.in' + atag.get('href')

In [14]:
# The price text is read from a span with class 'a-offscreen' nested inside
# the result's span with class 'a-price'.
price_parent = item.find('span', 'a-price')
price = price_parent.find('span', 'a-offscreen').text

In [15]:
# Text of the first <i> element in the result, e.g. '4.2 out of 5 stars'.
rating = item.i.text

In [16]:
# Text of the review-count span, e.g. '(131)'.
review_count = item.find('span', {'class': 'a-size-base s-underline-text'}).text



# Generalize the pattern

In [17]:
def extract_record(item):
    """Extract and return the data fields of a single search result.

    Args:
        item: Parsed search-result element exposing `.h2.a`, `.i` and
            `.find(...)` (a BeautifulSoup tag in this notebook).

    Returns:
        A tuple `(description, url, price, rating, review_count)`.

    Raises:
        AttributeError: If any expected element is missing from the result;
            this first version performs no error handling.
    """
    link = item.h2.a
    title = link.text.strip()
    product_url = 'https://www.amazon.in' + link.get('href')

    # Price text lives in span.a-offscreen nested under span.a-price.
    price_text = item.find('span', 'a-price').find('span', 'a-offscreen').text

    stars = item.i.text
    reviews = item.find('span', {'class': 'a-size-base s-underline-text'}).text

    return (title, product_url, price_text, stars, reviews)

In [18]:
# Run the extractor over every search result on the current page.
records = []
results = soup.find_all('div', {'data-component-type' : 's-search-result'})

for item in results:
    # No guard here: a result lacking an expected element makes
    # extract_record raise AttributeError and aborts the loop.
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'

# Error handling

In [18]:
def extract_record(item):
    """Extract and return the data fields of a single search result.

    Args:
        item: Parsed search-result element exposing `.h2.a`, `.i` and
            `.find(...)` (a BeautifulSoup tag in this notebook).

    Returns:
        A tuple `(description, url, price, rating, review_count)`, or
        `None` when the result has no price. `rating` and `review_count`
        each fall back to `''` independently when their element is missing.

    Raises:
        AttributeError: If the result has no `<h2>`/`<a>` title link.
    """
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.in' + atag.get('href')

    # A price is mandatory: results without one are skipped by the caller.
    price_parent = item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # Rating and review count are looked up separately so that a missing
    # review count no longer discards a rating that is present (and vice
    # versa).
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''

    review_tag = item.find('span', {'class': 'a-size-base s-underline-text'})
    review_count = review_tag.text if review_tag is not None else ''

    return (description, url, price, rating, review_count)

In [19]:
# Re-run the extraction with the error-handling extract_record.
records = []
results = soup.find_all('div', {'data-component-type' : 's-search-result'})

for item in results:
    record = extract_record(item)
    # extract_record returns None for results without a price; skip those.
    if record:
        records.append(record)
        
 

In [20]:
# Spot-check one extracted record.
records[8]

('Acer CB342CK 34 Inch (86.36 Cm) IPS Ultrawide (21:9) QHD 3440 X 1440 LCD Monitor with LED Backlight I AMD Radeon Freesync I HDR Ready, 1MS VRB I 75Hz Refresh | Pivot I Eye Care Features, (Silver)',
 'https://www.amazon.in/Acer-CB342CK-UltraWide-FREESYNC-Technology/dp/B09LD6HV1Q/ref=sr_1_9?crid=20LK9C77DTUF2&keywords=ultrawide+monitors&qid=1672488685&sprefix=ultrawide+monitors%2Caps%2C225&sr=8-9',
 '₹28,990',
 '4.2 out of 5 stars',
 '(131)')

In [21]:
# Print only the price field (tuple index 2) of every record.
for row in records:
    print(row[2])

₹35,777
₹38,333
₹17,100
₹17,999
₹21,990
₹27,711
₹20,888
₹20,177
₹28,990
₹28,900
₹73,600
₹8,349
₹45,890
₹38,333
₹17,480
₹39,926
₹69,990
₹15,990
₹17,199
₹7,999


# Getting the next page

In [22]:
def get_url(search_term):
    """Generate a paged Amazon India search URL template for a search term.

    Args:
        search_term: Free-text query, e.g. 'ultrawide monitors'.

    Returns:
        A URL string ending in '&page={}'; the caller fills in the page
        number with `str.format`.
    """
    # Local import keeps this cell self-contained in the notebook.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&crid=20LK9C77DTUF2&sprefix={}%2Caps%2C225&ref=nb_sb_noss_1'
    # quote_plus turns spaces into '+' (as before) and percent-encodes
    # everything else that is unsafe in a query string. That includes '{'
    # and '}', which would otherwise be treated as placeholders when the
    # caller later runs `.format(page)` on the returned URL.
    encoded_term = quote_plus(search_term)
    url = template.format(encoded_term, encoded_term)

    # Leave a placeholder for the page number.
    url += '&page={}'

    return url

# Putting it all together

In [27]:
import csv
from bs4 import BeautifulSoup

from selenium import webdriver

def get_url(search_term):
    """Generate a paged Amazon India search URL template for a search term.

    Args:
        search_term: Free-text query, e.g. 'ultrawide monitors'.

    Returns:
        A URL string ending in '&page={}'; the caller fills in the page
        number with `str.format`.
    """
    # Local import so the function does not depend on other cells' imports.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&crid=20LK9C77DTUF2&sprefix={}%2Caps%2C225&ref=nb_sb_noss_1'
    # quote_plus turns spaces into '+' (as before) and percent-encodes
    # everything else that is unsafe in a query string. That includes '{'
    # and '}', which would otherwise be treated as placeholders when the
    # caller later runs `.format(page)` on the returned URL.
    encoded_term = quote_plus(search_term)
    url = template.format(encoded_term, encoded_term)

    # Leave a placeholder for the page number.
    url += '&page={}'

    return url

def extract_record(item):
    """Extract and return the data fields of a single search result.

    Args:
        item: Parsed search-result element exposing `.h2.a`, `.i` and
            `.find(...)` (a BeautifulSoup tag in this notebook).

    Returns:
        A tuple `(description, url, price, rating, review_count)`, or
        `None` when the result has no price. `rating` and `review_count`
        each fall back to `''` independently when their element is missing.

    Raises:
        AttributeError: If the result has no `<h2>`/`<a>` title link.
    """
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.in' + atag.get('href')

    # A price is mandatory: results without one are skipped by the caller.
    price_parent = item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # Rating and review count are looked up separately so that a missing
    # review count no longer discards a rating that is present (and vice
    # versa).
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''

    review_tag = item.find('span', {'class': 'a-size-base s-underline-text'})
    review_count = review_tag.text if review_tag is not None else ''

    return (description, url, price, rating, review_count)

def main(search_term, max_pages=20, output_path='results.csv'):
    """Scrape Amazon India search results for a term and save them as CSV.

    Args:
        search_term: Free-text query passed to `get_url`.
        max_pages: Number of result pages to visit, starting at page 1.
            Defaults to 20, the previously hard-coded value.
        output_path: Destination CSV file. Defaults to 'results.csv', the
            previously hard-coded path.

    Returns:
        None. The records are written to `output_path`.
    """
    # Start up the web driver.
    driver = webdriver.Chrome()

    records = []
    url = get_url(search_term)

    try:
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                # extract_record returns None for results without a price.
                if record:
                    records.append(record)
    finally:
        # Always shut the browser down, even if a page load or extraction
        # fails. quit() ends the whole WebDriver session, whereas close()
        # only closes the current window.
        driver.quit()

    # Save the results to a CSV file.
    with open(output_path, 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'URL', 'Price', 'Rating', 'Review Count'])
        writer.writerows(records)
            
        

In [28]:
# Run the full scrape for the sample query; main() writes results.csv.
main('ultrawide monitors')