# Q10. Amazon.in webpage scraping

1. Go to webpage https://www.amazon.in/
2. Enter “Laptop” in the search field and then click the search icon.
3. Then set CPU Type filter to “Intel Core i7” and “Intel Core i9” as shown in the image.
4. After setting the filters, scrape the data of the first 10 laptops.
5. You have to scrape 3 attributes for each laptop:
   1. Title
   2. Ratings
   3. Price
6. Finally create a dataframe of the scraped data.

In [1]:
#Importing required libraries
import pandas as pd
import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [2]:
#Connect to web driver
#The r prefix makes this a raw string, so the path reaches Selenium exactly as typed
chromedriver_path=r"D://chromedriver.exe"
#executable_path tells Selenium which chromedriver binary to launch
driver=webdriver.Chrome(executable_path=chromedriver_path)

In [3]:
#Getting the website to driver
home_page='https://www.amazon.in/'
#Running this cell opens the page in the browser window controlled by the driver
driver.get(home_page)

In [4]:
#Searching laptop in the search bar and clicking the search button
#find_element(By.ID, ...) is used instead of find_element_by_id, which newer Selenium releases no longer provide
search_bar=driver.find_element(By.ID,'twotabsearchtextbox')
search_bar.send_keys("laptop")

driver.find_element(By.ID,'nav-search-submit-button').click()

In [5]:
#Locating the filters from the webpage
#Filtering Intel Core i7 from filters
#find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath, which newer Selenium releases no longer provide
filter1=driver.find_elements(By.XPATH,"//a[@class='a-link-normal s-navigation-item']/span")
#Click the first filter whose visible label matches; nothing is clicked if the label is absent
for option in filter1:
    if option.text=='Intel Core i7':
        option.click()
        break   #Stop after the first match

In [6]:
#Filtering Intel Core i9 from filters
#The filter links are located again here rather than reusing the list from the previous cell
#find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath, which newer Selenium releases no longer provide
filter1=driver.find_elements(By.XPATH,"//a[@class='a-link-normal s-navigation-item']/span")
#Click the first filter whose visible label matches; nothing is clicked if the label is absent
for option in filter1:
    if option.text=='Intel Core i9':
        option.click()
        break   #Stop after the first match

In [7]:
#Specifying the url of the webpage to be scraped
#The address is written one query parameter per line; adjacent string literals are joined into a single string
#NOTE(review): the rh parameter appears to carry the CPU-type filter ids selected above - confirm
url=(
    "https://www.amazon.in/s"
    "?k=laptop"
    "&i=computers"
    "&rh=n%3A1375424031%2Cp_n_feature_thirteen_browse-bin%3A12598163031%7C16757432031"
    "&dc"
    "&qid=1617789939"
    "&rnid=12598141031"
    "&ref=sr_nr_p_n_feature_thirteen_browse-bin_17"
)
driver.get(url)

In [8]:
#Extracting the tags having the product title
#find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath, which newer Selenium releases no longer provide
#The result is a list of WebElement objects; their text is read in a later cell
title=driver.find_elements(By.XPATH,"//span[@class='a-size-medium a-color-base a-text-normal']")
title

[<selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="f6a39d08-ff84-4ad6-b752-17deb7e48a5d")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="fc9db6ec-5c9f-4eee-9647-2de0b4f2f7f8")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="d99c1c17-a367-4921-b669-019b9a0eaa69")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="40ba4093-714b-456f-8cbf-abe590036067")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="74f113f7-e982-42d4-9f57-71723ee85f44")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="9c3bf750-0e29-456e-b238-4846466908a4")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="9b5aac4d-f052-426f-b6f3-9e

In [9]:
#Extracting the text from the tags
#Only the first 10 product results are needed, so the tag list is sliced before reading .text
prod_title=[tag.text for tag in title[:10]]
prod_title

['(Renewed) HP ZBook 15 G3 Mobile Workstation - 15.6" - Core i7 (6700HQ) - 32 GB RAM - 1TB SSD : BLUETOOTH : 2GB DEDICATED GRAPHIC : NUMERIC KEYBOARD',
 "(Renewed) Lenovo ThinkPad X1 Carbon Laptop (CORE I7 6TH GEN/8GB/256GB SSD/WEBCAM/14''/WIN 10 PRO)",
 "(Renewed) HP EliteBook 820 G4 Laptop (CORE I5 7TH GEN/8GB/512GB SSD/WEBCAM/12.5'' TOUCH/WIN 10 PRO)",
 'Dell Inspiron 5406 14" FHD Touch Display 2in1 Laptop (11th Gen i7-1165G7 / 8Gb / 512Gb SSD / Integrated Graphics / Win 10 + MSO / Titan Grey Color / FPR) D560414WIN9S',
 'HP Pavilion (2021) Thin & Light 11th Gen Core i7 Laptop, 16 GB RAM, 1TB SSD, Iris Xe Graphics, 14-inch FHD Screen, Windows 10, MS Office, Backlit Keyboard',
 'Mi Notebook Horizon Edition 14 Intel Core i5-10210U 10th Gen Thin and Light Laptop(8GB/512GB SSD/Windows 10/Nvidia MX350 2GB Graphics/Grey/1.35Kg), XMA1904-AR+Webcam',
 '(Renewed) HP Pavilion 15-Cs1002txin 2018 15.6-Inch Laptop (8th Gen Core I7-8565u/8GB/1TB/Windows 10 Home/4GB Nvidia Geforce Mx150 Graphics)'

In [10]:
#Extracting the tags having the price of the product
#find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath, which newer Selenium releases no longer provide
#The result is a list of WebElement objects; their text is read in a later cell
price=driver.find_elements(By.XPATH,"//span[@class='a-price']")
price

[<selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="f36a42eb-108a-47d1-91d0-e63bccc6b1da")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="19129940-84f3-4758-ae04-bb8d84eda4f4")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="5276b6c2-fa03-47cc-afd2-9c68b878d986")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="9c33fbc8-d26a-4d6c-9775-38b5eb525b73")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="a28e653f-b259-47ae-b269-3791ef781370")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="82da3316-0a36-492a-a6f2-2cb57b2be2cc")>,
 <selenium.webdriver.remote.webelement.WebElement (session="797a10e1445f0617f4203c4fa811606b", element="57245cf4-3783-4895-9987-23

In [11]:
#Extracting the text from the tags
#Only the first 10 product results are needed, so the tag list is sliced before reading .text
prod_price=[tag.text for tag in price[:10]]
prod_price

['₹86,990',
 '₹55,290',
 '₹40,790',
 '₹86,200',
 '₹79,990',
 '₹54,999',
 '₹68,490',
 '₹1,35,490',
 '₹47,190',
 '₹76,500']

In [12]:
#Ratings are read from each product's own page, so first collect the urls of the laptops
#find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath, which newer Selenium releases no longer provide
laptop_urls=driver.find_elements(By.XPATH,"//a[@class='a-link-normal a-text-normal']")

#Keeping only the href (the url alone) of the first 10 laptop links
URL=[link.get_attribute('href') for link in laptop_urls[:10]]
URL

['https://www.amazon.in/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_computers_sr_pg1_1?ie=UTF8&adId=A09956592AR8PX5LFZ4HI&url=%2FRenewed-HP-ZBook-Mobile-Workstation%2Fdp%2FB0912CL6PC%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3Dlaptop%26qid%3D1617794191%26refinements%3Dp_n_feature_thirteen_browse-bin%253A12598163031%257C16757432031%26rnid%3D12598141031%26s%3Dcomputers%26sr%3D1-1-spons%26psc%3D1&qualifier=1617794191&id=7007039792524293&widgetName=sp_atf',
 'https://www.amazon.in/Renewed-Lenovo-ThinkPad-X1-Carbon/dp/B08XVM4XB5/ref=sr_1_2?dchild=1&keywords=laptop&qid=1617794191&refinements=p_n_feature_thirteen_browse-bin%3A12598163031%7C16757432031&rnid=12598141031&s=computers&sr=1-2',
 'https://www.amazon.in/Renewed-HP-EliteBook-820-Laptop/dp/B091GQFCLK/ref=sr_1_3?dchild=1&keywords=laptop&qid=1617794191&refinements=p_n_feature_thirteen_browse-bin%3A12598163031%7C16757432031&rnid=12598141031&s=computers&sr=1-3',
 'https://www.amazon.in/Dell-Inspiron-i7-1165G7-Integrated-D560414WIN9S

In [13]:
#Extracting the ratings of the laptops; some products don't have any ratings,
#so a missing element is caught and recorded as 'No Rating'
Ratings=[]   #Empty list
#Loop for every laptop in the list
for laptop_url in URL:   #Separate loop name so the search-page `url` variable is not overwritten
    driver.get(laptop_url)
    try:   #Exception handling by using NoSuchElementException
        #find_element(By..., ...) is used instead of the find_element_by_* helpers, which newer Selenium releases no longer provide
        prod_rating=driver.find_element(By.ID,'acrCustomerReviewText')  #Locating the rating link
        prod_rating.click()
        rating=driver.find_element(By.XPATH,"//span[@class='a-size-medium a-color-base']") #Locating the rating tags
        Ratings.append(rating.text)  #Appending the text from tags to the list
    except NoSuchElementException:
        Ratings.append('No Rating')  #Appending message for products having no ratings

In [14]:
#Checking out the length of the data extracted
#Prints the three list lengths on one line, separated by spaces
print(*(len(column) for column in (prod_title,prod_price,Ratings)))

10 10 10


In [15]:
#Creating a new dataframe for saving the data
#All three columns are passed in one constructor call; the dict order sets the column order
amazon=pd.DataFrame({
    'Product name':prod_title,
    'Price':prod_price,
    'Rating':Ratings,
})
amazon

Unnamed: 0,Product name,Price,Rating
0,(Renewed) HP ZBook 15 G3 Mobile Workstation - ...,"₹86,990",No Rating
1,(Renewed) Lenovo ThinkPad X1 Carbon Laptop (CO...,"₹55,290",No Rating
2,(Renewed) HP EliteBook 820 G4 Laptop (CORE I5 ...,"₹40,790",No Rating
3,"Dell Inspiron 5406 14"" FHD Touch Display 2in1 ...","₹86,200",No Rating
4,HP Pavilion (2021) Thin & Light 11th Gen Core ...,"₹79,990",4.3 out of 5
5,Mi Notebook Horizon Edition 14 Intel Core i5-1...,"₹54,999",4.3 out of 5
6,(Renewed) HP Pavilion 15-Cs1002txin 2018 15.6-...,"₹68,490",No Rating
7,"Lenovo Legion 5Pi 10th Gen Intel Core i7 15.6""...","₹1,35,490",4.3 out of 5
8,(Renewed) Lenovo Thinkpad Yoga S1 Laptop (CORE...,"₹47,190",1 out of 5
9,HP 14 Thin & Light 14-inch FHD Laptop (11th Ge...,"₹76,500",4.6 out of 5


In [16]:
#Saving the data into a csv file
#index=False keeps the dataframe's row numbers out of the file, so the csv holds only the three scraped columns
amazon.to_csv("Amazon_Laptops.csv",index=False)

In [17]:
#Closing the driver
#quit() closes every browser window and ends the chromedriver session; close() would only close the current window
driver.quit()