# Amazon Web Scraping: Computer Monitors

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By

In [3]:
# First page of the amazon.in search results being scraped. The literal is split
# across lines for readability only; adjacent string literals are concatenated
# into exactly the same URL.
base_URL = (
    "https://www.amazon.in/s"
    "?i=computers"
    "&rh=n%3A1375425031%2Cp_n_feature_twenty-one_browse-bin"
    "%3A65987097031%7C65987099031%7C65987100031%7C65987101031"
    "%7C65987102031%7C65987104031%7C65987105031"
    "&dc&fs=true&qid=1713963213&rnid=65987095031&ref=sr_pg_1"
)

# Headers sent with every `requests.get` call in this notebook: a desktop
# browser User-Agent string.
my_header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"
    )
}

In [4]:
# Download and parse the first results page.
# `timeout` keeps the cell from hanging forever on a stalled connection, and
# `raise_for_status` stops the notebook immediately on an HTTP error response
# instead of silently parsing an error page that contains no results.
response = requests.get(base_URL, headers=my_header, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Accumulators for the search-listing fields. The scraping cells below append
# to these lists; each one starts out as its own independent empty list.
product_name_list, product_price_list, product_mrp_price_list = [], [], []
review_count_list, ratings_list, link_list = [], [], []

In [6]:
# Product names, one entry per result container.
# The price, MRP, rating and review-count cells all walk
# `div.a-section.a-spacing-base` and append exactly one value per container.
# Collecting names page-wide by class could yield a different number of
# entries and break the equal-length requirement of the final DataFrame, so
# names are collected per container too, with the same placeholder.
for card in soup.select("div.a-section.a-spacing-base"):
    title = card.select_one(".a-size-base-plus")
    product_name_list.append('Does not exist' if title is None else title.get_text())

In [7]:
# Selling price: text of the first `span.a-price-whole` inside each result
# container, or the 'Does not exist' placeholder when the container has none.
for card in soup.select("div.a-section.a-spacing-base"):
    whole_price = card.select_one("span.a-price-whole")
    product_price_list.append(
        whole_price.get_text() if whole_price is not None else 'Does not exist'
    )

In [8]:
# MRP column: text of the first `div.a-section.aok-inline-block` inside each
# result container (presumably the block holding the listed MRP - confirm
# against the page markup), or 'Does not exist' when it is missing.
product_mrp_price_list.extend(
    'Does not exist' if mrp_block is None else mrp_block.get_text()
    for mrp_block in (
        card.select_one("div.a-section.aok-inline-block")
        for card in soup.select("div.a-section.a-spacing-base")
    )
)

In [19]:
# Star-rating text per result container.
# "N Stars & Up" strings are filter labels, not product ratings. They used to
# be skipped without appending anything, which shifted every later rating out
# of line with the other per-container columns. They now get the same
# placeholder as a missing rating, so each container contributes one entry.
RATING_FILTER_LABELS = {'4 Stars & Up', '3 Stars & Up', '2 Stars & Up', '1 Star & Up'}

for card in soup.select("div.a-section.a-spacing-base"):
    star_label = card.select_one("span.a-icon-alt")
    rating_text = None if star_label is None else star_label.get_text()
    if rating_text is None or rating_text in RATING_FILTER_LABELS:
        ratings_list.append('Does not exist')
    else:
        ratings_list.append(rating_text)

In [20]:
# Review count: text of the first `span.a-size-base.s-underline-text` in each
# result container; containers without one get the 'Does not exist' placeholder.
for card in soup.select("div.a-section.a-spacing-base"):
    count_label = card.select_one("span.a-size-base.s-underline-text")
    if count_label is not None:
        review_count_list.append(count_label.get_text())
        continue
    review_count_list.append('Does not exist')

In [9]:
# Product-page link (the anchor's `href`) per result container.
# `select_one` returns None when a container has no matching anchor; calling
# `.get` on it raised AttributeError and aborted the scrape. A None entry is
# stored instead so `link_list` keeps one slot per container.
for card in soup.select("div.a-section.a-spacing-base"):
    title_anchor = card.select_one(
        'a.a-link-normal.s-underline-text.s-underline-link-text.s-link-style.a-text-normal'
    )
    link_list.append(None if title_anchor is None else title_anchor.get("href"))


In [22]:
# Disabled items of the pagination strip. The first one is skipped and the
# text of the remaining ones is kept; the following cell reads
# `page_numbers[0]` as the last page number.
page_num_extract = soup.select("span.s-pagination-item.s-pagination-disabled")
page_numbers = []
for disabled_item in page_num_extract[1:]:
    page_numbers.append(disabled_item.get_text())

In [23]:
# Scrape result pages 2..N, appending exactly one entry per result container
# to every listing column so the lists stay row-aligned.
MISSING = 'Does not exist'
# Filter labels that can show up under `span.a-icon-alt` but are not ratings.
RATING_FILTER_LABELS = {'4 Stars & Up', '3 Stars & Up', '2 Stars & Up', '1 Star & Up'}
CARD_SELECTOR = "div.a-section.a-spacing-base"
TITLE_LINK_SELECTOR = 'a.a-link-normal.s-underline-text.s-underline-link-text.s-link-style.a-text-normal'


def _text_or_missing(node):
    """Return the node's text, or the 'Does not exist' placeholder when node is None."""
    return MISSING if node is None else node.get_text()


def _collect_listing_fields(page_soup):
    """Append name, price, MRP, review count, rating and link for each container of one page.

    Every container contributes exactly one value to each list; a field that
    cannot be found is recorded with a placeholder rather than skipped.
    """
    for card in page_soup.select(CARD_SELECTOR):
        product_name_list.append(_text_or_missing(card.select_one(".a-size-base-plus")))
        product_price_list.append(_text_or_missing(card.select_one("span.a-price-whole")))
        product_mrp_price_list.append(
            _text_or_missing(card.select_one("div.a-section.aok-inline-block"))
        )
        review_count_list.append(
            _text_or_missing(card.select_one("span.a-size-base.s-underline-text"))
        )

        # A filter label is not a rating: record it as missing instead of
        # appending nothing, which would misalign the ratings column.
        rating_text = _text_or_missing(card.select_one("span.a-icon-alt"))
        ratings_list.append(MISSING if rating_text in RATING_FILTER_LABELS else rating_text)

        # A container without the title anchor gets a None link instead of
        # raising AttributeError on `None.get`.
        title_anchor = card.select_one(TITLE_LINK_SELECTOR)
        link_list.append(None if title_anchor is None else title_anchor.get("href"))


if page_numbers:
    final_num = page_numbers[0]
else:
    # Without a last-page marker the page count is unknown; avoid the
    # IndexError and make the limitation visible.
    print("No last-page marker found in the pagination strip; only the first page is scraped.")
    final_num = '1'

for page in tqdm(range(2, int(final_num) + 1)):
    URL = f"https://www.amazon.in/s?i=computers&rh=n%3A1375425031%2Cp_n_feature_twenty-one_browse-bin%3A65987097031%7C65987099031%7C65987100031%7C65987101031%7C65987102031%7C65987104031%7C65987105031&dc&fs=true&page={page}&qid=1713963328&rnid=65987095031&ref=sr_pg_{page}"

    try:
        # The timeout prevents one stalled request from hanging the whole run.
        response = requests.get(URL, headers=my_header, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failed page and carry on rather than parsing an error page.
        tqdm.write(f"Skipping page {page}: {exc}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    _collect_listing_fields(soup)

  0%|          | 0/29 [00:00<?, ?it/s]

100%|██████████| 29/29 [00:45<00:00,  1.55s/it]


In [24]:
# Accumulators for the fields read from each product page. The generator
# yields a fresh list per name, so the six lists are independent objects.
(
    resolution_list,
    screen_size_list,
    brand_list,
    refresh_rate_list,
    monitor_category_list,
    item_model_list,
) = ([] for _ in range(6))

In [25]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.parse import urljoin
from selenium.common.exceptions import WebDriverException

OVERVIEW_TABLE_XPATH = '//*[@id="productOverview_feature_div"]/div/table'
DETAILS_XPATH = '//*[@id="prodDetails"]/div/div[2]'
TECH_SPEC_XPATH = '//*[@id="productDetails_techSpec_section_1"]'


def _section_text(wait, xpath):
    """Return the text of the element at `xpath`, or '' if it cannot be read.

    Only Selenium errors (wait timeout, stale element, ...) are treated as
    "section missing"; anything else, including KeyboardInterrupt, propagates.
    """
    try:
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).text
    except WebDriverException:
        return ''


def _first_line_with(text, label):
    """Return the first line of `text` containing `label`, or the string 'None'."""
    return next((line for line in text.split('\n') if label in line), None) or 'None'


def _load_product_sections(driver, wait, href):
    """Open one product page and return (overview, details, tech_spec) text.

    Each element is '' when the link is missing, the page fails to load, or
    the section does not appear within the wait timeout.
    """
    if not href:
        return '', '', ''
    try:
        # urljoin avoids the 'https://www.amazon.in//...' double slash that
        # plain concatenation produces for hrefs starting with '/'.
        driver.get(urljoin('https://www.amazon.in/', href))
    except WebDriverException as exc:
        tqdm.write(f"Could not open {href}: {exc.msg}")
        return '', '', ''
    return (
        _section_text(wait, OVERVIEW_TABLE_XPATH),
        _section_text(wait, DETAILS_XPATH),
        _section_text(wait, TECH_SPEC_XPATH),
    )


driver = webdriver.Edge()
wait = WebDriverWait(driver, 10)
try:
    for i in tqdm(link_list):
        # The overview table is read once per page and reused for four fields,
        # so a missing table costs one wait timeout instead of four.
        overview_text, details_text, tech_spec_text = _load_product_sections(driver, wait, i)

        # Every list receives exactly one entry per link to stay row-aligned.
        resolution_list.append(_first_line_with(overview_text, 'Display Resolution Maximum'))
        screen_size_list.append(_first_line_with(overview_text, 'Screen Size'))
        brand_list.append(_first_line_with(overview_text, 'Brand'))
        refresh_rate_list.append(_first_line_with(overview_text, 'Refresh Rate'))
        monitor_category_list.append(_first_line_with(details_text, 'Generic Name'))
        item_model_list.append(_first_line_with(tech_spec_text, 'Item model number'))
finally:
    # Always close the browser, even if the loop is interrupted or fails.
    driver.quit()

100%|██████████| 907/907 [1:09:20<00:00,  4.59s/it]


In [None]:
# Assemble the scraped columns into one table and write it to CSV.
columns = {
    'Product_Name': product_name_list,
    'Product_Price': product_price_list,
    'Product_MRP_Price': product_mrp_price_list,
    'Ratings': ratings_list,
    'Review_Count': review_count_list,
    'Screen Resolution': resolution_list,
    'Screen_Size': screen_size_list,
    'Brand_Name': brand_list,
    'Refresh_Rate': refresh_rate_list,
    'Monitor_Category': monitor_category_list,
    'Model_Number': item_model_list,
}

# pandas needs equally long columns; report the actual lengths so a
# misaligned scrape is easy to diagnose.
column_lengths = {name: len(values) for name, values in columns.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Scraped columns have different lengths: {column_lengths}")

df = pd.DataFrame(columns)

# Raw string: the backslashes in this Windows path are literal. Written as a
# normal string, '\G', '\P', '\D' and '\A' are invalid escape sequences.
# TODO: this is an absolute, machine-specific path; consider a configurable
# output directory.
df.to_csv(r'E:\GitHub\Python STUFF\Data Analyst\Portfolio-Projects\Python\AmazonWebScrapData_ComputerMonitors.csv',
          sep=',', index=False, encoding='utf-8')