In [30]:
import requests
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
from pathlib import Path
import pandas as pd

In [32]:
base_dir = Path.cwd()
data_dir = base_dir / "data" # equal to os.join()

if not data_dir.exists():
    data_dir.mkdir(exist_ok = True)
    
products_links = data_dir / "category-products.csv" 

In [2]:
options = Options()
options.add_argument("--headless") # for selenium to work without opening a web browser
driver = webdriver.Chrome(options = options)

In [3]:
categories = [
    'https://www.amazon.in/gp/bestsellers/books/',
    'https://www.amazon.in/gp/bestsellers/toys/',
    'https://www.amazon.in/gp/bestsellers/toys/1378290031/ref=zg_bs_nav_t_1_t'
]

In [13]:
first_url = categories[1]

In [14]:
driver.get(first_url)
body_el = driver.find_element_by_css_selector('body')
html_str = body_el.get_attribute("innerHTML")
html_obj = HTML(html = html_str)

In [15]:
new_links = [ x for x in html_obj.links if x.startswith('/') and 'product-reviews/' not in x]

In [16]:
# page_links

In [17]:
def scrape_product_page(url, title_lookup = "#titleSection", price_lookup = "#priceblock_ourprice"):
    driver.get(url)
    time.sleep(1.2)
    body_el = driver.find_element_by_css_selector('body')
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html = html_str)
    product_title = html_obj.find(title_lookup, first = True).text
    product_price = html_obj.find(price_lookup,first = True).text
    return product_title, product_price

In [18]:
pattern = r'https://www.amazon.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
# we're naming these       ^^^                  ^^^

In [19]:
regex_options = [
    r'https://www.amazon.in/dp/product/(?P<product_id>[\w-]+)/',
    r'https://www.amazon.in/dp/(?P<product_id>[\w-]+)/',
    r'https://www.amazon.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/'
]

def extract_product_id(url):
    product_id = None
    
    for regex_pattern in regex_options:
        regex = re.compile(regex_pattern)
        my_match = regex.match(url)
        
        if my_match is not None:
            try:
                product_id = my_match['product_id']
                
            except:
                pass
    
    return product_id

In [20]:
page_links = [f'https://www.amazon.in{x}' for x in new_links]

final_page_links = []

for url in page_links:
    prod_id = extract_product_id(url)
    if prod_id is not None:
        final_page_links.append({"url":url, "product_id": prod_id})

In [22]:
def scrape(links = []):
    data_extracted = []
    
    for obj in links:
        link = obj['url']
        prod_id = obj['product_id']
        title, price = (None, None)
        
        try:
            print("inside try")
            title, price = scrape_product_page(link)

        except:
            print("inside except")
            pass

        if title is not None and price is not None:
            print(link)
            print(f'title = {title}, price = {price}')
            
        product_data = {
            'url': link,
            'product_id': prod_id,
            'title': title,
            'price': price
        }
        
        data_extracted.append(product_data)
        
    return data_extracted

In [23]:
extracted_data = scrape(final_page_links)

inside try
https://www.amazon.in/iQKids-Educational-Physical-Chrome-Finish/dp/B07RM3555R/ref=zg_bs_toys_12?_encoding=UTF8&psc=1&refRID=977R9BVH0674JD5TWGY8
title = GeoKraft Educational Political 5 Inches Laminated Small Globe with Plastic Arc and Base / World Globe / Home Decor / Office Decor / Gift Item (Blue), price = ₹ 288.00
inside try
https://www.amazon.in/Plastic-Bullet-Bullets-N-Strike-20-Pieces/dp/B07SGG4YTY/ref=zg_bs_toys_45?_encoding=UTF8&psc=1&refRID=977R9BVH0674JD5TWGY8
title = SYGA Plastic Foam Toy Bullet Dart Bullets for Nerf N-Strike Elite Guns, 20-Pieces, Blue, price = ₹ 159.00
inside try
https://www.amazon.in/Extra-Lovable-Huggable-Girlfriend-Birthday/dp/B07CXLFG5P/ref=zg_bs_toys_1?_encoding=UTF8&psc=1&refRID=977R9BVH0674JD5TWGY8
title = HUG 'n' FEEL SOFT TOYS Long Soft Lovable hugable Cute Giant Life Size Teddy Bear (2 Feet, Red), price = ₹ 449.00
inside try
https://www.amazon.in/Homecute-Type-Jumbo-House-Girls/dp/B07MHLHKX5/ref=zg_bs_toys_32?_encoding=UTF8&psc=1&refR

https://www.amazon.in/Frank-10202-the-Jungle/dp/B007OUAP5E/ref=zg_bs_toys_8?_encoding=UTF8&psc=1&refRID=977R9BVH0674JD5TWGY8
title = Frank - 10202 The Jungle Puzzle For 3 Year Old Kids And Above, price = ₹ 88.00
inside try
https://www.amazon.in/Fisher-Pri-71024-Babys-Blocks/dp/B00005N9YF/ref=zg_bs_toys_39?_encoding=UTF8&psc=1&refRID=977R9BVH0674JD5TWGY8
title = Fisher Price Baby's First Blocks, price = ₹ 299.00
inside try
https://www.amazon.in/Jesper-Orignal-Teddy-Bear-Animals/dp/B089KHDF87/ref=zg_bs_toys_43?_encoding=UTF8&psc=1&refRID=977R9BVH0674JD5TWGY8
title = Jesper Orignal Teddy Bear Animals for Kids - 2 Feet (61 cm, Red), price = ₹ 399.00
inside try
inside except
inside try
inside except
inside try
https://www.amazon.in/Smartcraft-Ultimate-Kitchen-Cooking-Suitcase/dp/B01MU75SMW/ref=zg_bs_toys_36?_encoding=UTF8&psc=1&refRID=977R9BVH0674JD5TWGY8
title = Smartcraft Ultimate Kid Chef Bring Along Kitchen Cooking Suitcase Set (26 Pieces) - Multicolor, price = ₹ 619.00
inside try
https

In [33]:
category_df = pd.DataFrame(extracted_data)

In [38]:
category_df.to_csv(products_links, index=False)