In [1]:
import os
import re
import pathlib
import requests
import time
import datetime
import pandas as pd
from requests_html import HTML
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.firefox.options import Options #open browser driver without the actual opening of browser

In [2]:
# Paths are built with pathlib (instead of os.path), relative to the directory
# the notebook is started from.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate exists() check beforehand is redundant.
DATA_DIR.mkdir(exist_ok=True)

# CSV holding the product links collected from the category listing pages.
product_category_output = DATA_DIR / "category-products.csv"
# Second output path in the same directory (not referenced by the cells shown here).
product_output = DATA_DIR / "products.csv"

In [3]:
# Start a Firefox WebDriver session with the "--headless" argument so that no
# browser window is opened. The scraping functions below use this module-level
# `driver`; call driver.quit() once scraping is finished to close the browser.
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

In [4]:
# Reference product URL:
#   https://www.amazon.in/Kaspersky-Security-Latest-Version-Delivery/dp/B073VKKNN9

# Best-seller listing pages to crawl, one entry per category.
categories = [
    {"category": "Home & kitchen", "url": "https://www.amazon.in/gp/bestsellers/kitchen/"},
    {"category": "cars and motorbike", "url": "https://www.amazon.in/gp/bestsellers/automotive/"},
    {"category": "software", "url": "https://www.amazon.in/gp/bestsellers/software/"}
]

# URL patterns that identify a product page. Every pattern must expose the
# product id as the named group "id".
regex = [
    # /<product-slug>/dp/<id>, where the id is followed by "/", "?", "#" or the
    # end of the URL (so the reference URL above, which has no trailing slash,
    # matches too). Dots in the host name are escaped to match literally.
    r"https://www\.amazon\.in/(?P<data>[\w-]+)/dp/(?P<id>[\w-]+)(?:[/?#]|$)",
]
# Other Amazon URL naming schemes, kept for reference only (not enabled):
#     r"https://www.amazon.in/gp/product/(?P<id>[\w-]+)/",
#     r"https://www.amazon.in/dp/(?P<id>[\w-]+)/",

In [5]:
# e.g. "https://www.amazon.in/Kaspersky-Security-Latest-Version-Delivery/dp/B073VKKNN9/ref=..." -> "B073VKKNN9"
def extract_id_from_url(url,regex=regex):
    """Return the product id embedded in ``url``, or ``None`` if no pattern matches.

    Parameters
    ----------
    url : str
        Absolute URL to inspect.
    regex : iterable of str
        Regular-expression patterns tried against the start of ``url``
        (``re.match``). A pattern contributes an id through its named group
        ``id``; patterns without that group are ignored.

    Returns
    -------
    str or None
        The ``id`` group of the last pattern that matched, or ``None``.
    """
    product_id = None
    for pattern in regex:
        match = re.match(pattern, url)
        if match is None:
            continue
        try:
            product_id = match['id']
        except IndexError:
            # The pattern matched but defines no "id" group; nothing to extract.
            continue
    return product_id


In [6]:
# Keep only links that point at a product page; other links (reviews, navigation, ...)
# are dropped because no product id can be extracted from them.
def clean_links(page_links=None, category=None):
    """Filter ``page_links`` down to product links and tag them with their category.

    Parameters
    ----------
    page_links : iterable of str, optional
        Absolute URLs found on a page. ``None`` is treated as no links.
    category : dict, optional
        Category entry with a ``"category"`` key. When ``None``, the
        ``"category"`` field of each returned record is ``None``.

    Returns
    -------
    list of dict
        One ``{"url", "product_id", "category"}`` record per URL for which
        ``extract_id_from_url`` returned an id, in input order.
    """
    final_page_links = []
    for url in page_links or []:
        product_id = extract_id_from_url(url)
        if product_id is None:
            continue
        final_page_links.append({
            "url": url,
            "product_id": product_id,
            "category": category['category'] if category is not None else None,
        })
    return final_page_links

# Load every category listing page and return all product links found on them.
def scrape_and_generate(categories=None,save=False,delay=3.0):
    """Collect product links from each category listing page.

    Parameters
    ----------
    categories : iterable of dict, optional
        Entries with a ``"url"`` key (page to load) and a ``"category"`` key
        (used by ``clean_links``). ``None`` is treated as no categories.
    save : bool
        When true, the links collected so far are written to
        ``product_category_output`` after every category, so the file on disk
        is up to date even if a later page fails.
    delay : float
        Seconds to sleep before each page load.

    Returns
    -------
    list of dict
        The records produced by ``clean_links`` for all categories, concatenated.
    """
    all_links = []
    for category in categories or []:
        time.sleep(delay)
        driver.get(category.get("url"))
        # "css selector" is the locator-strategy string behind By.CSS_SELECTOR;
        # the find_element_by_* helpers no longer exist in current Selenium.
        body_element = driver.find_element("css selector", "body")
        html_obj = HTML(html=body_element.get_attribute("innerHTML"))
        # Only site-relative links are kept and turned into absolute URLs.
        page_links = [f"https://www.amazon.in{x}" for x in html_obj.links if x.startswith("/")]
        all_links += clean_links(page_links=page_links, category=category)
        if save:
            pd.DataFrame(all_links).to_csv(product_category_output, index=False)
    return all_links

In [7]:
# Scrape a single product page (one of the links shortlisted by clean_links).
def scrape_product(url,wait=5.0):
    """Load ``url`` in the shared ``driver`` and read the product title and price.

    Parameters
    ----------
    url : str
        Product page to load.
    wait : float
        Seconds to sleep after ``driver.get`` before the page HTML is read.

    Returns
    -------
    tuple
        ``(product_title, product_price)`` as text. Each item is ``None`` when
        its element is not present in the page, so a missing price does not
        discard a title that was found.
    """
    title = "#productTitle"
    price = "#priceblock_ourprice"
    # An "#upsell-message" field was considered earlier and is not scraped.
    driver.get(url)
    time.sleep(wait)
    # "css selector" is the locator-strategy string behind By.CSS_SELECTOR;
    # the find_element_by_* helpers no longer exist in current Selenium.
    body_element = driver.find_element("css selector", "body")
    html_obj = HTML(html=body_element.get_attribute("innerHTML"))
    title_element = html_obj.find(title, first=True)
    price_element = html_obj.find(price, first=True)
    product_title = title_element.text if title_element is not None else None
    product_price = price_element.text if price_element is not None else None
    return product_title, product_price


In [8]:
# Applied to each row of the links DataFrame, e.g. df.apply(df_row_scrape, axis=1).
def df_row_scrape(row):
    """Scrape the product page referenced by ``row['url']`` and annotate the row.

    Rows whose ``scraping_done`` value is ``1`` or ``"1"`` are returned
    untouched. Otherwise ``title``, ``price``, ``scraping_done`` and
    ``timestamp`` are written into ``row``.

    Parameters
    ----------
    row : pandas.Series or dict
        Must contain ``"url"``; ``"scraping_done"`` is optional.

    Returns
    -------
    The same ``row`` object, updated in place.
    """
    link = row['url']
    done = row.get('scraping_done', 0)
    if done == 1 or done == "1":
        print("skipped")
        return row
    title, price = (None, None)
    try:
        title, price = scrape_product(link)
    except Exception as exc:
        # Best effort: one failing page must not abort the whole run, but the
        # failure is reported. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt working during a long scrape.
        print(f"failed to scrape {link}: {exc!r}")
    # Taken once so the stored and the printed timestamp agree.
    scraped_at = datetime.datetime.now().timestamp()
    row['title'] = title
    row['price'] = price
    # NOTE(review): the row is marked done even when scraping failed, so a
    # later pass will skip it rather than retry — confirm this is intended.
    row['scraping_done'] = 1
    row['timestamp'] = scraped_at
    print(link, title, price, scraped_at)
    return row

## Now let's run the scraper

In [9]:
# Crawl every category listing page; save=True also writes the collected links
# to product_category_output.
d = scrape_and_generate(categories,save=True)

In [10]:
# Reload the saved product links from disk and preview the first rows.
data = pd.read_csv(product_category_output)
data.head()


Unnamed: 0,url,product_id,category
0,https://www.amazon.in/Amazon-Brand-Solimo-Vege...,B07FFG653K,Home & kitchen
1,https://www.amazon.in/Ahmedabad-Cotton-Double-...,B07CN7NJQS,Home & kitchen
2,https://www.amazon.in/Raawan-Durable-Plastic-O...,B0851SZWQS,Home & kitchen
3,https://www.amazon.in/Usha-EI-1602-1000-Watt-L...,B008YW3CYM,Home & kitchen
4,https://www.amazon.in/Pigeon-Stovekraft-Handy-...,B07X2RGYYL,Home & kitchen


In [None]:
# Work on a copy of the first 20 rows only, to keep the run time short.
data_temp = data.iloc[:20,:].copy()
# Scrape each row's product page; df_row_scrape adds the title, price,
# scraping_done and timestamp fields to every row it processes.
# NOTE(review): the recorded output below lists the first URL twice —
# presumably the pandas version used called the function twice on the first
# row; confirm before relying on one request per row.
data_temp = data_temp.apply(df_row_scrape, axis=1)
# title/price are None for rows whose page could not be scraped (for example
# when the page did not load properly).
data_temp.shape

https://www.amazon.in/Amazon-Brand-Solimo-Vegetable-Chopper/dp/B07FFG653K/ref=zg_bs_kitchen_18/259-6753901-6196639?_encoding=UTF8&psc=1&refRID=751EEY1A6SE5C7205BJD Amazon Brand - Solimo 500 ml Large Vegetable Chopper with 3 Blades, Green ₹ 259.00 1598488095.050859
https://www.amazon.in/Amazon-Brand-Solimo-Vegetable-Chopper/dp/B07FFG653K/ref=zg_bs_kitchen_18/259-6753901-6196639?_encoding=UTF8&psc=1&refRID=751EEY1A6SE5C7205BJD Amazon Brand - Solimo 500 ml Large Vegetable Chopper with 3 Blades, Green ₹ 259.00 1598488102.390144
https://www.amazon.in/Ahmedabad-Cotton-Double-Bedsheet-Pillow/dp/B07CN7NJQS/ref=zg_bs_kitchen_27/259-6753901-6196639?_encoding=UTF8&psc=1&refRID=751EEY1A6SE5C7205BJD HUESLAND by Ahmedabad Cotton 144 TC 100% Cotton Double Bedsheet with 2 Pillow Covers - Yellow, Grey ₹ 599.00 1598488118.047068


In [None]:
# Preview the first scraped rows.
data_temp.head()

In [None]:
# Write the scraped rows next to the other outputs, using the DATA_DIR path
# defined in the configuration cell instead of a hard-coded relative string.
data_temp.to_csv(DATA_DIR / "result.csv", index=False)
# Shut down the headless Firefox process, which would otherwise keep running
# invisibly after the notebook is done. Re-run the driver cell to scrape again.
driver.quit()