In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import multiprocessing as mp
import time



In [None]:
# Launch a Chrome session through the chromedriver binary at this absolute,
# machine-specific path and bind it to the notebook-level name `driver`.
# NOTE(review): no ChromeOptions are passed here, unlike get_asin_result, which
# builds its own driver with the options from get_chrome_options().
driver = webdriver.Chrome('/home/jitesh/Documents/chromedriver/chromedriver')

In [2]:
# HTTP headers carrying a desktop-Chrome User-Agent string.
# NOTE(review): none of the functions visible in this notebook reference
# `headers`; presumably left over from a `requests`-based attempt -- confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

In [3]:
def search_asin(driver, asin):
    """Open the amazon.in product page for ``asin`` and return its HTML.

    The requested URL is printed first. If ``driver.get`` raises any
    ``Exception`` the function returns ``None``; otherwise it returns
    ``driver.page_source``.
    """
    product_url = "https://www.amazon.in/dp/" + asin
    print(product_url)
    try:
        driver.get(product_url)
    except Exception:
        # Navigation failed: signal "no page" to the caller.
        return None
    else:
        return driver.page_source

In [4]:
def extract_main_item_data(soup):
    """Extract title, price and seller from a parsed product page.

    Parameters
    ----------
    soup : object exposing ``find(tag, attrs)`` (e.g. a BeautifulSoup tree)
        Parsed HTML of the product page.

    Returns
    -------
    dict
        ``{"Title": str, "Price": int | str, "Seller": str}``. Each field falls
        back to a placeholder string ("Item Name Not Found",
        "Item Price Not Found", "Item Seller Not Found") when its node is
        missing. ``Price`` is an ``int`` (fraction truncated) when a number
        can be read from the price node.
    """
    import re  # local import: only this function needs it

    pt_obj = soup.find("span", {"id": "productTitle"})
    prod_title = pt_obj.text if pt_obj is not None else "Item Name Not Found"

    # Start from the placeholder and only overwrite it when a number is
    # actually found. Previously the placeholder string itself was sliced and
    # fed to float(), raising ValueError whenever the price node was absent.
    price = "Item Price Not Found"
    pr_obj = soup.find("span", {"id": "priceblock_ourprice"})
    if pr_obj is not None:
        # Grab the first numeric run (digits, thousands commas, optional
        # decimals) instead of assuming a fixed two-character currency prefix.
        number = re.search(r"\d[\d,]*(?:\.\d+)?", pr_obj.text)
        if number:
            price = int(float(number.group().replace(",", "")))

    seller_obj = soup.find("a", {"id": "sellerProfileTriggerId"})
    seller = seller_obj.text if seller_obj is not None else "Item Seller Not Found"

    return {"Title": prod_title.strip(),
            "Price": price,
            "Seller": seller.strip()}

In [5]:
def search_offers(driver, asin):
    """Open the all-offers view of ``asin`` on amazon.in and return its HTML.

    The requested URL is printed first. Returns ``driver.page_source`` after
    navigation, or ``None`` if ``driver.get`` raises any ``Exception``.
    """
    offers_suffix = "/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1"
    offers_url = "https://www.amazon.in/dp/" + asin + offers_suffix
    print(offers_url)
    try:
        driver.get(offers_url)
    except Exception:
        # Navigation failed: signal "no page" to the caller.
        return None
    else:
        return driver.page_source

In [6]:
def extract_offer_data(soup, prod_title):
    """Extract one record per ``div#aod-offer`` found in an offers page.

    Parameters
    ----------
    soup : object exposing ``find_all(tag, attrs)`` (e.g. a BeautifulSoup tree)
        Parsed HTML of the offers page.
    prod_title : str
        Title to attach to every offer record.

    Returns
    -------
    list of dict
        ``{"Title": str, "Price": int | str, "Seller": str}`` per offer; an
        empty list when no offer blocks are present. If an offer lacks a
        readable price or a seller link, that field holds the placeholder
        "Item Price Not Found" / "Item Seller Not Found" (the same strings
        ``extract_main_item_data`` uses) instead of aborting the whole page
        with ``AttributeError``.
    """
    all_details = []
    for aod in soup.find_all("div", {"id": "aod-offer"}):
        # Price: walk the two-level lookup defensively; either level may be absent.
        price = "Item Price Not Found"
        price_box = aod.find("div", {"id": "aod-offer-price"})
        whole = (price_box.find("span", {"class": "a-price-whole"})
                 if price_box is not None else None)
        if whole is not None:
            try:
                price = int(float(whole.text.replace(',', '')))
            except ValueError:
                pass  # unparsable text: keep the placeholder

        # Seller: same defensive walk.
        seller = "Item Seller Not Found"
        seller_box = aod.find("div", {"id": "aod-offer-soldBy"})
        seller_link = (seller_box.find("a", {"class": "a-size-small a-link-normal"})
                       if seller_box is not None else None)
        if seller_link is not None:
            seller = seller_link.text

        all_details.append({"Title": prod_title.strip(),
                            "Price": price,
                            "Seller": seller.strip()})
    return all_details

In [7]:
def get_chrome_options():
    """Build the ``ChromeOptions`` used by the scraping workers.

    Returns a fresh ``webdriver.ChromeOptions`` with the command-line flags
    below added in the listed order.
    """
    chrome_opts = webdriver.ChromeOptions()
    for flag in (
        '--headless',
        '--disable-extensions',
        '--disable-gpu',
        "--no-sandbox",
        "--window-size=1920,1080",
        '--disable-dev-shm-usage',
    ):
        chrome_opts.add_argument(flag)
    return chrome_opts

In [17]:
def get_asin_result(q, asin):
    """Worker: scrape one ASIN in its own headless Chrome and report via ``q``.

    Parameters
    ----------
    q : queue-like object with ``put``
        Receives exactly one ``pandas.DataFrame`` per call, built from the
        main-page record (if the page loaded) followed by the offer records.
    asin : str
        Product identifier appended to the amazon.in URLs.

    Notes
    -----
    * The browser is closed in a ``finally`` so a scraping error does not
      leave a Chrome process behind.
    * The DataFrame (possibly empty or partial) is put on ``q`` in an outer
      ``finally`` so a consumer that expects one item per worker does not
      block forever when this worker fails. The original exception still
      propagates afterwards.
    """
    single_result = []
    try:
        options = get_chrome_options()
        driver = webdriver.Chrome('/home/jitesh/Documents/chromedriver/chromedriver', options=options)
        try:
            main_page_data = None
            main_page = search_asin(driver, asin)
            if main_page:
                main_page_soup = BeautifulSoup(main_page, 'html.parser')
                main_page_data = extract_main_item_data(main_page_soup)
                single_result.append(main_page_data)

            offer_page = search_offers(driver, asin)
            if offer_page:
                offer_page_soup = BeautifulSoup(offer_page, 'html.parser')
                # Offers inherit the main page's title when it was scraped.
                title = main_page_data.get("Title") if main_page_data else "No Title"
                single_result.extend(extract_offer_data(offer_page_soup, title))
        finally:
            driver.quit()
    finally:
        q.put(pd.DataFrame(data=single_result))

In [20]:
def trigger_main(all_asins=None):
    """Scrape several ASINs in parallel, one process per ASIN.

    Parameters
    ----------
    all_asins : iterable of str, optional
        ASINs to scrape. Defaults to the eight ASINs originally hard-coded
        here, so ``trigger_main()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-ASIN frames put on the queue by
        ``get_asin_result`` (in completion order). An empty frame is returned
        when ``all_asins`` is empty.

    Notes
    -----
    Prints the total wall-clock time. One process (and therefore one Chrome
    instance, see ``get_asin_result``) is started per ASIN, all at once.
    """
    if all_asins is None:
        all_asins = ['B078SSQZBF',
                     'B078SRMZSN',
                     'B078SRMSJZ',
                     'B078STJDRN',
                     'B078SR4PS2',
                     'B078SNPXPT',
                     'B078ST6XQW',
                     'B078SNN7H3']
    all_asins = list(all_asins)
    if not all_asins:
        # pd.concat raises on an empty list; nothing to scrape anyway.
        return pd.DataFrame()

    all_results = mp.Queue()
    start_time = time.time()
    processes = [mp.Process(target=get_asin_result, args=(all_results, asin))
                 for asin in all_asins]
    for p in processes:
        p.start()

    # Drain the queue BEFORE joining: a child that has put data on a
    # multiprocessing.Queue may not exit until that data is consumed, so
    # join-then-get can deadlock. get() blocks until each worker has put
    # its single item.
    all_dfs = [all_results.get() for _ in processes]
    for p in processes:
        p.join()

    print("Total Time:- {}".format(time.time() - start_time))
    final_result = pd.concat(all_dfs)
    return final_result

In [31]:
def main(driver=None):
    """Sequentially scrape the hard-coded ASINs with a single browser.

    Parameters
    ----------
    driver : selenium WebDriver, optional
        Browser session to use. When omitted, a headless Chrome is created
        with ``get_chrome_options()`` so the function no longer depends on a
        notebook-level ``driver`` variable existing.

    Returns
    -------
    pandas.DataFrame
        One row per main-page record and per offer, in ASIN order.

    Notes
    -----
    The driver is always quit before returning (as the original did at the
    end), now inside ``finally`` so it also happens on error.
    """
    all_asins = ['B078SSQZBF',
                 'B078SRMZSN',
                 'B078SRMSJZ',
                 'B078STJDRN',
                 'B078SR4PS2',
                 'B078SNPXPT',
                 'B078ST6XQW',
                 'B078SNN7H3']
    if driver is None:
        driver = webdriver.Chrome('/home/jitesh/Documents/chromedriver/chromedriver',
                                  options=get_chrome_options())
    full_results = []
    try:
        for asin in all_asins:
            single_result = []
            main_page_data = None
            # search_asin / search_offers take (driver, asin); the driver
            # argument was previously missing, raising TypeError.
            main_page = search_asin(driver, asin)
            if main_page:
                main_page_soup = BeautifulSoup(main_page, 'html.parser')
                main_page_data = extract_main_item_data(main_page_soup)
                single_result.append(main_page_data)

            offer_page = search_offers(driver, asin)
            if offer_page:
                offer_page_soup = BeautifulSoup(offer_page, 'html.parser')
                title = main_page_data.get("Title") if main_page_data else "No Title"
                single_result.extend(extract_offer_data(offer_page_soup, title))
            full_results.extend(single_result)
    finally:
        driver.quit()

    final_result = pd.DataFrame(data=full_results)
    return final_result

In [21]:
# Run the multi-process scrape and keep the combined DataFrame in `res`.
# The URLs printed below come from search_asin / search_offers in the worker
# processes; the final line is the elapsed time printed by trigger_main.
res = trigger_main()

https://www.amazon.in/dp/B078SRMSJZ
https://www.amazon.in/dp/B078SRMZSN
https://www.amazon.in/dp/B078SR4PS2
https://www.amazon.in/dp/B078SNPXPT
https://www.amazon.in/dp/B078ST6XQW
https://www.amazon.in/dp/B078STJDRN
https://www.amazon.in/dp/B078SNN7H3
https://www.amazon.in/dp/B078SSQZBF
https://www.amazon.in/dp/B078SSQZBF/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
https://www.amazon.in/dp/B078SNN7H3/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
https://www.amazon.in/dp/B078SRMZSN/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
https://www.amazon.in/dp/B078STJDRN/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
https://www.amazon.in/dp/B078ST6XQW/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
https://www.amazon.in/dp/B078SR4PS2/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
https://www.amazon.in/dp/B078SRMSJZ/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
https://www.amazon.in/dp/B078SNPXPT/ref=olp_aod_redir_impl1?_encoding=UTF8&aod=1
Total Time:- 127.65690755844116


In [34]:
# Display the results indexed by (Title, Price, Seller) and sorted on that
# index; `res` itself is not modified (the result is not assigned).
res.set_index(["Title", "Price", "Seller"]).sort_index()

Title,Price,Seller
M.G ENTERPRISE Acrylic Feather Soft Hand Knitting Wool Yarn (200 g),363,Cloudtail India
M.G ENTERPRISE Acrylic Feather Soft Hand Knitting Wool Yarn (200 g),363,M.G Enterprise
M.G ENTERPRISE Acrylic Soft Cream Hand Knitting Wool (200 g),313,M.G Enterprise
M.G ENTERPRISE Acrylic Soft Cream Hand Knitting Wool (200 g),327,Cloudtail India
M.G ENTERPRISE Acrylic Soft Iris Hand Knitting Wool (200 g),350,M.G Enterprise
"M.G ENTERPRISE Acrylic Soft Wool Yarn (300 g, Lavender)",490,M.G Enterprise
"M.G ENTERPRISE Feather Soft Black Hand Knitting Wool Yarn, 200 gm",350,M.G Enterprise
"M.G ENTERPRISE Feather Soft Black Hand Knitting Wool Yarn, 200 gm",363,Cloudtail India
"M.G ENTERPRISE Feather Soft Lavender Needle Knitting Acrylic Wool (200 g, Multicolour)",363,Cloudtail India
"M.G ENTERPRISE Feather Soft Lavender Needle Knitting Acrylic Wool (200 g, Multicolour)",363,M.G Enterprise


In [27]:
# Read every line of the links file into the list `data`.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without changing it.
with open("/home/jitesh/Downloads/Amazon links.txt") as read_file:
    data = read_file.readlines()

In [30]:
# Preview the first 100 entries: for each line, keep the text after the last
# "/" with surrounding whitespace removed (presumably the ASIN -- confirm
# against the file's URL format).
[d.split("/")[-1].strip() for d in data][:100]

['B078SSQZBF',
 'B078SRMZSN',
 'B078SRMSJZ',
 'B078STJDRN',
 'B078SR4PS2',
 'B078SNPXPT',
 'B078ST6XQW',
 'B078SNN7H3',
 'B078SQQH59',
 'B078SR4RDX',
 'B078STH72Q',
 'B078STJ4WP',
 'B078SSYKGN',
 'B078ST3VCY',
 'B078SSCBTQ',
 'B08V5DWC23',
 'B08THXQW3M',
 'B07YDV79CL',
 'B07YDTS226',
 'B07YDV1TT7',
 'B08TWSQRMZ',
 'B08TWT2ST7',
 'B08TWTLR18',
 'B092S76MX2',
 'B092S6BCFD',
 'B092S7JWQT',
 'B092S77S5W',
 'B091Z4N58N',
 'B01B5HZ5FG',
 'B08JH4W19Y',
 'B08JH4FG7K',
 'B08JH567XW',
 'B08JDZ8FD6',
 'B08JH4KXTP',
 'B08JH5MMQ7',
 'B08JH4K62D',
 'B08JH3S7H5',
 'B08JH4K318',
 'B08JH4XPWP',
 'B08JH4PFYT',
 'B08JH2WPMF',
 'B08JH4C8KJ',
 'B08JF67V1N',
 'B08JH41VXG',
 'B08JH5PB7J',
 'B08JH3TSXN',
 'B08JH7FJG5',
 'B08JH4MW4H',
 'B08JH4J9YR',
 'B08JH37T8M',
 'B08JH4MKPN',
 'B08JH6QSVH',
 'B08JH2ZKXK',
 'B08JH3KWXP',
 'B08JH4GZ7Q',
 'B08JH53KKL',
 'B08JH45QZX',
 'B08JH3971L',
 'B08JH2P7HH',
 'B08JH5PM7M',
 'B08JH4XNCG',
 'B08JH3LDMT',
 'B08JH33BJD',
 'B08JH3MXFL',
 'B08JH64Z94',
 'B08JH57SNL',
 'B08JH5W3

In [11]:
# Show the CPU count reported by multiprocessing on this machine.
mp.cpu_count()

4

In [None]:
# B091SWN2L9  (stray ASIN note; commented out so this cell does not raise NameError on Run All)