In [32]:

import queue
import threading
import time

import pandas
from lxml import html
from selenium.webdriver import Chrome, Edge
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager


def GetChromeBrowser(isHeadless=True):
    """Start a Chrome WebDriver session and return it.

    The chromedriver binary is resolved with ``ChromeDriverManager().install()``.
    Audio is muted, notifications are disabled, and the ``performance`` log
    is requested at level ``ALL``.

    Args:
        isHeadless: When true, ``--headless`` is added to the browser arguments.

    Returns:
        The ``Chrome`` driver, or ``None`` if start-up raised (the error is
        printed in that case).
    """
    opt = Options()
    if isHeadless:
        opt.add_argument("--headless")
    opt.add_argument("--mute-audio")
    opt.add_argument("--disable-notifications")
    # Set the logging preference on the options object that is actually handed
    # to Chrome. Writing it into the shared DesiredCapabilities.CHROME dict
    # never reached the driver and mutated process-wide state.
    opt.set_capability("goog:loggingPrefs", {"performance": "ALL"})
    try:
        return Chrome(service=Service(ChromeDriverManager().install()), options=opt)
    except Exception as err:
        # Best effort: report the failure and let the caller deal with None.
        print(err)
        return None


def GetEdgeBrowser(isHeadless=True):
    """Start a Microsoft Edge WebDriver session and return it.

    The msedgedriver binary is resolved with
    ``EdgeChromiumDriverManager().install()``. Audio is muted and
    notifications are disabled.

    Args:
        isHeadless: When true, ``--headless`` is added to the browser arguments.

    Returns:
        The ``Edge`` driver, or ``None`` if start-up raised (the error is
        printed in that case).
    """
    # The module-level ``Service`` name is the Chrome service class; Edge has
    # its own, imported locally so this function stays self-contained.
    from selenium.webdriver.edge.service import Service as EdgeService

    options = EdgeOptions()
    if isHeadless:
        options.add_argument("--headless")
    options.add_argument("--mute-audio")
    options.add_argument("--disable-notifications")

    try:
        return Edge(
            service=EdgeService(EdgeChromiumDriverManager().install()),
            options=options,
        )
    except Exception as err:
        # Best effort: report the failure and let the caller deal with None.
        print(err)
        return None


# Scraped product records: Scrapper puts one dict per successfully parsed page.
resultQ = queue.Queue()
# Failed jobs: WorkerLoop puts {'url': ...} here when Scrapper raises.
errorQ = queue.Queue()


def Scrapper(driver, url):
    """Load a product page, extract its fields and put them on ``resultQ``.

    The page is opened with ``driver``, a series of best-effort waits gives
    the dynamic content time to render, and the final page source is parsed
    with lxml. The resulting dict has the keys ``title``, ``price``,
    ``description_0``, ``available``, ``images`` and ``product`` (the URL),
    plus one key per header found in the product-overview table.

    Args:
        driver: A Selenium WebDriver used to load and interact with the page.
        url: Address of the product page to scrape.

    Returns:
        None. The record is delivered through ``resultQ``.
    """
    driver.get(url)

    titleXpath = '//div[@id="centerCol"]/descendant::span[@id="productTitle"]/text()'
    priceXpath = '//div[@id="centerCol"]/descendant::span[@class="a-price-whole"]/text()'

    images = '//li[@data-csa-c-action="image-block-main-image-hover"]/descendant::img/@src'
    headerXpath = '//div[@id="productOverview_feature_div"]//*[not(self::style| self::script)]/descendant::tr/td[1]/span/text()'
    valueXpath = '//div[@id="productOverview_feature_div"]//*[not(self::style| self::script)]/descendant::tr/td[2]/span/text()'
    moreButton = "//*[contains(text(), 'About this item')]"
    moreInfo = '//div[@id="featurebullets_feature_div"]/descendant::*[not(self::style | self::script)]/text()'
    available = '//div[@id="availabilityInsideBuyBox_feature_div"]/descendant::*[not(self::style | self::script)]/text()'

    def wait_quietly(timeout, condition):
        """Wait up to ``timeout`` seconds; return the result or None on any failure."""
        try:
            return WebDriverWait(driver, timeout).until(condition)
        except Exception:
            # Waiting is only a courtesy to the page; parsing proceeds regardless.
            return None

    # NOTE(review): several XPaths below end in text() or @src, so they select
    # text/attribute nodes rather than elements. Confirm that Selenium can
    # actually satisfy these waits with such locators.
    wait_quietly(25, EC.visibility_of_all_elements_located((By.XPATH, images)))

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight * .2);")

    button = wait_quietly(10, EC.presence_of_element_located((By.XPATH, moreButton)))
    if button is not None:
        try:
            button.click()
        except Exception:
            # The click is optional; a failure here must not abort the scrape.
            pass

    for xpath in (titleXpath, priceXpath, moreInfo):
        wait_quietly(5, EC.visibility_of_all_elements_located((By.XPATH, xpath)))

    doc = html.fromstring(driver.page_source)

    d_data = {
        'title': '\n'.join(doc.xpath(titleXpath)),
        'price': '\n'.join(doc.xpath(priceXpath)),
        'description_0': '\n'.join(doc.xpath(moreInfo)),
        'available': '\n'.join(doc.xpath(available)),
    }

    # Pair overview headers with their values positionally. zip stops at the
    # shorter list, so a missing value cell no longer raises IndexError and
    # discards the whole record.
    headers = doc.xpath(headerXpath)
    values = doc.xpath(valueXpath)
    for head, val in zip(headers, values):
        d_data[head] = val

    d_data['images'] = '\n'.join(doc.xpath(images))
    d_data['product'] = url
    resultQ.put(d_data)


# Pending product URLs: filled once at module level, consumed by WorkerLoop.
jobs = queue.Queue()


def WorkerLoop(progress_bar: tqdm):
    """Scrape URLs from ``jobs`` with a dedicated browser until none remain.

    Each URL is passed to ``Scrapper``; if that raises, ``{'url': url}`` is
    put on ``errorQ`` instead. The progress bar is advanced once per URL,
    whether it succeeded or failed, and the worker pauses one second between
    pages.

    Args:
        progress_bar: tqdm bar that is updated after every processed URL.
    """
    # Swap in GetChromeBrowser(False) here to scrape with Chrome instead.
    driver = GetEdgeBrowser(False)
    try:
        while True:
            # get_nowait avoids the empty()/get() race: with several workers,
            # another thread may take the last URL between the check and the
            # get, leaving this thread blocked forever.
            try:
                url = jobs.get_nowait()
            except queue.Empty:
                break
            try:
                Scrapper(driver, url)
            except Exception:
                errorQ.put({'url': url})
            time.sleep(1)
            progress_bar.update()
    finally:
        # Release the browser even if the loop ends unexpectedly; the driver
        # is None when browser start-up failed.
        if driver is not None:
            driver.quit()


# Input spreadsheet (inside amazon_files/) that lists the product URLs.
i = 'amazon.csv'
path = f'amazon_files/{i}'

# Choose the reader from the file extension; anything that is not .csv is
# handed to the Excel reader.
if i.endswith('.csv'):
    data = pandas.read_csv(path)
else:
    data = pandas.read_excel(path)

# Column of the input sheet that holds the product page links.
product_column = 'a-link-normal href'
urls = list(data[product_column])

# Queue every URL before any worker starts, so the workers only ever consume.
for url in urls:
    jobs.put(url)

threads = []
pbar = tqdm(total=len(urls))

# Six workers, each running its own browser instance.
for _ in range(6):
    thread = threading.Thread(target=WorkerLoop, args=(pbar,))
    thread.start()
    threads.append(thread)

for t in threads:
    t.join()

# All workers are done; flush and release the progress bar.
pbar.close()


def qToDf(q):
    """Build a DataFrame from the items in queue ``q`` without consuming them.

    The queue is emptied into a list, every item is put back in its original
    order, and the list is turned into a ``pandas.DataFrame``.

    Args:
        q: A ``queue.Queue`` whose items are records accepted by
            ``pandas.DataFrame`` (dicts in this file).

    Returns:
        A DataFrame with one row per queued item.
    """
    drained = []
    while q.qsize() > 0:
        drained.append(q.get())

    # Put everything back so the queue can be read again later.
    for item in drained:
        q.put(item)

    return pandas.DataFrame(drained)


# Strip only the final extension from the input name; joining every
# dot-separated piece would also remove dots inside the name itself.
output_stem = i.rsplit(".", 1)[0]

# Write the scraped records and the failed URLs to separate CSV files.
df = qToDf(resultQ)
df.to_csv(f'{output_stem}_scrapped_result.csv', index=False)
qToDf(errorQ).to_csv(f'{output_stem}_scrapped_error.csv', index=False)




100%|██████████| 67/67 [03:09<00:00,  2.82s/it]

  2%|▏         | 1/60 [00:28<27:45, 28.23s/it][A
  3%|▎         | 2/60 [00:37<16:32, 17.11s/it][A
  5%|▌         | 3/60 [00:37<08:53,  9.35s/it][A
  7%|▋         | 4/60 [00:38<05:32,  5.93s/it][A
  8%|▊         | 5/60 [00:44<05:29,  5.99s/it][A
 10%|█         | 6/60 [00:45<03:59,  4.44s/it][A
 12%|█▏        | 7/60 [00:46<02:47,  3.16s/it][A
 13%|█▎        | 8/60 [00:47<02:17,  2.65s/it][A
 15%|█▌        | 9/60 [00:51<02:36,  3.06s/it][A
 17%|█▋        | 10/60 [00:54<02:27,  2.95s/it][A
 18%|█▊        | 11/60 [00:55<01:54,  2.34s/it][A
 20%|██        | 12/60 [00:58<02:02,  2.54s/it][A
 22%|██▏       | 13/60 [01:01<02:11,  2.79s/it][A
 23%|██▎       | 14/60 [01:05<02:16,  2.97s/it][A
 25%|██▌       | 15/60 [01:06<01:45,  2.35s/it][A
 27%|██▋       | 16/60 [01:08<01:43,  2.35s/it][A
 28%|██▊       | 17/60 [01:11<01:45,  2.46s/it][A
 30%|███       | 18/60 [01:12<01:25,  2.03s/it][A
 32%|███▏      | 19/60 [01:12<01:00,  1.4