In [1]:
#IMPORT REQUIRED LIBRARIES
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from time import sleep

import threading
from queue import Queue

In [2]:
#FUNCTION TO OPEN N-BROWSERS BASED ON NUM OF THREADS
def open_multi_browsers(n_page):
    """Start ``n_page`` Chrome sessions and return them as a list.

    Args:
        n_page: Number of browser instances to launch (one per worker thread).

    Returns:
        list: The ``webdriver.Chrome`` instances, in creation order. Empty when
        ``n_page`` is zero or negative.
    """
    return [webdriver.Chrome() for _ in range(n_page)]

#FUNCTION TO LOAD LINK - USE ON load_multi_browsers
def load_multi_pages(driver, n):
    """Navigate ``driver`` to listing page number ``n`` of the book category.

    Args:
        driver: Browser session to drive.
        n: Page number substituted into the ``page`` query parameter.
    """
    page_url = f'https://tiki.vn/sach-truyen-tieng-viet/c316?page={n}'
    driver.maximize_window()
    driver.get(page_url)
    # Fixed pause after navigation before the caller touches the page.
    sleep(3)

#FUNCTION TO LOAD THREADING OF MULTI BROWSERS
def load_multi_browsers(drivers, idx_page):
    """Load one listing page per browser concurrently and wait until all are done.

    Each ``(driver, page)`` pair from ``zip(drivers, idx_page)`` is handed to
    ``load_multi_pages`` on its own thread. Extra drivers or extra page numbers
    beyond the shorter sequence are ignored.

    The threads are joined before returning so that callers never start
    scraping a driver that is still in the middle of ``driver.get``; a single
    WebDriver session must not be used from two threads at once.

    Args:
        drivers: Browser sessions to navigate.
        idx_page: Listing page numbers, paired positionally with ``drivers``.
    """
    threads = []
    for driver, page in zip(drivers, idx_page):
        t = threading.Thread(target = load_multi_pages, args = (driver, page))
        t.start()
        threads.append(t)

    # Block until every navigation has finished instead of relying on the
    # caller sleeping "long enough".
    for t in threads:
        t.join()

#FUNCTION TO TAKE DATA FROM THREADING AND SAVE IT ON QUEUE - USE ON RunInParallel
def _find_text_or_nan(root, css_selector):
    """Return the text of the first element under ``root`` matching ``css_selector``, or NaN if none exists."""
    try:
        return root.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return np.nan


def _collect_product_links(driver, timeout = 10):
    """Return the product hrefs on the listing page currently loaded in ``driver``.

    Waits up to ``timeout`` seconds for the first product link to appear.
    ``find_elements`` returns an empty list rather than raising, so the wait
    has to happen up front. Returns an empty list when nothing shows up in time.
    """
    selector = '.style__ProductLink-sc-7xd6qw-2.fHwskZ.product-item'
    try:
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
    except TimeoutException:
        print('No product links found')
        return []
    hrefs = [elem.get_attribute('href') for elem in driver.find_elements(By.CSS_SELECTOR, selector)]
    # Drop anchors without an href so driver.get() is never called with None.
    return [href for href in hrefs if href]


def _scrape_detail_widgets(driver):
    """Read the titled widgets of a product page (details, description, seller).

    Returns a dict with keys ``info``, ``describe``, ``seller``, ``seller_star``,
    ``seller_reviews_quantity`` and ``seller_follow``. Every key starts as NaN,
    so a missing widget or missing sub-element leaves NaN instead of an
    unbound variable or a value carried over from the previous product.
    """
    fields = dict.fromkeys(
        ('info', 'describe', 'seller', 'seller_star', 'seller_reviews_quantity', 'seller_follow'),
        np.nan,
    )
    widgets = driver.find_elements(By.CSS_SELECTOR, '.WidgetTitle__WidgetContainerStyled-sc-1ikmn8z-0.iHMNqO')
    for widget in widgets:
        try:
            title = widget.find_element(By.CSS_SELECTOR, '.WidgetTitle__WidgetTitleStyled-sc-1ikmn8z-1.eaKcuo').text
            print(title)
            if title == 'Thông tin chi tiết':
                info_rows = widget.find_elements(By.CSS_SELECTOR, '.WidgetTitle__WidgetContentStyled-sc-1ikmn8z-2.jMQTPW')
                fields['info'] = [row.text.split('\n') for row in info_rows]
                print('Success collect info')
            elif title == 'Mô tả sản phẩm':
                fields['describe'] = widget.find_element(By.CSS_SELECTOR, '.style__Wrapper-sc-13sel60-0.dGqjau.content').text
                print('Success collect describe')
            elif title == 'Thông tin nhà bán':
                fields['seller'] = widget.find_element(By.CSS_SELECTOR, '.seller-name').text.split(' ')[0]
                review_elem = widget.find_element(By.CSS_SELECTOR, '.item.review')
                fields['seller_star'] = review_elem.find_element(By.CSS_SELECTOR, '.title').text
                fields['seller_reviews_quantity'] = review_elem.find_element(By.CSS_SELECTOR, '.sub-title').text
                fields['seller_follow'] = widget.find_element(By.CSS_SELECTOR, '.item.normal .title').text
                print('Succes collect seller info')
        except NoSuchElementException:
            # Best effort: keep whatever was collected for this widget so far.
            print('PASS')
    return fields


def _scrape_product(driver, prod_link):
    """Open ``prod_link`` in ``driver`` and return its 12 features as a list.

    Order: category, img, price, discount, sale_quantities, rating, info,
    describe, seller, seller_star, seller_reviews_quantity, seller_follow.
    Any feature that cannot be located is NaN.
    """
    driver.get(prod_link)
    sleep(2)
    driver.maximize_window()

    # Scroll down in small steps so content further down the page gets a chance to load.
    scroll_iterations = 10
    scroll_amount = 300
    scroll_interval = 0.2
    for _ in range(scroll_iterations):
        driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_amount)
        sleep(scroll_interval)

    # Click the "more" button when the page has one.
    try:
        more_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn-more')))
        more_button.click()
    except TimeoutException:
        print('Not btn-more')

    # find_elements returns [] (it never raises) when nothing matches, so the
    # "missing" case has to be detected by emptiness.
    category_elems = driver.find_elements(By.CSS_SELECTOR, '.Breadcrumb__Wrapper-sc-1r2fjia-0.gsoENx .breadcrumb-item')
    category = [elem.text for elem in category_elems] if category_elems else np.nan

    try:
        img_frame = driver.find_element(By.CSS_SELECTOR, '.image-frame')
        srcset = img_frame.find_element(By.TAG_NAME, 'img').get_attribute('srcset')
        # get_attribute returns None when the attribute is absent.
        img = srcset.split(' ')[0] if srcset else np.nan
    except NoSuchElementException:
        img = np.nan

    price = _find_text_or_nan(driver, '.product-price__current-price')
    discount = _find_text_or_nan(driver, '.product-price__discount-rate')
    sale_quantities = _find_text_or_nan(driver, '.styles__StyledQuantitySold-sc-1swui9f-3.bExXAB')
    rating = _find_text_or_nan(driver, '.styles__StyledReview-sc-1swui9f-1.dXPbue')

    widgets = _scrape_detail_widgets(driver)

    return [
        category, img, price, discount, sale_quantities, rating,
        widgets['info'], widgets['describe'], widgets['seller'],
        widgets['seller_star'], widgets['seller_reviews_quantity'], widgets['seller_follow'],
    ]


def get_data(driver, que):
    """Scrape every product linked from the listing page loaded in ``driver``.

    Collects the product links on the current page, visits each one, and puts
    a single list of per-product feature lists onto ``que``.

    Args:
        driver: Browser session already showing a listing page.
        que: Queue that receives one item: a list with one 12-element feature
            list per product (see ``_scrape_product`` for the order).

    The put happens in a ``finally`` block, so rows scraped before an
    unexpected error are still delivered; the error itself still propagates
    in the worker thread.
    """
    page_prod_features = []
    try:
        for prod_link in _collect_product_links(driver):
            page_prod_features.append(_scrape_product(driver, prod_link))
    finally:
        que.put(page_prod_features)

#FUNCTION TO RUN THREADING OF TAKE DATA AND RETURN IN A LIST 
def runInParallbel(func, drivers):
    """Run ``func(driver, que)`` on one thread per driver and merge the outputs.

    Args:
        func: Callable taking ``(driver, que)``; it is expected to put an
            iterable of rows onto ``que``.
        drivers: Objects handed to ``func``, one per thread.

    Returns:
        list: All rows from every queue item, flattened into one list. The
        order follows the order in which workers put their items.
    """
    que = Queue()
    workers = []

    for drv in drivers:
        print('--Running--')
        worker = threading.Thread(target = func, args = (drv, que))
        worker.start()
        workers.append(worker)

    # Wait for every worker before draining the queue.
    for worker in workers:
        worker.join()

    merged = []
    while que.qsize() > 0:
        merged.extend(que.get())

    return merged

In [None]:
#CRAWL LISTING PAGES 1..last_page, n_page PAGES AT A TIME (ONE BROWSER + THREAD PER PAGE)
n_page = 5
last_page = 50
feature_columns = ['category', 'img', 'price', 'discount', 'sale_quantities', 'rating', 'info', 'describe', 'seller', 'seller_star', 'seller_reviews_quantity', 'seller_follow']

drivers = open_multi_browsers(n_page)
idx_page = [i for i in range(1, n_page + 1)]
all_rows = []
try:
    while idx_page[0] <= last_page:
        # Only use as many browsers as there are pages left in this batch, so a
        # final partial batch never re-scrapes whatever an idle browser still shows.
        batch_pages = [page for page in idx_page if page <= last_page]
        batch_drivers = drivers[:len(batch_pages)]

        load_multi_browsers(batch_drivers, batch_pages)
        sleep(5)
        all_rows.extend(runInParallbel(get_data, batch_drivers))

        # Advance by the batch size (not a hard-coded 5) so changing n_page
        # neither skips nor repeats pages.
        idx_page = [page + n_page for page in idx_page]
finally:
    # Build the frame once from all rows (no quadratic concat, unique index),
    # and do it here so an interrupted crawl still leaves the rows collected
    # so far in all_data.
    all_data = pd.DataFrame(all_rows, columns = feature_columns)
    # Close every browser so Chrome processes do not pile up across re-runs.
    for driver in drivers:
        driver.quit()

In [6]:
# Save the crawled product table to 'book_data.csv' (relative path, so it lands in the
# current working directory). to_csv is called with its defaults, which means the
# DataFrame index is written as the first, unnamed column.
all_data.to_csv('book_data.csv')