In [1]:
import pandas as pd

from unidecode import unidecode
from pathlib import Path
import os
import sys
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
def crawl_all_categories(timeout=30):
    """Scrape the product categories listed on the Hasaki home page.

    Every ``<a class="text_dmuc">`` anchor found on https://hasaki.vn/ that
    carries an ``href`` becomes one row with three columns:

    * ``'Tên danh mục'`` - the anchor text,
    * ``'Liên kết'``     - the anchor ``href``,
    * ``'Mã danh mục'``  - the last ``-``-separated segment of the link after
      ``.html`` and every ``c`` character have been removed.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        pandas.DataFrame: One row per category anchor (empty when the page
        contains no matching anchors).

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or when the server answers with an HTTP error status.
    """
    response = requests.get("https://hasaki.vn/", timeout=timeout)
    # Fail loudly on an error page instead of silently returning no categories.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    json_categories = []

    for category_link in soup.find_all('a', class_='text_dmuc'):

        link = category_link.get('href')
        if not link:
            # An anchor without a target cannot be crawled later on; skip it
            # rather than raising KeyError for the whole run.
            continue

        json_categories.append({
            'Tên danh mục': category_link.text,
            'Liên kết': link,
            'Mã danh mục': link.replace('.html', '').split('-')[-1].replace('c', ''),
        })

    return pd.json_normalize(json_categories)

In [3]:
def crawl_comments_selenium(product_link, product_id, wait_timeout=10, page_delay=2):
    """Collect the paginated comments of one product with a Chrome WebDriver.

    The product page is opened in a fresh Chrome session, the number of
    comment pages is read from the ``data-max`` attribute of the
    ``.item_next_sort`` element, and every page link (``a[rel='<page>']``) is
    clicked through JavaScript. After each click the ``item_comment``
    elements currently in the DOM are parsed.

    Args:
        product_link: URL of the product page.
        product_id: Value stored under ``'Mã sản phẩm'`` in every comment.
        wait_timeout: Seconds to wait for the ``pagination_comment`` element
            to appear.
        page_delay: Seconds to sleep after clicking a page link so the
            comments can load.

    Returns:
        list[dict]: One dict per comment with the keys ``'Mã sản phẩm'``,
        ``'Ngày'``, ``'Tên khách hàng'``, ``'Số sao đánh giá'`` and
        ``'Nội dung đánh giá'``.

    Raises:
        selenium.common.exceptions.WebDriverException: (including
            ``TimeoutException``) when the page, the pagination block or the
            next button cannot be obtained.
        TypeError, ValueError: When ``data-max`` is missing or not an integer.

    Errors raised while processing a single page are printed and the crawl
    continues with the next page. The browser is always closed, even when an
    exception propagates.
    """
    # Initialize the WebDriver (you might need to specify the path to the ChromeDriver executable)
    driver = webdriver.Chrome()

    comments_of_product = []

    try:
        # Open the web page
        driver.get(product_link)

        # Wait until the pagination is present
        WebDriverWait(driver, wait_timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "pagination_comment"))
        )

        # Get the number of pages from the data-max attribute of the next button
        next_button = driver.find_element(By.CSS_SELECTOR, ".item_next_sort")
        max_page = int(next_button.get_attribute("data-max"))

        for page in range(1, max_page + 1):

            try:
                # Locate the page link by its rel attribute
                page_link = driver.find_element(By.CSS_SELECTOR, f"a[rel='{page}']")
                driver.execute_script("arguments[0].scrollIntoView(true);", page_link)
                driver.execute_script("arguments[0].click();", page_link)  # Use JavaScript to click the link

                # Wait for the page to load comments (adjust the wait time as needed)
                time.sleep(page_delay)

                for comment in driver.find_elements(By.CLASS_NAME, 'item_comment'):

                    # Parse a snapshot of the element's HTML with BeautifulSoup.
                    soup = BeautifulSoup(comment.get_attribute('outerHTML'), 'html.parser')

                    # The rating is read from the inline style of `number_start`:
                    # the text after the first ':' with '%' and ';' removed,
                    # divided by 20 (presumably a width percentage mapped to
                    # a 0-5 star scale - confirm against the page markup).
                    rating_style = soup.find('div', class_='number_start').attrs['style']

                    comments_of_product.append({
                        'Mã sản phẩm': product_id,
                        'Ngày': soup.find('div', class_='timer_comment').text,
                        'Tên khách hàng': soup.find('div', class_='title_comment').text.strip(),
                        'Số sao đánh giá': int(rating_style.split(':')[1].replace('%', '').replace(';', '')) / 20,
                        'Nội dung đánh giá': soup.find('div', class_='content_comment').text,
                    })

            except Exception as e:
                # Best effort: report the failing page and keep crawling.
                print(f"Error clicking page {page}: {e}")

    finally:
        # Close the driver on every exit path so no Chrome process is leaked
        # when the setup steps above raise.
        driver.quit()

    return comments_of_product

In [4]:
def _detail_box_text(soup, box_id):
    """Return the text of the ``ct_box_detail`` div inside the div ``box_id``.

    ``'Không có thông tin'`` is returned when either the outer box or the
    inner detail div is absent from the page.
    """
    box = soup.find('div', id=box_id)
    detail = box.find('div', class_='ct_box_detail') if box is not None else None
    if detail is None:
        return 'Không có thông tin'
    return detail.text


def _parse_inline_comment(comment, product_id):
    """Turn one ``item_comment`` tag of the static page into a comment dict."""
    # Rating: text after the first ':' of the inline style, stripped of '%'
    # and ';', divided by 20 (presumably width percent -> stars; confirm).
    rating_style = comment.find('div', class_='number_start').attrs['style']
    return {
        'Mã sản phẩm': product_id,
        'Ngày': comment.find('div', class_='timer_comment').text,
        'Tên khách hàng': comment.find('div', class_='title_comment').text.strip(),
        'Số sao đánh giá': int(rating_style.split(':')[1].replace('%', '').replace(';', '')) / 20,
        'Nội dung đánh giá': comment.find('div', class_='content_comment').text,
    }


def extract_product(product_link, product_sold, category_id, timeout=30):
    """Download one product page and extract its details and comments.

    Args:
        product_link: URL of the product page.
        product_sold: Sold-count value taken from the category listing; it is
            stored unchanged under ``'Số sản phẩm đã bán'``.
        category_id: Category id stored under ``'Mã danh mục'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        tuple: ``(product, comments)``. ``product`` is a dict with the fixed
        keys filled below plus one key per row of the ``tb_info_sanpham``
        table. ``comments`` is

        * a list parsed from the static HTML when the page reports between
          1 and 10 reviews,
        * the result of ``crawl_comments_selenium`` when it reports more
          than 10 reviews,
        * ``False`` when it reports no reviews.

    Raises:
        requests.exceptions.RequestException: On connection problems or a
            timeout.
        AttributeError, IndexError, ValueError: When one of the mandatory
            page elements (image, SKU, title, price, rating summary, ...) is
            missing or has an unexpected format.
    """
    response = requests.get(product_link, timeout=timeout)

    soup = BeautifulSoup(response.content, 'html.parser')

    product = {}

    product['Liên kết'] = product_link

    product['Ảnh sản phẩm'] = soup.find('img', id='zoom_01').attrs['src']

    # The SKU text is split on ':' and the part after the first colon is kept.
    product['Mã sản phẩm'] = soup.find('span', class_='item-sku').text.split(':')[1].strip()

    product['Tên sản phẩm'] = soup.find('span', class_='product__title').text

    product['Mã danh mục'] = category_id

    product['Số sản phẩm đã bán'] = product_sold

    product['Giá'] = soup.find('span', id='product-final_price').text.replace('\n', '').strip()

    # Market price / saving fields are only read when the price-info block exists.
    if soup.find('div', class_='hasaki-price-info') is not None:

        product['Giá thị trường'] = soup.find('span', id='market_price').text

        product['Tiết kiệm'] = soup.find('span', id='save_money').text

        product['Phần trăm giảm'] = soup.find('span', id='save_money_percent').text

    product['Đánh giá trung bình'] = soup.find('div', class_='txt_numer').text

    product['Số lượt đánh giá'] = soup.find('div', class_='txt_total_nhanxet').text.split(' ')[0]

    product['Số lượt hỏi đáp'] = soup.find('a', id='click_scroll_qa').text.strip(' ').split(' ')[0]

    # Specification table: first cell is the attribute name, second its value.
    info_table = soup.find('table', class_='tb_info_sanpham')
    info_rows = info_table.find_all('tr') if info_table is not None else []

    for info in info_rows:

        td = info.find_all('td')

        if len(td) < 2:
            # Rows without a name/value pair carry no attribute; skip them
            # instead of failing with IndexError.
            continue

        product[td[0].text] = td[1].text

    product['Thành phần sản phẩm'] = _detail_box_text(soup, 'box_thanhphanchinh')

    product['Hướng dẫn sử dụng'] = _detail_box_text(soup, 'box_huongdansudung')

    review_count = int(product['Số lượt đánh giá'])

    if 0 < review_count <= 10:
        # Few reviews: the comments present in the downloaded HTML are used.
        comments_of_product = [
            _parse_inline_comment(comment, product['Mã sản phẩm'])
            for comment in soup.find_all('div', class_='item_comment')
        ]

        return product, comments_of_product

    if review_count > 10:
        # More than ten reviews: page through the comments in a browser.
        comments_of_product = crawl_comments_selenium(product['Liên kết'], product['Mã sản phẩm'])

        return product, comments_of_product

    # No reviews: callers test for this sentinel.
    return product, False

In [5]:
def crawl_all_products_in_category(category_name, category_link, category_id, timeout=30):
    """Crawl every product (and its comments) listed under one category.

    Listing pages are requested as ``{category_link}?p=1``, ``?p=2``, ... and
    the loop stops at the first page that contains no
    ``ProductGridItem__itemOuter`` element. Each product found is passed to
    ``extract_product``. Progress is written to ``sys.stdout``.

    Args:
        category_name: Category name, used only in the progress messages.
        category_link: URL of the category listing.
        category_id: Category id forwarded to ``extract_product``.
        timeout: Seconds to wait for each listing-page response.

    Returns:
        tuple[list, list]: ``(json_products, json_comments)`` - the product
        dicts and one flat list with the comment dicts of all products.

    Raises:
        requests.exceptions.RequestException: On connection problems or a
            timeout while fetching a listing page.
        Any exception raised by ``extract_product`` propagates unchanged.
    """
    page = 1

    json_products = []

    json_comments = []

    total_products = 0

    while True:

        response = requests.get(f"{category_link}?p={page}", timeout=timeout)

        soup = BeautifulSoup(response.content, 'html.parser')

        products = soup.find_all('div', class_='ProductGridItem__itemOuter')

        if len(products) == 0:
            # An empty listing page marks the end of the category.
            sys.stdout.write(f"\nAll products in the '{category_name}' category have been successfully retrieved\n")
            sys.stdout.write('\n')

            break

        for product in products:

            product_link = product.find('a', class_='v3_thumb_common_sp').attrs['href']

            # The sold counter is optional; 0 is recorded when it is absent.
            item_count_by = product.find('span', class_='item_count_by')
            if item_count_by is not None:
                product_sold = item_count_by.text.strip()
            else:
                product_sold = 0

            dict_product, comments_of_product = extract_product(product_link, product_sold, category_id)

            json_products.append(dict_product)

            # `False` is the "no comments" sentinel returned by extract_product.
            if comments_of_product is not False:
                # Extend the flat list directly; flattening afterwards with
                # sum(list_of_lists, []) copies the accumulator on every step.
                json_comments.extend(comments_of_product)

            total_products += 1

            sys.stdout.write(f'Category: {category_name} | Page : {page} | Total: {total_products}\r')

        page += 1

    return json_products, json_comments

In [25]:
if __name__ == '__main__':

    # Crawl the category list, then every category, writing one folder with
    # `products.csv` and `comments.csv` per category under ../data/raw.

    # DataFrame.to_csv does not create missing parent folders, so make sure
    # the output directory exists before the first file is written.
    Path('../data/raw').mkdir(parents=True, exist_ok=True)

    df_categories = crawl_all_categories()
    df_categories.to_csv('../data/raw/hasaki_categories.csv', encoding='utf-8-sig', index=False)

    for index, category in df_categories.iterrows():

        category_name = category['Tên danh mục']
        category_link = category['Liên kết']
        category_id = category['Mã danh mục']

        json_products, json_comments = crawl_all_products_in_category(category_name, category_link, category_id)

        df_products = pd.json_normalize(json_products)

        df_comments = pd.json_normalize(json_comments)

        # Folder name: lower-cased category name, transliterated to ASCII,
        # with spaces replaced by underscores.
        directory = unidecode(category_name.lower()).replace(' ','_')

        directory_path = Path(f'../data/raw/{directory}')

        directory_path.mkdir(parents=True, exist_ok=True)

        df_products.to_csv(directory_path / 'products.csv', encoding='utf-8-sig', index=False)

        df_comments.to_csv(directory_path / 'comments.csv', encoding='utf-8-sig', index=False)

In [17]:
def merged_csv(object):
    """Concatenate ``<object>.csv`` from every sub-directory of ``../data/raw``.

    Sub-directories are visited in sorted name order so the row order of the
    result does not depend on the order in which the filesystem lists them.
    A sub-directory is skipped when it has no ``<object>.csv`` or when that
    file has no columns to parse (pandas raises ``EmptyDataError`` for such
    a file, e.g. one produced by writing an empty DataFrame).

    Args:
        object: Base name of the CSV file to merge, e.g. ``'products'``.
            (The name shadows the builtin; it is kept for compatibility
            with existing callers.)

    Returns:
        pandas.DataFrame: All readable files stacked with a fresh integer
        index, or an empty DataFrame when nothing could be read.

    Raises:
        FileNotFoundError: When ``../data/raw`` itself does not exist.
    """
    raw_dir = Path('../data/raw')

    frames = []

    for directory in sorted(os.listdir(raw_dir)):

        if not os.path.isdir(raw_dir / directory):
            continue

        path = raw_dir / directory / f'{object}.csv'

        if not path.is_file():
            continue

        try:
            frames.append(pd.read_csv(path))
        except pd.errors.EmptyDataError:
            # Nothing to merge from a file without any columns.
            continue

    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, ignore_index=True)

In [18]:
# Merge the per-category CSV files into one file per object type.
objects = ['products', 'comments']

# The loop variable is not called `object`: at notebook top level that would
# rebind the builtin `object` for every cell executed afterwards.
for object_name in objects:
    df_merged = merged_csv(object_name)
    df_merged.to_csv(f'../data/raw/hasaki_{object_name}.csv', encoding='utf-8-sig', index=False)