In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
import pandas as pd
from string import punctuation
from nltk.tokenize import MWETokenizer
from nltk import ngrams
import itertools
import os
from random import randint
from fake_useragent import UserAgent

# Function

#### Search url

In [2]:
def get_url(driver, url):
    """Open ``url`` in the browser and scroll down the whole page.

    After navigating, waits (up to 60 s) for the ``div#root`` element to be
    present, then scrolls from the top to the bottom of the document in
    2-pixel steps. If the document height has grown once the first pass is
    finished, the newly added part is scrolled through as well. Finally the
    function sleeps for 10 seconds.

    Parameters:
        driver: Selenium WebDriver instance used for navigation and scripting.
        url: Address of the page to open.
    """
    driver.get(url)

    root_locator = (By.CSS_SELECTOR, 'div[id="root"]')
    WebDriverWait(driver, 60).until(EC.presence_of_element_located(root_locator))

    def scroll_through(start, stop):
        # Move the viewport step by step instead of jumping straight to `stop`.
        for offset in range(start, stop, 2):
            driver.execute_script("window.scrollTo(0, {});".format(offset))

    initial_height = int(driver.execute_script("return document.body.scrollHeight"))
    scroll_through(1, initial_height)

    # The page may have become taller while scrolling; cover the extra part too.
    grown_height = int(driver.execute_script("return document.body.scrollHeight"))
    if grown_height > initial_height:
        scroll_through(initial_height, grown_height)
    sleep(10)

In [3]:
def search_product(driver, keyword):
    """Search Lazada for ``keyword`` and collect the listed products.

    The keyword is lowercased, its words are joined with ``-`` and inserted
    into the Lazada tag URL (official stores only). The result page is loaded
    with ``get_url`` and the product grid is parsed with BeautifulSoup.

    Parameters:
        driver: Selenium WebDriver instance.
        keyword: Product name to search for.

    Returns:
        A list of dicts, one per product, with the keys ``'url'``,
        ``'Product name'`` and ``'Product sold'`` (the sold text with
        ``' Đã bán'`` and commas removed, or ``0`` when no sold count is shown).

    Raises:
        IndexError: if the image or sold-count containers found on the page
            are fewer than the product links.
    """
    keyword = keyword.lower()

    # split() without an argument ignores repeated/trailing spaces, so the
    # slug never contains empty segments such as "iphone--13-".
    url = 'https://www.lazada.vn/tag/%s//?service=official'%('-'.join(keyword.split()))
    get_url(driver, url)

    element = WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-qa-locator="general-products"]')))
    html_of_interest = driver.execute_script('return arguments[0].innerHTML', element)
    soup = BeautifulSoup(html_of_interest, 'lxml')

    # Every other anchor is kept — presumably each product card renders two
    # anchors pointing at the same product; verify against the live markup.
    raw_links = soup.select('div[data-qa-locator="product-item"] > div > div > div > div > a')[::2]

    # Run the page-wide selections once instead of once per product.
    product_images = soup.select('div[data-qa-locator="product-item"] > div > div > div > div > a > div > img')
    sold_containers = soup.select('div[data-qa-locator="product-item"] > div > div > div > div[class="_6uN7R"]')

    list_links = []
    for num_link, anchor in enumerate(raw_links):
        # Sold count: the un-classed inner span holds the "<n> Đã bán" text.
        sold_spans = sold_containers[num_link].select('span > span:not([class])')
        if sold_spans:
            product_sold = sold_spans[0].text.replace(' Đã bán', '').replace(',', '')
        else:
            product_sold = 0

        list_links.append({
            # Product URL (the href is protocol-relative in the page source)
            'url': 'https:' + anchor.attrs['href'],
            # Product name, taken from the thumbnail's alt text
            'Product name': product_images[num_link].attrs['alt'],
            'Product sold': product_sold,
        })

    return list_links

#### Tiền xử lí dữ liệu

In [4]:
def preprocessing(text):
    """Normalise a string for keyword matching.

    The text is lowercased, every character listed in ``string.punctuation``
    is turned into a space, and runs of whitespace are collapsed into single
    spaces (leading/trailing whitespace is dropped).

    Parameters:
        text: Input string.

    Returns:
        The normalised string.
    """
    # One translation table does the work of replacing each punctuation mark in turn.
    punctuation_to_space = str.maketrans({mark: ' ' for mark in punctuation})
    spaced = text.lower().translate(punctuation_to_space)

    return ' '.join(spaced.split())

#### Lọc sản phẩm không liên quan

In [5]:
def remove_accessory(list_links, keyword):
    """Drop accessory listings (cases, covers, screen protectors, ...).

    Step 1 strips brand names from each product name: model phrases are
    extracted from the preprocessed name, looked up in ``smartphones.csv``,
    and the brand of every matching model is removed from the lowercased
    name. The dicts in ``list_links`` are updated in place (``'Product name'``).

    Step 2 removes every product whose (lowercased) name contains one of the
    lines of ``accessory_keyword.txt``.

    Parameters:
        list_links: List of product dicts with a ``'Product name'`` key.
        keyword: Search keyword passed on to ``extract_similar_keywords``.

    Returns:
        A new list containing only the products that matched no accessory keyword.
    """
    # Remove brand names from the product name
    brand = pd.read_csv(r'..\smartphones.csv')[['brand_name', 'model']]
    # Lowercase the model column once rather than for every candidate phrase.
    model_lower = brand['model'].str.lower()

    for link in list_links:
        text = link['Product name']
        cleaned = text.lower()
        brand_found = False

        for ex in extract_similar_keywords(preprocessing(text), keyword):
            matching_brands = brand.loc[model_lower == ex, 'brand_name']
            if not matching_brands.empty:
                check_brand = matching_brands.iloc[0].lower()
                # Accumulate on `cleaned` so an earlier brand removal is not
                # undone when another model phrase matches.
                cleaned = cleaned.replace(check_brand, '')
                brand_found = True

        if brand_found:
            link['Product name'] = cleaned

    with open(r'..\accessory_keyword.txt', encoding='utf-8') as f:
        # Skip blank lines: an empty keyword is a substring of every name and
        # would exclude all products.
        accessory_keyword = [line for line in f.read().splitlines() if line.strip()]

    # Drop products whose name contains any accessory keyword
    return [
        link for link in list_links
        if not any(acc_kw in link['Product name'].lower() for acc_kw in accessory_keyword)
    ]

In [6]:
def extract_similar_keywords(text, keyword):
    """Extract the model phrases related to ``keyword`` that occur in ``text``.

    Model names are read from the ``model`` column of ``smartphones.csv``;
    those containing ``keyword`` are split into word n-grams (n >= 2). An
    n-gram is kept when it occurs in ``text`` as a run of consecutive
    whitespace-separated tokens. Each kept phrase is counted in ``text`` and
    the count is reduced by one for every longer kept phrase that contains
    it, so a phrase that only appears inside longer phrases is dropped.

    NOTE(review): n starts at 2, so single-word phrases are never returned.

    Parameters:
        text: Text to search; callers pass the output of ``preprocessing``.
        keyword: Keyword used to select candidate model names (compared
            against the lowercased model names).

    Returns:
        A list of phrases (space-joined strings), longest first.
    """
    # Raw string: '\s' is not a valid escape sequence in a normal literal.
    smartphone_name = pd.read_csv(r'..\smartphones.csv')['model']
    similar_keywords_list = [name.lower() for name in smartphone_name if keyword in name.lower()]
    extracted_keyword_list = []
    main_topic_keyword_list = []

    # Tokenise the text once; it is the same for every candidate n-gram.
    text_tokens = text.split()

    for similar_keyword in similar_keywords_list:
        name_tokens = similar_keyword.split()

        # n-grams longer than the number of tokens do not exist, so the token
        # count (not the character count) bounds n.
        for n in range(2, len(name_tokens) + 1):
            for grams in ngrams(name_tokens, n):
                keyword_ngram = list(grams)

                # The tokenizer merges a matching run of tokens into one
                # '_'-joined token, which is then looked up in the result.
                tokenizer = MWETokenizer()
                tokenizer.add_mwe(keyword_ngram)
                phrase_list = tokenizer.tokenize(text_tokens)

                if '_'.join(keyword_ngram) in phrase_list:
                    extracted_keyword_list.append(keyword_ngram)

    # Sort so that groupby can drop duplicate n-grams.
    extracted_keyword_list.sort()
    extracted_keyword_list = [l for l, _ in itertools.groupby(extracted_keyword_list)]

    freq_extracted_keyword = {}
    for l in extracted_keyword_list:
        kw = ' '.join(l)
        freq_extracted_keyword[kw] = text.count(kw)

    # Longest phrases first (stable sort keeps the alphabetical order among ties).
    freq_extracted_keyword = dict(
        sorted(freq_extracted_keyword.items(), key=lambda item: len(item[0]), reverse=True)
    )

    freq_key_list = list(freq_extracted_keyword.keys())
    check_freq_dict = freq_extracted_keyword.copy()

    for key in freq_key_list:
        for check_key in freq_key_list:
            # Discount one occurrence of `key` for each other phrase that
            # contains it and still has a positive count.
            if (key != check_key) and (key in check_key) and (check_freq_dict[check_key] > 0):
                check_freq_dict[key] -= 1

    for key, value in check_freq_dict.items():
        if value > 0:
            main_topic_keyword_list.append(key)

    return main_topic_keyword_list

In [7]:
def filter_links(list_links, keyword):
    """Remove accessories and keep only the products that match ``keyword``.

    Accessory listings are dropped with ``remove_accessory``; a remaining
    product is kept when ``keyword`` itself is among the phrases that
    ``extract_similar_keywords`` finds in its preprocessed name.

    Parameters:
        list_links: Product dicts as returned by ``search_product``.
        keyword: Product name being searched for; case and extra whitespace
            are ignored.

    Returns:
        The list of product dicts whose name matches the keyword.
    """
    # Model names and product names are compared in lowercase with single
    # spaces, so the keyword has to be brought to the same form; otherwise a
    # keyword typed with capitals never matches anything.
    keyword = ' '.join(keyword.lower().split())

    # Remove accessories
    list_links = remove_accessory(list_links, keyword)

    filter_video_url = []
    for link in list_links:
        pre_text = preprocessing(link['Product name'])

        if keyword in extract_similar_keywords(pre_text, keyword):
            filter_video_url.append(link)

    return filter_video_url

#### Lấy thông tin và reviews sản phẩm

In [8]:
def get_soup(driver, url=None):
    """Return a BeautifulSoup of the page's ``div#root`` inner HTML.

    Parameters:
        driver: Selenium WebDriver instance.
        url: Optional address to open first (via ``get_url``); when omitted
            the page currently shown by the driver is parsed.

    Returns:
        BeautifulSoup object built with the ``lxml`` parser.
    """
    if url is not None:
        get_url(driver, url)

    # Sets the driver's implicit wait to 30 s before the explicit wait below.
    driver.implicitly_wait(30)

    root_locator = (By.CSS_SELECTOR, 'div[id="root"]')
    root = WebDriverWait(driver, 60).until(EC.presence_of_element_located(root_locator))
    inner_html = driver.execute_script('return arguments[0].innerHTML', root)

    return BeautifulSoup(inner_html, 'lxml')

In [9]:
def _first_text(node, selector, default=None):
    """Return the text of the first element under ``node`` matching ``selector``, or ``default`` if there is none."""
    found = node.select(selector)
    return found[0].text if found else default


def get_product_info(soup, sold):
    """Build a dict of product details from a product page.

    Fields whose element is not present on the page are stored as ``None``
    instead of raising, so one incomplete page does not abort a crawl.
    ``'Star rating'`` is always present (``None`` when there are no ratings)
    so every record has the same fixed keys.

    Parameters:
        soup: Parsed product page (object exposing ``select``).
        sold: Sold quantity collected from the search results page.

    Returns:
        Dict with the fixed keys ``Name``, ``Brand``, ``Reviews count``,
        ``Star rating``, ``Quantity``, ``Price``, ``Crawl date``,
        ``Shop name``, ``Shop rating`` and ``Describe``, plus one key per
        option group (colour, storage, ...) and per specification row found
        on the page.
    """
    dict_product_info = {}

    # Product name
    dict_product_info['Name'] = _first_text(soup, 'h1[class="pdp-mod-product-badge-title"]')

    # Brand
    dict_product_info['Brand'] = _first_text(soup, 'div[class="pdp-product-brand"] > a')

    # Review count and star rating
    review_count = _first_text(soup, 'div[class="pdp-review-summary"] > a')
    if review_count is None:
        # Review summary not found on the page: the count is unknown.
        dict_product_info['Reviews count'] = None
        dict_product_info['Star rating'] = None
    elif review_count != 'No Ratings':
        dict_product_info['Reviews count'] = review_count
        dict_product_info['Star rating'] = _first_text(soup, 'span[class="score-average"]')
    else:
        dict_product_info['Reviews count'] = 0
        dict_product_info['Star rating'] = None

    # Sold quantity
    dict_product_info['Quantity'] = sold

    # Price, with the currency sign and thousands separators removed
    price = _first_text(soup, 'div[class="pdp-product-price"] > span')
    if price is not None:
        price = price.replace('₫', '').replace('.', '').strip()
    dict_product_info['Price'] = price

    # Crawl date
    dict_product_info['Crawl date'] = datetime.today().strftime('%Y-%m-%d')

    # Selectable options (colour, storage, ...): one key per option group
    for option_label in soup.select('div[class="sku-selector"] > div'):
        label = _first_text(option_label, 'div > h6')
        if label is None:
            continue
        options = option_label.select('div > div > div[class="sku-prop-content"] > span')
        dict_product_info[label] = ', '.join(option.text for option in options)

    # Shop name
    dict_product_info['Shop name'] = _first_text(soup, 'div[class="seller-name__detail"] > a')

    # Shop rating
    dict_product_info['Shop rating'] = _first_text(
        soup, 'div[class="seller-info-value rating-positive"]', 'Không có đánh giá'
    )

    # Technical specifications: one key per row
    for s in soup.select('div[class="pdp-general-features"] > ul >li'):
        name = _first_text(s, 'span')
        value = _first_text(s, 'div')
        if name is None or value is None:
            continue
        dict_product_info[name.strip()] = value

    # Product description
    dict_product_info['Describe'] = _first_text(soup, 'div[class="html-content detail-content"]')

    return dict_product_info

In [10]:
def _parse_review(review, shop_name):
    """Build one review record (dict) from a single review container element."""
    # Rating = number of star icons whose image is the "filled star" picture.
    star_icons = review.select('div[class="top"] > div > img')
    rating_score = sum(
        1 for icon in star_icons
        if icon.attrs['src'] == '//laz-img-cdn.alicdn.com/tfs/TB19ZvEgfDH8KJjy1XcXXcpdXXa-64-64.png'
    )

    return {
        # Shop name
        'Shop name': shop_name,
        # Reviewer name
        'Reviewer name': review.select('div[class="middle"] > span')[0].text,
        # Review text
        'Content': review.select('div > div[class="content"]')[0].text,
        # Star rating
        'Rating': rating_score,
        # Date of the review as shown on the page
        'Rating date': review.select('div[class="top"] > span')[0].text,
        # Crawl date
        'Crawl date': datetime.today().strftime('%Y-%m-%d'),
    }


def get_review(driver):
    """Collect the reviews of the product page currently open in ``driver``.

    When the page has no "empty" placeholder, the star filter menu is opened
    and each filter entry after the first is selected in turn; for every
    filter all result pages are walked with the "next" button and each review
    is converted to a dict.

    Parameters:
        driver: Selenium WebDriver showing a product page.

    Returns:
        List of dicts with the keys ``Shop name``, ``Reviewer name``,
        ``Content``, ``Rating``, ``Rating date`` and ``Crawl date``
        (empty when the product has no reviews).

    Raises:
        IndexError: if the shop name or a mandatory review field is missing
            from the page.
    """
    all_reviews = []
    soup = get_soup(driver)
    shop_name = soup.select('div[class="seller-name__detail"] > a')[0].text

    # The "mod-empty" placeholder is only rendered when there are no reviews.
    if soup.select('div[class="mod-empty"]'):
        return all_reviews

    filter_btn = driver.find_element(By.CSS_SELECTOR, 'span[class="condition"]')
    sleep(randint(20, 25))
    driver.execute_script("arguments[0].click();", filter_btn)
    # The first menu entry is skipped; only the remaining entries are visited.
    star_option_count = len(driver.find_elements(By.CSS_SELECTOR, 'li[class="next-menu-item"]')[1:])

    for star_num in range(1, star_option_count + 1):
        driver.execute_script("arguments[0].click();", filter_btn)
        # Re-query the menu entries on every pass instead of reusing stale elements.
        star = driver.find_elements(By.CSS_SELECTOR, 'li[class="next-menu-item"]')[star_num]

        driver.execute_script("arguments[0].click();", star)
        sleep(randint(20, 25))
        soup = get_soup(driver)

        try:
            last_page = int(soup.select('div[class="next-pagination-list"] > button')[-1].text)
        except (IndexError, ValueError):
            # No pagination buttons (or a non-numeric label): single page.
            last_page = 1

        for page_num in range(last_page):
            # Collect the reviews of the current page
            for review in soup.select('div[class="mod-reviews"] > div'):
                all_reviews.append(_parse_review(review, shop_name))

            # Go to the next page; stop paging when the button cannot be used.
            try:
                next_page_btn = driver.find_element(By.CSS_SELECTOR, 'button[class="next-btn next-btn-normal next-btn-medium next-pagination-item next"]')
                driver.execute_script("arguments[0].click();", next_page_btn)
            except Exception:
                # Exception (not a bare except) so that Ctrl+C still stops the crawl.
                break

            # Wait outside the try block so an interrupt during the pause propagates.
            sleep(30)

            # Parse the freshly loaded page
            soup = get_soup(driver)

    return all_reviews

#### Lưu file

In [11]:
def save_csv(list_of_dict, folder_path, file_name):
    """Write records to ``<folder_path><file_name>.csv``, appending if it exists.

    Parameters:
        list_of_dict: Records to save (list of dicts). Nothing is written
            when it is empty.
        folder_path: Folder prefix; it is concatenated with the file name as
            is, so it must end with a path separator.
        file_name: File name without the ``.csv`` extension.

    When the file already exists its rows are read back and the new rows are
    appended below them before the file is rewritten.
    """
    # An empty frame would be written as a CSV without columns, which
    # pd.read_csv cannot parse on the next append.
    if not list_of_dict:
        return

    save_loc = r'%s%s.csv'%(folder_path, file_name)

    # Create the destination folder so the collected data is not lost at the last step.
    parent_dir = os.path.dirname(save_loc)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df_out = pd.DataFrame(list_of_dict)
    if os.path.exists(save_loc):
        df_out = pd.concat([pd.read_csv(save_loc), df_out], ignore_index=True)

    df_out.to_csv(save_loc, index=False)

# Main

In [31]:
# Take a User-Agent string from fake_useragent's `random` attribute; it is
# passed to Chrome through the `user-agent=` option in the driver setup cell.
ua = UserAgent()
user_agent = ua.random

In [32]:
# Mở trình duyệt Google Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito') # Tab ẩn danh
options.add_argument("--start-maximized") # Full window
# options.add_argument('headless') # Không hiển thị chrome
options.add_argument(f'user-agent={user_agent}')
# options.add_argument('--proxy-server=%s'%PROXY)
s = Service('../chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)

In [33]:
# Nhập keyword (tên sản phẩm cần tìm kiếm)
keyword = input('Keyword: ')

# Tìm sản phẩm dựa vào keyword
list_links = search_product(driver, keyword)

# Lọc sản phẩm không liên quan và trích xuất url
filter_url_list = filter_links(list_links, keyword)

In [34]:
# Visit every filtered product page and collect its details and reviews.
product_info = []
product_reviews = []

try:
    for product_url in filter_url_list:
        product_soup = get_soup(driver, url=product_url['url'])

        product_info.append(get_product_info(product_soup, sold=product_url['Product sold']))

        # get_review reads the page that get_soup has just opened in the driver.
        product_reviews.extend(get_review(driver))
finally:
    # quit() ends the whole WebDriver session (browser and chromedriver
    # process), also when the crawl stops with an error; close() would only
    # close the current window.
    driver.quit()

#### Save file

In [38]:
# Output folder; save_csv concatenates this prefix directly with the file name.
# NOTE(review): inside a raw string the trailing \\ stays as two backslash
# characters, so the resulting path contains a doubled separator — confirm this is intended.
folder_path = r"..\..\Data\Lazada\\"

# Save the product_info file (currently disabled)
# file_name_info = '%s_info'%('_'.join(keyword.split(' ')))
# save_csv(product_info, folder_path, file_name_info)

# Save the reviews file; the file name is the keyword with spaces replaced by underscores.
file_name_reviews = '%s_reviews'%('_'.join(keyword.split(' ')))
save_csv(product_reviews, folder_path, file_name_reviews)