In [5]:

import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# ===== Chrome configuration =====
chrome_options = Options()
# "--headless" runs Chrome in the background; drop it from the tuple to watch
# the browser while the crawl runs.
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_flag)

# The value returned by ChromeDriverManager().install() is handed to Service.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# ===== URLs to crawl, keyed by category label =====
# Every URL shares the same catalog endpoint and trailing query parameters;
# only the percent-encoded search term differs.
_SEARCH_URL = "https://www.lazada.vn/catalog/?q={query}&style=wf&rating=5"
urls = {
    "Điện thoại": _SEARCH_URL.format(query="%C4%91i%E1%BB%87n%20tho%E1%BA%A1i"),
    "Quần áo": _SEARCH_URL.format(query="qu%E1%BA%A7n%20%C3%A1o"),
    "Tủ lạnh": _SEARCH_URL.format(query="t%E1%BB%A7%20l%E1%BA%A1nh"),
    "Tivi": _SEARCH_URL.format(query="ti%20vi"),
}

# ===== Crawl one listing URL =====
def _find_text(element, xpath):
    """Return the text of the first node matching *xpath*, or "N/A" on failure."""
    try:
        return element.find_element(By.XPATH, xpath).text
    except WebDriverException:
        # Covers a missing node, a stale element, and other WebDriver errors,
        # while letting KeyboardInterrupt and programming errors propagate.
        return "N/A"


def crawl_lazada(url, category, *, max_products=10, load_wait=3):
    """Collect basic fields for the first products listed at *url*.

    Uses the module-level ``driver`` to load the page, pauses so the page can
    render, then reads up to ``max_products`` elements matching ``div.RfADt``.

    Args:
        url: Listing page to open.
        category: Label copied into the "Category" field of every row.
        max_products: Maximum number of product elements to read.
        load_wait: Seconds to sleep after navigation before querying the page.

    Returns:
        A list of dicts with the keys "Category", "Name", "Price",
        "RatingCount" and "SoldCount". A field that cannot be read is "N/A".
    """
    driver.get(url)
    time.sleep(load_wait)  # fixed pause to let the page load

    products = driver.find_elements(By.CSS_SELECTOR, "div.RfADt")[:max_products]

    data = []
    for product in products:
        try:
            name = product.text.strip()
        except WebDriverException:
            name = "N/A"

        # NOTE(review): the `following::` axis is not limited to this product's
        # own card. If a card lacks one of these spans, the match is taken from
        # a later product instead of falling back to "N/A". Scoping the lookup
        # to the card container needs the page's DOM structure -- confirm.
        data.append({
            "Category": category,
            "Name": name,
            "Price": _find_text(
                product, ".//following::span[@class='ooOxS'][1]"
            ),
            "RatingCount": _find_text(
                product, ".//following::span[contains(@class,'qzqFw')][1]"
            ),
            "SoldCount": _find_text(
                product, ".//following::span[contains(text(),'Đã bán')]"
            ),
        })

    return data


# ===== Crawl every URL =====
all_data = []
try:
    for category, url in urls.items():
        print(f"Đang crawl {category} ...")
        all_data.extend(crawl_lazada(url, category))
finally:
    # Always shut the browser down, even when a crawl raises, so that no
    # Chrome process is left running.
    driver.quit()

# ===== Save to Excel =====
df = pd.DataFrame(all_data)
df.to_excel("lazada_top_products.xlsx", index=False)
print("✅ Crawl xong! Dữ liệu lưu vào lazada_top_products.xlsx")


Đang crawl Điện thoại ...
Đang crawl Quần áo ...
Đang crawl Tủ lạnh ...
Đang crawl Tivi ...
✅ Crawl xong! Dữ liệu lưu vào lazada_top_products.xlsx
