# LAZADA REVIEW CRAWLER 


In [None]:
!pip install -q requests pandas xlsxwriter fake-useragent beautifulsoup4 lxml

In [None]:
import os, re, json, time, random, requests
from datetime import datetime
import pandas as pd
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import http.cookiejar as cookielib
from google.colab import files

# Shared HTTP session used by every request in this notebook.
ua = UserAgent()
session = requests.Session()

# Browser-like default headers sent with every request made through `session`.
# NOTE(review): the User-Agent is picked at random while the sec-ch-ua* hints
# below always claim Chrome 129 on Windows, so the two may contradict each
# other - confirm whether the target endpoint cares.
session.headers.update({
    "User-Agent": ua.random,
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7",
    # "br" is deliberately not advertised: requests/urllib3 can only decode
    # Brotli when an optional brotli package is installed, and this notebook
    # does not install one. Advertising it risks an undecodable response body.
    "Accept-Encoding": "gzip, deflate",
    "Referer": "https://www.lazada.vn/",
    "Origin": "https://www.lazada.vn",
    "sec-ch-ua": '"Google Chrome";v="129", "Not=A?Brand";v="8"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
})

print("Khởi tạo xong")

In [None]:
# Upload the cookies file through the Colab file picker.
# `uploaded` is used as a mapping whose keys are the uploaded file names.
uploaded = files.upload()
if not uploaded:
    # Cancelling the dialog leaves nothing to load; fail with a clear message
    # instead of an opaque IndexError on the lookup below.
    raise RuntimeError("Chưa có file cookies nào được upload!")
cookie_filename = next(iter(uploaded))  # first uploaded file
print(f"Đã upload: {cookie_filename}")

# Load the cookies (MozillaCookieJar reads the Netscape cookies.txt format)
# and attach them to the shared session. Session-only and expired cookies are
# kept on purpose via ignore_discard / ignore_expires.
cj = cookielib.MozillaCookieJar()
cj.load(cookie_filename, ignore_discard=True, ignore_expires=True)
session.cookies = cj

print(f"Đã load {len(cj)} cookies thành công!")

In [None]:
def extract_item_id(url):
    """Return the numeric item id embedded in a Lazada product URL.

    The id is the digit run of the ``-i<digits>`` segment that closes the URL
    path, optionally followed by a ``-s<digits>`` segment and/or ``.html``
    (the ``-s`` suffix is assumed from commonly seen Lazada URLs). Anchoring
    to the end of the path prevents a product slug that itself contains
    ``-i<digits>`` (for example ``...-i12-pro-...``) from being mistaken for
    the id.

    Args:
        url: Product page URL.

    Returns:
        The item id as a string, or ``None`` when ``url`` is not a string or
        contains no ``-i<digits>`` segment.
    """
    if not isinstance(url, str):
        return None

    # The id lives in the path; drop any query string or fragment first.
    path = re.split(r'[?#]', url, maxsplit=1)[0]

    anchored = re.search(r'-i(\d+)(?:-s\d+)?(?:\.html)?/?$', path)
    if anchored:
        return anchored.group(1)

    # Fallback for unusual URL shapes: first ``-i<digits>`` anywhere in the URL.
    loose = re.search(r'-i(\d+)', url)
    return loose.group(1) if loose else None

def crawl_lazada_reviews(product_url, max_reviews=2000, delay_min=2, delay_max=5,
                         max_retries=3):
    """Fetch reviews of one Lazada product and export them to Excel and JSON.

    Pages of 50 reviews are requested from the review-list endpoint through
    the module-level ``session`` until ``max_reviews`` reviews are collected,
    the endpoint returns no items, a non-200 status is received, or a page
    fails more than ``max_retries`` times in a row. The collected reviews are
    written to ``lazada_<item_id>_<count>_reviews.xlsx`` / ``.json`` in the
    current directory and both files are handed to ``files.download``.

    Args:
        product_url: Product page URL; its item id is read with
            ``extract_item_id``.
        max_reviews: Upper bound on the number of reviews kept.
        delay_min: Minimum pause in seconds between two page requests.
        delay_max: Maximum pause in seconds between two page requests.
        max_retries: How many consecutive failed attempts (network error or
            non-JSON body) are retried before the crawl gives up.

    Returns:
        A ``pandas.DataFrame`` of the flattened reviews, or ``None`` when the
        item id cannot be found or no review was collected.
    """
    item_id = extract_item_id(product_url)
    if not item_id:
        print("Không tìm thấy item_id trong URL!")
        return None

    print(f"Bắt đầu cào item_id: {item_id}")
    url = "https://my.lazada.vn/pdp/review/getReviewList"
    all_reviews = []
    page = 1
    consecutive_errors = 0

    while len(all_reviews) < max_reviews:
        params = {
            "itemId": item_id,
            "pageSize": 50,
            "page": page,
            "filter": "0",
            "sort": "0",
        }

        try:
            r = session.get(url, params=params, timeout=30)

            if r.status_code != 200:
                print(f"HTTP {r.status_code} – Dừng lại")
                break

            data = r.json()
        except (requests.RequestException, ValueError) as e:
            # Network failure or a body that is not JSON. Retry the same page
            # a bounded number of times instead of looping forever.
            consecutive_errors += 1
            print("Lỗi:", e)
            if consecutive_errors > max_retries:
                print(f"Đã thử lại {max_retries} lần không thành công – Dừng lại")
                break
            time.sleep(10)
            continue

        consecutive_errors = 0

        # Tolerate a missing or null "model" / "items" instead of raising.
        model = data.get("model") if isinstance(data, dict) else None
        items = model.get("items") if isinstance(model, dict) else None

        if not items:
            print("Hết dữ liệu rồi!")
            break

        all_reviews.extend(items)
        print(f"Trang {page} → +{len(items)} → Tổng: {len(all_reviews)}")

        page += 1
        # Pause only when another request will actually follow.
        if len(all_reviews) < max_reviews:
            time.sleep(random.uniform(delay_min, delay_max))

    # The last page may overshoot the requested cap; keep at most max_reviews.
    all_reviews = all_reviews[:max_reviews]

    if not all_reviews:
        print("Không lấy được review nào")
        return None

    df = pd.json_normalize(all_reviews)

    # Convert the review timestamp column.
    # NOTE(review): this assumes reviewTime holds epoch milliseconds; values in
    # any other form become NaT because of errors='coerce' - confirm against a
    # real API response.
    if 'reviewTime' in df.columns:
        df['reviewTime'] = pd.to_datetime(df['reviewTime'], unit='ms', errors='coerce')

    # Save as Excel (flattened) and JSON (raw items).
    excel_file = f"lazada_{item_id}_{len(all_reviews)}_reviews.xlsx"
    json_file = f"lazada_{item_id}_{len(all_reviews)}_reviews.json"

    df.to_excel(excel_file, index=False)
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(all_reviews, f, ensure_ascii=False, indent=2)

    print(f"\nHOÀN TẤT!")
    print(f"→ Tổng: {len(all_reviews)} reviews")
    print(f"→ Excel: {excel_file}")
    print(f"→ JSON: {json_file}")

    # Trigger the browser download of both files.
    files.download(excel_file)
    files.download(json_file)

    return df

In [None]:
# Paste the Lazada product link here.
product_url = "https://www.lazada.vn/products/ao-thun-nam-tay-ngan-co-tron-i123456789.html"

# Crawl settings.
MAX_REVIEWS = 3000            # upper bound on the number of reviews to crawl
DELAY_MIN, DELAY_MAX = 2, 5   # pause range, in seconds, between requests

crawl_lazada_reviews(
    product_url,
    max_reviews=MAX_REVIEWS,
    delay_min=DELAY_MIN,
    delay_max=DELAY_MAX,
)