In [1]:
import html
import json
import re
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
# Listing-index URL template; main() fills "{}" with the page number.
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-ho-chi-minh-b43?page={}"
# First and last listing pages to scan (both inclusive).
START_PAGE = 1
END_PAGE = 1       # Adjust to the number of pages to crawl
# main() crawls the collected detail links and writes checkpoint_p<N>.csv
# whenever the page number is a multiple of this value (and on END_PAGE).
CHECKPOINT_PAGE = 50  # Save a checkpoint every 50 pages
# Number of threads in the pool that runs crawl_detail_task.
MAX_WORKERS = 4
# Path of the final CSV written once all pages are processed.
OUTPUT_FINAL = "meeyland_hcm_final.csv"

# ================= UTILS =================
def extract_id(link):
    """Return the numeric listing id that ends *link*, or None.

    The id is the final path segment and must consist of at least six
    digits directly preceded by "/" (e.g. ".../some-slug/1234567").
    """
    found = re.search(r"/(\d{6,})$", link)
    if found is None:
        return None
    return found.group(1)

def price_to_billion(text):
    """Parse a displayed price into a number of billions (VND "tỷ").

    The first number in *text* is used; a comma is treated as the decimal
    separator. If that number is directly followed by a million unit
    ("triệu" or the abbreviation "tr") it is divided by 1000 so the result
    is still expressed in billions. Any other (or missing) unit leaves the
    number unchanged.

    Returns:
        float, or None when *text* is empty, contains no number, or the
        numeric token is malformed (e.g. "1.2.3").
    """
    if not text:
        return None
    normalized = text.lower().replace(",", ".")
    # Require at least one digit so a lone "." (sentence punctuation in
    # texts such as "Thoa thuan.") is never handed to float().
    # The unit is matched by its ASCII prefix "tri" / "tr" so the check does
    # not depend on how the accented characters are normalised.
    m = re.search(r"(\.?\d[\d.]*)\s*(tri|tr\b)?", normalized)
    if not m:
        return None
    try:
        value = float(m.group(1).rstrip("."))
    except ValueError:
        # Tokens with several dots ("1.2.3") are not a single number.
        return None
    if m.group(2):
        value /= 1000  # millions -> billions
    return value

def clean_area(text):
    """Parse an area such as "85,5 m2" into a float number of square metres.

    The first number that is followed (optionally after whitespace) by "m"
    is used. A comma is treated as the decimal separator, consistent with
    price_to_billion.

    Returns:
        float, or None when *text* is empty, has no "<number> m" fragment,
        or the numeric token is malformed (e.g. "1.2.3 m").
    """
    if not text:
        return None
    normalized = text.lower().replace(",", ".")
    # Require at least one digit so a lone "." before "m" cannot match.
    m = re.search(r"(\.?\d[\d.]*)\s*m", normalized)
    if not m:
        return None
    try:
        return float(m.group(1).rstrip("."))
    except ValueError:
        # Tokens with several dots are not a single number.
        return None

def extract_district(address):
    """Extract the district part of a Ho Chi Minh City address.

    Matching is case-insensitive (the address is lower-cased first):

    * any address mentioning "thủ đức" yields "TP Thủ Đức";
    * otherwise the first "quận <number>", "q.<number>", "quận <name>" or
      "huyện <name>" fragment is returned in title case;
    * "N/A" is returned for an empty address or when nothing matches.

    The Vietnamese literals must be real Unicode text: the mis-encoded form
    can never match page content, and its character class contains a
    reversed range that makes the pattern fail to compile.
    """
    if not address:
        return "N/A"
    addr = address.lower()
    if "thủ đức" in addr:
        return "TP Thủ Đức"
    m = re.search(
        r"(quận\s*\d+|q\.\d+|quận\s*[a-zà-ỹ\s]+|huyện\s*[a-zà-ỹ\s]+)",
        addr,
    )
    if not m:
        return "N/A"
    # The name classes include \s, so the match can end in whitespace
    # (e.g. "quận tân bình " before a "-"); strip it before title-casing.
    return m.group(1).strip().title()

# ================= DRIVER =================
def init_driver():
    """Build and return a headless Chrome WebDriver.

    Chrome is started with the AutomationControlled blink feature disabled,
    a 1366x768 window and headless mode. The chromedriver binary is
    resolved through webdriver_manager's ChromeDriverManager.
    """
    chrome_flags = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1366,768",
        "--headless",
    )
    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)
    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=opts)

# ================= CRAWL DETAIL =================
def crawl_detail_task(url, page_num):
    """Scrape one listing detail page into a flat record (best effort).

    A dedicated browser is started for the call and always closed again.
    Any failure (including a failed browser start) is printed and the
    record is returned with whatever fields were filled so far, so one bad
    page never aborts the surrounding batch.

    Args:
        url: Absolute URL of the listing detail page.
        page_num: Listing-index page the link was found on; stored as "Page".

    Returns:
        dict with the fixed keys id, Page, Title, Price_Raw, Price_Billion,
        Area_m2, District, Address, Bedrooms, Toilets, Post_Time, Link and
        Description, plus one extra key per label/value pair found in the
        page's "property" detail box.
    """
    # Local import: only this function needs Selenium's exception hierarchy.
    from selenium.common.exceptions import WebDriverException

    # "Xem thêm" ("See more") toggles that expand collapsed sections.
    see_more_xpath = "//span[contains(text(),'Xem thêm')]"

    # Fixed columns, initialised up front so every record has the same keys.
    data = {
        "id": extract_id(url),
        "Page": page_num,
        "Title": "N/A",
        "Price_Raw": "N/A",
        "Price_Billion": None,
        "Area_m2": None,
        "District": "N/A",
        "Address": "N/A",
        "Bedrooms": None,
        "Toilets": None,
        "Post_Time": "N/A",
        "Link": url,
        "Description": "N/A"
    }
    fixed_fields = frozenset(data)

    driver = None
    try:
        # Started inside the try so a failed browser launch is reported like
        # any other per-page error instead of propagating to the caller.
        driver = init_driver()
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        # Expand every "Xem thêm" section. The toggles are optional, so a
        # timeout or a failed click is not an error for the page as a whole.
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, see_more_xpath)))
            for btn in driver.find_elements(By.XPATH, see_more_xpath):
                driver.execute_script("arguments[0].click();", btn)
            time.sleep(0.5)
        except WebDriverException:
            pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Fill the fixed fields.
        title_tag = soup.find("h1")
        if title_tag:
            data["Title"] = title_tag.get_text(strip=True)

        price_tag = soup.select_one("h2.text-error-600")
        if price_tag:
            data["Price_Raw"] = price_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        addr_tag = soup.select_one("div.text-fs-14.font-medium.text-primary-600")
        if addr_tag:
            data["Address"] = addr_tag.get_text(strip=True)
            data["District"] = extract_district(data["Address"])

        desc_div = soup.select_one("div.article-description div.break-words")
        data["Description"] = desc_div.get_text(" ", strip=True) if desc_div else "N/A"

        # Bedrooms (PN), toilets (WC) and area come from the summary spans.
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text().lower()
            if "m2" in t:
                data["Area_m2"] = clean_area(t)
            if "pn" in t:
                m = re.search(r"\d+", t)
                if m:
                    data["Bedrooms"] = m.group()
            if "wc" in t:
                m = re.search(r"\d+", t)
                if m:
                    data["Toilets"] = m.group()

        # Dynamic label/value pairs from the detail box (div#property);
        # each label becomes its own key.
        prop_box = soup.find("div", id="property")
        if prop_box:
            for item in prop_box.find_all("div", class_="flex items-start"):
                cells = item.find_all("span")
                if len(cells) < 2:
                    continue
                label = cells[0].get_text(strip=True)
                # Never let a scraped label overwrite a fixed column or
                # create a column with an empty name.
                if label and label not in fixed_fields:
                    data[label] = cells[1].get_text(strip=True)

        # "Ngày đăng" = "Posted on".
        post_tag = soup.find("p", string=re.compile("Ngày đăng"))
        if post_tag:
            data["Post_Time"] = post_tag.get_text().replace("Ngày đăng:", "").strip()

    except Exception as e:
        print(f"   [ERR] {url[-10:]}: {e}")
    finally:
        if driver is not None:
            driver.quit()
    return data

# ================= MAIN =================
def main():
    """Crawl listing pages START_PAGE..END_PAGE and export the details to CSV.

    For each listing page the detail links are collected. Every
    CHECKPOINT_PAGE pages (and on the last page) the pending links are
    scraped in parallel with crawl_detail_task, and everything gathered so
    far is written to checkpoint_p<page>.csv. At the end all records are
    written to OUTPUT_FINAL with the fixed columns first.
    """
    # Local import keeps the new dependency inside the block that uses it.
    from urllib.parse import urljoin

    all_data = []
    batch_links = []
    # Every link queued during this run. A set gives O(1) membership tests
    # and, unlike the per-batch list, also prevents re-crawling a listing
    # that shows up again on a page belonging to a later checkpoint batch.
    seen_links = set()

    print(f"🚀 BẮT ĐẦU CÀO: {START_PAGE} -> {END_PAGE} (Checkpoint mỗi {CHECKPOINT_PAGE} trang)")
    list_driver = init_driver()

    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"--- Quét List Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(1.5)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            found = 0
            for a in soup.select("a[href]"):
                href = a["href"]
                if "ho-chi-minh" not in href or not re.search(r"/\d{6,}$", href):
                    continue
                # urljoin handles root-relative, relative and already
                # absolute hrefs; plain concatenation only handled the first.
                full = urljoin("https://meeyland.com", href)
                if full in seen_links:
                    continue
                seen_links.add(full)
                batch_links.append((full, p))  # keep the source page number
                found += 1

            print(f"   Tìm thấy {found} tin.")

            if p % CHECKPOINT_PAGE == 0 or p == END_PAGE:
                print(f"\n⚡ Đang cào chi tiết Batch {len(batch_links)} link...")
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    futures = [exe.submit(crawl_detail_task, link, p_num) for link, p_num in batch_links]
                    # Collect in submission order so rows follow page order.
                    for i, f in enumerate(futures, start=1):
                        res = f.result()
                        all_data.append(res)
                        print(f"   [{i}/{len(batch_links)}] Xong: {res['id']}")

                # Save a checkpoint with everything collected so far.
                pd.DataFrame(all_data).to_csv(f"checkpoint_p{p}.csv", index=False, encoding="utf-8-sig")
                batch_links = []  # start a fresh batch for the next round
                print(f"💾 Đã lưu checkpoint_p{p}.csv\n")

    finally:
        list_driver.quit()

    if all_data:
        df = pd.DataFrame(all_data)
        # Put the fixed columns first, in this order; dynamic columns follow.
        cols_fixed = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
        cols_dynamic = [c for c in df.columns if c not in cols_fixed]
        df = df[cols_fixed + cols_dynamic]

        df.to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print(f"\n✅ HOÀN TẤT! Tổng {len(all_data)} tin. File: {OUTPUT_FINAL}")

# Start the crawl only when this module is the entry point (script or
# notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] [-v] [-s STYLE] [-b] [-j] identifier
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
