In [1]:
import pandas as pd
import os
import re
import time
import sys
import json
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-ho-chi-minh-b43?page={}"
START_PAGE = 1
END_PAGE = 300
MAX_WORKERS = 5
OUTPUT_FINAL = "meeyland_hcm_total.csv"
CRAWLED_LOG = "crawled_ids.txt"

print("üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...")
DRIVER_PATH = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1366,768")
    return webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

# ================= UTILS =================
def extract_id(link):
    m = re.search(r"/(\d{6,})$", link)
    return m.group(1) if m else None

def price_to_billion(text):
    if not text: return None
    t = text.lower().replace(",", ".")
    m = re.search(r"([\d\.]+)", t)
    if m:
        val = float(m.group(1))
        return val if "t·ª∑" in t else val / 1000
    return None

def extract_district(address):
    if not address or address == "N/A": return "N/A"
    addr = address.lower()
    if "th·ªß ƒë·ª©c" in addr: return "TP Th·ªß ƒê·ª©c"
    m = re.search(r"(qu·∫≠n\s+\d+|q\.\d+|q\s+\d+|h\.\s+[a-z√†-·ªπ\s]+|huy·ªán\s+[a-z√†-·ªπ\s]+|qu·∫≠n\s+[a-z√†-·ªπ\s]+)", addr)
    if m:
        return m.group(1).replace("q.", "Qu·∫≠n ").replace("q", "Qu·∫≠n ").replace("h.", "Huy·ªán ").strip().title()
    return "N/A"

# ================= CORE TASK =================
def crawl_detail_task(url, page_num):
    ad_id = extract_id(url)
    driver = init_driver()
    data = {
        "id": ad_id, "Page": page_num, "Title": "N/A", "Price_Raw": "N/A",
        "Price_Billion": None, "Price_per_m2": "N/A", "Area_m2": None,
        "District": "N/A", "Address": "N/A", "Bedrooms": None, "Toilets": None,
        "Post_Time": "N/A", "Link": url, "Description": "N/A"
    }

    try:
        driver.get(url)
        # 1. Ch·ªù render v√† bung n·ªôi dung ·∫©n
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "article-description")))
            driver.execute_script("""
                let btns = document.querySelectorAll('span');
                btns.forEach(b => { if(b.innerText.includes('Xem th√™m')) b.click(); });
            """)
            time.sleep(0.5)
        except: pass

        # 2. L·∫•y Address & District (Selector c·∫≠p nh·∫≠t)
        try:
            addr_el = driver.find_element(By.CSS_SELECTOR, "div.text-primary-600.line-clamp-1, div.text-fs-14.font-medium.text-primary-600")
            full_addr = driver.execute_script("return arguments[0].textContent;", addr_el).strip()
            data["Address"] = full_addr.replace("thu g·ªçn", "").strip()
            data["District"] = extract_district(data["Address"])
        except: pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 3. Ti√™u ƒë·ªÅ & Gi√° t·ªïng
        data["Title"] = soup.find("h1").get_text(strip=True) if soup.find("h1") else "N/A"
        p_tag = soup.select_one("h2.text-error-600")
        if p_tag:
            data["Price_Raw"] = p_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        # 4. X·ª¨ L√ù DI·ªÜN T√çCH (∆Øu ti√™n Tippy-content ƒë·ªÉ ch√≠nh x√°c 100%)
        # T√¨m div ch·ª©a icon di·ªán t√≠ch th√¥ng qua thu·ªôc t√≠nh data-tippy-content
        area_div = soup.find("div", attrs={"data-tippy-content": re.compile(r"Di·ªán t√≠ch", re.I)})
        if area_div:
            area_span = area_div.find("span", class_="text-fs-14")
            if area_span:
                v = re.search(r"([\d\.,]+)", area_span.get_text())
                if v: data["Area_m2"] = v.group(1).replace(",", ".")

        # Fallback: Qu√©t c√°c span n·∫øu Tippy th·∫•t b·∫°i
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text(strip=True).lower()
            if ("m2" in t or "m¬≤" in t) and "/" not in t and data["Area_m2"] is None:
                v = re.search(r"([\d\.,]+)", t)
                if v: data["Area_m2"] = v.group(1).replace(",", ".")
            elif ("m2" in t or "m¬≤" in t) and "/" in t:
                data["Price_per_m2"] = t

            if "pn" in t:
                v = re.search(r"\d+", t); data["Bedrooms"] = v.group() if v else data["Bedrooms"]
            if "wc" in t:
                v = re.search(r"\d+", t); data["Toilets"] = v.group() if v else data["Toilets"]

        # 5. M√î T·∫¢ (3 L·ªõp b·∫£o v·ªá)
        desc_text = None
        # L·ªõp 1: JSON Astro
        for astro in soup.find_all("astro-island"):
            props = astro.get("props")
            if props:
                try:
                    js = json.loads(html.unescape(props))
                    if "article" in js and js["article"].get("description"):
                        desc_text = BeautifulSoup(js["article"]["description"], "html.parser").get_text(" ", strip=True)
                        break
                except: continue

        # L·ªõp 2: Selector DOM
        if not desc_text:
            dt = soup.select_one("div.article-description div.break-words, div.article-description")
            if dt: desc_text = dt.get_text(" ", strip=True)

        # L·ªõp 3: Xpath d·ª± ph√≤ng
        if not desc_text:
            try:
                de = driver.find_element(By.XPATH, "//*[contains(text(),'M√¥ t·∫£')]/following-sibling::div")
                desc_text = de.text.strip()
            except: pass

        data["Description"] = desc_text if desc_text else "N/A"

        # 6. Qu√©t ƒë·ªông b·∫£ng thu·ªôc t√≠nh (Property)
        prop_box = soup.find("div", id="property")
        if prop_box:
            for it in prop_box.find_all("div", class_="flex items-start"):
                ss = it.find_all("span")
                if len(ss) >= 2: data[ss[0].get_text(strip=True)] = ss[1].get_text(strip=True)

        pt = soup.find("p", string=re.compile("Ng√†y ƒëƒÉng"))
        if pt: data["Post_Time"] = pt.get_text().replace("Ng√†y ƒëƒÉng:", "").strip()

    except Exception as e:
        print(f"‚ùå L·ªói t·∫°i {url}: {e}")
    finally:
        driver.quit()
        with open(CRAWLED_LOG, "a") as f: f.write(f"{ad_id}\n")

    print(f"   [DONE] ID: {ad_id} | Area: {data['Area_m2']} | Dist: {data['District']}")
    sys.stdout.flush()
    return data

# ================= RUNNER (Gi·ªØ nguy√™n) =================
def main():
    if not os.path.exists(CRAWLED_LOG): open(CRAWLED_LOG, 'w').close()
    with open(CRAWLED_LOG, 'r') as f: crawled_ids = set(line.strip() for line in f)

    all_data = []
    if os.path.exists(OUTPUT_FINAL):
        all_data = pd.read_csv(OUTPUT_FINAL).to_dict('records')

    print(f"üöÄ RESUME: ƒê√£ c√≥ {len(all_data)} tin.")

    list_driver = init_driver()
    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"\n--- üåê Qu√©t Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(2)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            new_links = []
            for a in soup.select("a[href]"):
                h = a["href"]
                if "ho-chi-minh" in h and re.search(r"/\d{6,}$", h):
                    url = "https://meeyland.com" + h
                    if extract_id(url) not in crawled_ids:
                        new_links.append((url, p))

            if new_links:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    results = list(exe.map(lambda x: crawl_detail_task(*x), new_links))
                    all_data.extend(results)
                    for r in results: crawled_ids.add(r['id'])

                df = pd.DataFrame(all_data)
                f_cols = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Price_per_m2", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
                d_cols = [c for c in df.columns if c not in f_cols]
                df[f_cols + d_cols].to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
                print(f"üìä ƒê√£ l∆∞u Page {p}. T·ªïng: {len(df)} tin.")
            else:
                print(f"   (Trang {p} ƒë√£ xong)")

    except KeyboardInterrupt:
        print("\nüõë ƒêang l∆∞u d·ªØ li·ªáu v√† tho√°t...")
    finally:
        list_driver.quit()
        if all_data:
            pd.DataFrame(all_data).to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print("üíæ Xong.")

if __name__ == "__main__":
    main()

üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...
üöÄ RESUME: ƒê√£ c√≥ 0 tin.

--- üåê Qu√©t Trang 1 ---
   [DONE] ID: 306038613 | Area: 48 | Dist: N/A
   [DONE] ID: 104652329 | Area: 120 | Dist: N/A
   [DONE] ID: 102855901 | Area: 87 | Dist: N/A
   [DONE] ID: 306041245 | Area: 54 | Dist: N/A
   [DONE] ID: 306041109 | Area: 33.6 | Dist: N/A
   [DONE] ID: 104652322 | Area: 66 | Dist: N/A
   [DONE] ID: 104957777 | Area: 500 | Dist: N/A
   [DONE] ID: 104754381 | Area: 94 | Dist: N/A
   [DONE] ID: 104735889 | Area: 170 | Dist: N/A
   [DONE] ID: 104445264 | Area: 220 | Dist: N/A
   [DONE] ID: 302195316 | Area: 113 | Dist: TP Th·ªß ƒê·ª©c
   [DONE] ID: 302659571 | Area: 100 | Dist: N/A
üìä ƒê√£ l∆∞u Page 1. T·ªïng: 12 tin.

--- üåê Qu√©t Trang 2 ---
   [DONE] ID: 306041128 | Area: 75 | Dist: N/A
   [DONE] ID: 302820948 | Area: 100 | Dist: N/A
   [DONE] ID: 302949591 | Area: 81 | Dist: N/A
   [DONE] ID: 302769902 | Area: 112 | Dist: N/A
   [DONE] ID: 302969414 | Area: 128 | Dist: N/A
   [DONE] ID

Enrich d·ªØ li·ªáu qu·∫≠n 11

In [2]:
import pandas as pd
import os
import re
import time
import sys
import json
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-quan-11-ho-chi-minh-c423?page={}"
START_PAGE = 1
END_PAGE = 20
MAX_WORKERS = 5
OUTPUT_FINAL = "../../data/raw/quan11.csv"
CRAWLED_LOG = "../../data/raw/crawled_ids.txt"

print("üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...")
DRIVER_PATH = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1366,768")
    return webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

# ================= UTILS =================
def extract_id(link):
    m = re.search(r"/(\d{6,})$", link)
    return m.group(1) if m else None

def price_to_billion(text):
    if not text: return None
    t = text.lower().replace(",", ".")
    m = re.search(r"([\d\.]+)", t)
    if m:
        val = float(m.group(1))
        return val if "t·ª∑" in t else val / 1000
    return None

def extract_district(address):
    if not address or address == "N/A": return "N/A"
    addr = address.lower()
    if "th·ªß ƒë·ª©c" in addr: return "TP Th·ªß ƒê·ª©c"
    m = re.search(r"(qu·∫≠n\s+\d+|q\.\d+|q\s+\d+|h\.\s+[a-z√†-·ªπ\s]+|huy·ªán\s+[a-z√†-·ªπ\s]+|qu·∫≠n\s+[a-z√†-·ªπ\s]+)", addr)
    if m:
        return m.group(1).replace("q.", "Qu·∫≠n ").replace("q", "Qu·∫≠n ").replace("h.", "Huy·ªán ").strip().title()
    return "N/A"

# ================= CORE TASK =================
def crawl_detail_task(url, page_num):
    ad_id = extract_id(url)
    driver = init_driver()
    data = {
        "id": ad_id, "Page": page_num, "Title": "N/A", "Price_Raw": "N/A",
        "Price_Billion": None, "Price_per_m2": "N/A", "Area_m2": None,
        "District": "N/A", "Address": "N/A", "Bedrooms": None, "Toilets": None,
        "Post_Time": "N/A", "Link": url, "Description": "N/A"
    }

    try:
        driver.get(url)
        # 1. Ch·ªù render v√† bung n·ªôi dung ·∫©n
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "article-description")))
            driver.execute_script("""
                let btns = document.querySelectorAll('span');
                btns.forEach(b => { if(b.innerText.includes('Xem th√™m')) b.click(); });
            """)
            time.sleep(0.5)
        except: pass

        # 2. L·∫•y Address & District (Selector c·∫≠p nh·∫≠t)
        try:
            addr_el = driver.find_element(By.CSS_SELECTOR, "div.text-primary-600.line-clamp-1, div.text-fs-14.font-medium.text-primary-600")
            full_addr = driver.execute_script("return arguments[0].textContent;", addr_el).strip()
            data["Address"] = full_addr.replace("thu g·ªçn", "").strip()
            data["District"] = extract_district(data["Address"])
        except: pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 3. Ti√™u ƒë·ªÅ & Gi√° t·ªïng
        data["Title"] = soup.find("h1").get_text(strip=True) if soup.find("h1") else "N/A"
        p_tag = soup.select_one("h2.text-error-600")
        if p_tag:
            data["Price_Raw"] = p_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        # 4. X·ª¨ L√ù DI·ªÜN T√çCH (∆Øu ti√™n Tippy-content ƒë·ªÉ ch√≠nh x√°c 100%)
        # T√¨m div ch·ª©a icon di·ªán t√≠ch th√¥ng qua thu·ªôc t√≠nh data-tippy-content
        area_div = soup.find("div", attrs={"data-tippy-content": re.compile(r"Di·ªán t√≠ch", re.I)})
        if area_div:
            area_span = area_div.find("span", class_="text-fs-14")
            if area_span:
                v = re.search(r"([\d\.,]+)", area_span.get_text())
                if v: data["Area_m2"] = v.group(1).replace(",", ".")

        # Fallback: Qu√©t c√°c span n·∫øu Tippy th·∫•t b·∫°i
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text(strip=True).lower()
            if ("m2" in t or "m¬≤" in t) and "/" not in t and data["Area_m2"] is None:
                v = re.search(r"([\d\.,]+)", t)
                if v: data["Area_m2"] = v.group(1).replace(",", ".")
            elif ("m2" in t or "m¬≤" in t) and "/" in t:
                data["Price_per_m2"] = t

            if "pn" in t:
                v = re.search(r"\d+", t); data["Bedrooms"] = v.group() if v else data["Bedrooms"]
            if "wc" in t:
                v = re.search(r"\d+", t); data["Toilets"] = v.group() if v else data["Toilets"]

        # 5. M√î T·∫¢ (3 L·ªõp b·∫£o v·ªá)
        desc_text = None
        # L·ªõp 1: JSON Astro
        for astro in soup.find_all("astro-island"):
            props = astro.get("props")
            if props:
                try:
                    js = json.loads(html.unescape(props))
                    if "article" in js and js["article"].get("description"):
                        desc_text = BeautifulSoup(js["article"]["description"], "html.parser").get_text(" ", strip=True)
                        break
                except: continue

        # L·ªõp 2: Selector DOM
        if not desc_text:
            dt = soup.select_one("div.article-description div.break-words, div.article-description")
            if dt: desc_text = dt.get_text(" ", strip=True)

        # L·ªõp 3: Xpath d·ª± ph√≤ng
        if not desc_text:
            try:
                de = driver.find_element(By.XPATH, "//*[contains(text(),'M√¥ t·∫£')]/following-sibling::div")
                desc_text = de.text.strip()
            except: pass

        data["Description"] = desc_text if desc_text else "N/A"

        # 6. Qu√©t ƒë·ªông b·∫£ng thu·ªôc t√≠nh (Property)
        prop_box = soup.find("div", id="property")
        if prop_box:
            for it in prop_box.find_all("div", class_="flex items-start"):
                ss = it.find_all("span")
                if len(ss) >= 2: data[ss[0].get_text(strip=True)] = ss[1].get_text(strip=True)

        pt = soup.find("p", string=re.compile("Ng√†y ƒëƒÉng"))
        if pt: data["Post_Time"] = pt.get_text().replace("Ng√†y ƒëƒÉng:", "").strip()

    except Exception as e:
        print(f"‚ùå L·ªói t·∫°i {url}: {e}")
    finally:
        driver.quit()
        with open(CRAWLED_LOG, "a") as f: f.write(f"{ad_id}\n")

    print(f"   [DONE] ID: {ad_id} | Area: {data['Area_m2']} | Dist: {data['District']}")
    sys.stdout.flush()
    return data

# ================= RUNNER (Gi·ªØ nguy√™n) =================
def main():
    if not os.path.exists(CRAWLED_LOG): open(CRAWLED_LOG, 'w').close()
    with open(CRAWLED_LOG, 'r') as f: crawled_ids = set(line.strip() for line in f)

    all_data = []
    if os.path.exists(OUTPUT_FINAL):
        all_data = pd.read_csv(OUTPUT_FINAL).to_dict('records')

    print(f"üöÄ RESUME: ƒê√£ c√≥ {len(all_data)} tin.")

    list_driver = init_driver()
    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"\n--- üåê Qu√©t Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(2)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            new_links = []
            for a in soup.select("a[href]"):
                h = a["href"]
                if "ho-chi-minh" in h and re.search(r"/\d{6,}$", h):
                    url = "https://meeyland.com" + h
                    if extract_id(url) not in crawled_ids:
                        new_links.append((url, p))

            if new_links:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    results = list(exe.map(lambda x: crawl_detail_task(*x), new_links))
                    all_data.extend(results)
                    for r in results: crawled_ids.add(r['id'])

                df = pd.DataFrame(all_data)
                f_cols = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Price_per_m2", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
                d_cols = [c for c in df.columns if c not in f_cols]
                df[f_cols + d_cols].to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
                print(f"üìä ƒê√£ l∆∞u Page {p}. T·ªïng: {len(df)} tin.")
            else:
                print(f"   (Trang {p} ƒë√£ xong)")

    except KeyboardInterrupt:
        print("\nüõë ƒêang l∆∞u d·ªØ li·ªáu v√† tho√°t...")
    finally:
        list_driver.quit()
        if all_data:
            pd.DataFrame(all_data).to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print("üíæ Xong.")

if __name__ == "__main__":
    main()

üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...
üöÄ RESUME: ƒê√£ c√≥ 0 tin.

--- üåê Qu√©t Trang 1 ---
   [DONE] ID: 306101517 | Area: 63 | Dist: N/A
   [DONE] ID: 302949591 | Area: 81 | Dist: N/A
   [DONE] ID: 305956459 | Area: 60 | Dist: N/A
   [DONE] ID: 302468081 | Area: 21 | Dist: N/A   [DONE] ID: 305968718 | Area: 60 | Dist: N/A

   [DONE] ID: 305998588 | Area: 60 | Dist: N/A
   [DONE] ID: 305983313 | Area: 60 | Dist: N/A
   [DONE] ID: 305993136 | Area: 42 | Dist: N/A
   [DONE] ID: 306053186 | Area: 90 | Dist: N/A
   [DONE] ID: 303478510 | Area: 56 | Dist: N/A
   [DONE] ID: 305998788 | Area: 90.5 | Dist: N/A
   [DONE] ID: 306053290 | Area: 38.7 | Dist: N/A
   [DONE] ID: 306041863 | Area: 84 | Dist: N/A
   [DONE] ID: 306038311 | Area: 86 | Dist: N/A
   [DONE] ID: 306043309 | Area: 31.5 | Dist: N/A
   [DONE] ID: 306043399 | Area: 60 | Dist: N/A
   [DONE] ID: 306014284 | Area: 64.8 | Dist: N/A
   [DONE] ID: 306264274 | Area: 66 | Dist: N/A
   [DONE] ID: 306016854 | Area: 29.9 | Dist: N/

In [3]:
import pandas as pd
import os
import re
import time
import sys
import json
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-quan-4-ho-chi-minh-c423?page={}"
START_PAGE = 1
END_PAGE = 20
MAX_WORKERS = 5
OUTPUT_FINAL = "../../data/raw/quan4.csv"
CRAWLED_LOG = "../../data/raw/crawled_ids4.txt"

print("üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...")
DRIVER_PATH = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1366,768")
    return webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

# ================= UTILS =================
def extract_id(link):
    m = re.search(r"/(\d{6,})$", link)
    return m.group(1) if m else None

def price_to_billion(text):
    if not text: return None
    t = text.lower().replace(",", ".")
    m = re.search(r"([\d\.]+)", t)
    if m:
        val = float(m.group(1))
        return val if "t·ª∑" in t else val / 1000
    return None

def extract_district(address):
    if not address or address == "N/A": return "N/A"
    addr = address.lower()
    if "th·ªß ƒë·ª©c" in addr: return "TP Th·ªß ƒê·ª©c"
    m = re.search(r"(qu·∫≠n\s+\d+|q\.\d+|q\s+\d+|h\.\s+[a-z√†-·ªπ\s]+|huy·ªán\s+[a-z√†-·ªπ\s]+|qu·∫≠n\s+[a-z√†-·ªπ\s]+)", addr)
    if m:
        return m.group(1).replace("q.", "Qu·∫≠n ").replace("q", "Qu·∫≠n ").replace("h.", "Huy·ªán ").strip().title()
    return "N/A"

# ================= CORE TASK =================
def crawl_detail_task(url, page_num):
    ad_id = extract_id(url)
    driver = init_driver()
    data = {
        "id": ad_id, "Page": page_num, "Title": "N/A", "Price_Raw": "N/A",
        "Price_Billion": None, "Price_per_m2": "N/A", "Area_m2": None,
        "District": "N/A", "Address": "N/A", "Bedrooms": None, "Toilets": None,
        "Post_Time": "N/A", "Link": url, "Description": "N/A"
    }

    try:
        driver.get(url)
        # 1. Ch·ªù render v√† bung n·ªôi dung ·∫©n
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "article-description")))
            driver.execute_script("""
                let btns = document.querySelectorAll('span');
                btns.forEach(b => { if(b.innerText.includes('Xem th√™m')) b.click(); });
            """)
            time.sleep(0.5)
        except: pass

        # 2. L·∫•y Address & District (Selector c·∫≠p nh·∫≠t)
        try:
            addr_el = driver.find_element(By.CSS_SELECTOR, "div.text-primary-600.line-clamp-1, div.text-fs-14.font-medium.text-primary-600")
            full_addr = driver.execute_script("return arguments[0].textContent;", addr_el).strip()
            data["Address"] = full_addr.replace("thu g·ªçn", "").strip()
            data["District"] = extract_district(data["Address"])
        except: pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 3. Ti√™u ƒë·ªÅ & Gi√° t·ªïng
        data["Title"] = soup.find("h1").get_text(strip=True) if soup.find("h1") else "N/A"
        p_tag = soup.select_one("h2.text-error-600")
        if p_tag:
            data["Price_Raw"] = p_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        # 4. X·ª¨ L√ù DI·ªÜN T√çCH (∆Øu ti√™n Tippy-content ƒë·ªÉ ch√≠nh x√°c 100%)
        # T√¨m div ch·ª©a icon di·ªán t√≠ch th√¥ng qua thu·ªôc t√≠nh data-tippy-content
        area_div = soup.find("div", attrs={"data-tippy-content": re.compile(r"Di·ªán t√≠ch", re.I)})
        if area_div:
            area_span = area_div.find("span", class_="text-fs-14")
            if area_span:
                v = re.search(r"([\d\.,]+)", area_span.get_text())
                if v: data["Area_m2"] = v.group(1).replace(",", ".")

        # Fallback: Qu√©t c√°c span n·∫øu Tippy th·∫•t b·∫°i
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text(strip=True).lower()
            if ("m2" in t or "m¬≤" in t) and "/" not in t and data["Area_m2"] is None:
                v = re.search(r"([\d\.,]+)", t)
                if v: data["Area_m2"] = v.group(1).replace(",", ".")
            elif ("m2" in t or "m¬≤" in t) and "/" in t:
                data["Price_per_m2"] = t

            if "pn" in t:
                v = re.search(r"\d+", t); data["Bedrooms"] = v.group() if v else data["Bedrooms"]
            if "wc" in t:
                v = re.search(r"\d+", t); data["Toilets"] = v.group() if v else data["Toilets"]

        # 5. M√î T·∫¢ (3 L·ªõp b·∫£o v·ªá)
        desc_text = None
        # L·ªõp 1: JSON Astro
        for astro in soup.find_all("astro-island"):
            props = astro.get("props")
            if props:
                try:
                    js = json.loads(html.unescape(props))
                    if "article" in js and js["article"].get("description"):
                        desc_text = BeautifulSoup(js["article"]["description"], "html.parser").get_text(" ", strip=True)
                        break
                except: continue

        # L·ªõp 2: Selector DOM
        if not desc_text:
            dt = soup.select_one("div.article-description div.break-words, div.article-description")
            if dt: desc_text = dt.get_text(" ", strip=True)

        # L·ªõp 3: Xpath d·ª± ph√≤ng
        if not desc_text:
            try:
                de = driver.find_element(By.XPATH, "//*[contains(text(),'M√¥ t·∫£')]/following-sibling::div")
                desc_text = de.text.strip()
            except: pass

        data["Description"] = desc_text if desc_text else "N/A"

        # 6. Qu√©t ƒë·ªông b·∫£ng thu·ªôc t√≠nh (Property)
        prop_box = soup.find("div", id="property")
        if prop_box:
            for it in prop_box.find_all("div", class_="flex items-start"):
                ss = it.find_all("span")
                if len(ss) >= 2: data[ss[0].get_text(strip=True)] = ss[1].get_text(strip=True)

        pt = soup.find("p", string=re.compile("Ng√†y ƒëƒÉng"))
        if pt: data["Post_Time"] = pt.get_text().replace("Ng√†y ƒëƒÉng:", "").strip()

    except Exception as e:
        print(f"‚ùå L·ªói t·∫°i {url}: {e}")
    finally:
        driver.quit()
        with open(CRAWLED_LOG, "a") as f: f.write(f"{ad_id}\n")

    print(f"   [DONE] ID: {ad_id} | Area: {data['Area_m2']} | Dist: {data['District']}")
    sys.stdout.flush()
    return data

# ================= RUNNER (Gi·ªØ nguy√™n) =================
def main():
    if not os.path.exists(CRAWLED_LOG): open(CRAWLED_LOG, 'w').close()
    with open(CRAWLED_LOG, 'r') as f: crawled_ids = set(line.strip() for line in f)

    all_data = []
    if os.path.exists(OUTPUT_FINAL):
        all_data = pd.read_csv(OUTPUT_FINAL).to_dict('records')

    print(f"üöÄ RESUME: ƒê√£ c√≥ {len(all_data)} tin.")

    list_driver = init_driver()
    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"\n--- üåê Qu√©t Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(2)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            new_links = []
            for a in soup.select("a[href]"):
                h = a["href"]
                if "ho-chi-minh" in h and re.search(r"/\d{6,}$", h):
                    url = "https://meeyland.com" + h
                    if extract_id(url) not in crawled_ids:
                        new_links.append((url, p))

            if new_links:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    results = list(exe.map(lambda x: crawl_detail_task(*x), new_links))
                    all_data.extend(results)
                    for r in results: crawled_ids.add(r['id'])

                df = pd.DataFrame(all_data)
                f_cols = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Price_per_m2", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
                d_cols = [c for c in df.columns if c not in f_cols]
                df[f_cols + d_cols].to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
                print(f"üìä ƒê√£ l∆∞u Page {p}. T·ªïng: {len(df)} tin.")
            else:
                print(f"   (Trang {p} ƒë√£ xong)")

    except KeyboardInterrupt:
        print("\nüõë ƒêang l∆∞u d·ªØ li·ªáu v√† tho√°t...")
    finally:
        list_driver.quit()
        if all_data:
            pd.DataFrame(all_data).to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print("üíæ Xong.")

if __name__ == "__main__":
    main()

üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...
üöÄ RESUME: ƒê√£ c√≥ 0 tin.

--- üåê Qu√©t Trang 1 ---
   [DONE] ID: 306304969 | Area: 36 | Dist: N/A
   [DONE] ID: 306316580 | Area: 45 | Dist: N/A
   [DONE] ID: 105658348 | Area: 277 | Dist: N/A
   [DONE] ID: 306031048 | Area: 22.2 | Dist: N/A
   [DONE] ID: 304760587 | Area: 60.3 | Dist: N/A
   [DONE] ID: 305927397 | Area: 45 | Dist: N/A
   [DONE] ID: 305996311 | Area: 20 | Dist: N/A
   [DONE] ID: 306060382 | Area: 84 | Dist: N/A
   [DONE] ID: 305998797 | Area: 36.6 | Dist: N/A
   [DONE] ID: 306005353 | Area: 40 | Dist: N/A
   [DONE] ID: 306022983 | Area: 64 | Dist: N/A
   [DONE] ID: 306249205 | Area: 91 | Dist: N/A
   [DONE] ID: 306069563 | Area: 46.6 | Dist: N/A
   [DONE] ID: 306062638 | Area: 40 | Dist: N/A
   [DONE] ID: 306228501 | Area: 65 | Dist: N/A
   [DONE] ID: 306049697 | Area: 31.5 | Dist: N/A
   [DONE] ID: 306049678 | Area: 84 | Dist: N/A
   [DONE] ID: 306208629 | Area: 44 | Dist: N/A
   [DONE] ID: 306030136 | Area: 22.6 | Dist:

qu·∫≠n 5

In [4]:
import pandas as pd
import os
import re
import time
import sys
import json
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-quan-5-ho-chi-minh-c423?page={}"
START_PAGE = 1
END_PAGE = 20
MAX_WORKERS = 5
OUTPUT_FINAL = "../../data/raw/quan5.csv"
CRAWLED_LOG = "../../data/raw/crawled_ids5.txt"

print("üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...")
DRIVER_PATH = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1366,768")
    return webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

# ================= UTILS =================
def extract_id(link):
    m = re.search(r"/(\d{6,})$", link)
    return m.group(1) if m else None

def price_to_billion(text):
    if not text: return None
    t = text.lower().replace(",", ".")
    m = re.search(r"([\d\.]+)", t)
    if m:
        val = float(m.group(1))
        return val if "t·ª∑" in t else val / 1000
    return None

def extract_district(address):
    if not address or address == "N/A": return "N/A"
    addr = address.lower()
    if "th·ªß ƒë·ª©c" in addr: return "TP Th·ªß ƒê·ª©c"
    m = re.search(r"(qu·∫≠n\s+\d+|q\.\d+|q\s+\d+|h\.\s+[a-z√†-·ªπ\s]+|huy·ªán\s+[a-z√†-·ªπ\s]+|qu·∫≠n\s+[a-z√†-·ªπ\s]+)", addr)
    if m:
        return m.group(1).replace("q.", "Qu·∫≠n ").replace("q", "Qu·∫≠n ").replace("h.", "Huy·ªán ").strip().title()
    return "N/A"

# ================= CORE TASK =================
def crawl_detail_task(url, page_num):
    ad_id = extract_id(url)
    driver = init_driver()
    data = {
        "id": ad_id, "Page": page_num, "Title": "N/A", "Price_Raw": "N/A",
        "Price_Billion": None, "Price_per_m2": "N/A", "Area_m2": None,
        "District": "N/A", "Address": "N/A", "Bedrooms": None, "Toilets": None,
        "Post_Time": "N/A", "Link": url, "Description": "N/A"
    }

    try:
        driver.get(url)
        # 1. Ch·ªù render v√† bung n·ªôi dung ·∫©n
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "article-description")))
            driver.execute_script("""
                let btns = document.querySelectorAll('span');
                btns.forEach(b => { if(b.innerText.includes('Xem th√™m')) b.click(); });
            """)
            time.sleep(0.5)
        except: pass

        # 2. L·∫•y Address & District (Selector c·∫≠p nh·∫≠t)
        try:
            addr_el = driver.find_element(By.CSS_SELECTOR, "div.text-primary-600.line-clamp-1, div.text-fs-14.font-medium.text-primary-600")
            full_addr = driver.execute_script("return arguments[0].textContent;", addr_el).strip()
            data["Address"] = full_addr.replace("thu g·ªçn", "").strip()
            data["District"] = extract_district(data["Address"])
        except: pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 3. Ti√™u ƒë·ªÅ & Gi√° t·ªïng
        data["Title"] = soup.find("h1").get_text(strip=True) if soup.find("h1") else "N/A"
        p_tag = soup.select_one("h2.text-error-600")
        if p_tag:
            data["Price_Raw"] = p_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        # 4. X·ª¨ L√ù DI·ªÜN T√çCH (∆Øu ti√™n Tippy-content ƒë·ªÉ ch√≠nh x√°c 100%)
        # T√¨m div ch·ª©a icon di·ªán t√≠ch th√¥ng qua thu·ªôc t√≠nh data-tippy-content
        area_div = soup.find("div", attrs={"data-tippy-content": re.compile(r"Di·ªán t√≠ch", re.I)})
        if area_div:
            area_span = area_div.find("span", class_="text-fs-14")
            if area_span:
                v = re.search(r"([\d\.,]+)", area_span.get_text())
                if v: data["Area_m2"] = v.group(1).replace(",", ".")

        # Fallback: Qu√©t c√°c span n·∫øu Tippy th·∫•t b·∫°i
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text(strip=True).lower()
            if ("m2" in t or "m¬≤" in t) and "/" not in t and data["Area_m2"] is None:
                v = re.search(r"([\d\.,]+)", t)
                if v: data["Area_m2"] = v.group(1).replace(",", ".")
            elif ("m2" in t or "m¬≤" in t) and "/" in t:
                data["Price_per_m2"] = t

            if "pn" in t:
                v = re.search(r"\d+", t); data["Bedrooms"] = v.group() if v else data["Bedrooms"]
            if "wc" in t:
                v = re.search(r"\d+", t); data["Toilets"] = v.group() if v else data["Toilets"]

        # 5. M√î T·∫¢ (3 L·ªõp b·∫£o v·ªá)
        desc_text = None
        # L·ªõp 1: JSON Astro
        for astro in soup.find_all("astro-island"):
            props = astro.get("props")
            if props:
                try:
                    js = json.loads(html.unescape(props))
                    if "article" in js and js["article"].get("description"):
                        desc_text = BeautifulSoup(js["article"]["description"], "html.parser").get_text(" ", strip=True)
                        break
                except: continue

        # L·ªõp 2: Selector DOM
        if not desc_text:
            dt = soup.select_one("div.article-description div.break-words, div.article-description")
            if dt: desc_text = dt.get_text(" ", strip=True)

        # L·ªõp 3: Xpath d·ª± ph√≤ng
        if not desc_text:
            try:
                de = driver.find_element(By.XPATH, "//*[contains(text(),'M√¥ t·∫£')]/following-sibling::div")
                desc_text = de.text.strip()
            except: pass

        data["Description"] = desc_text if desc_text else "N/A"

        # 6. Qu√©t ƒë·ªông b·∫£ng thu·ªôc t√≠nh (Property)
        prop_box = soup.find("div", id="property")
        if prop_box:
            for it in prop_box.find_all("div", class_="flex items-start"):
                ss = it.find_all("span")
                if len(ss) >= 2: data[ss[0].get_text(strip=True)] = ss[1].get_text(strip=True)

        pt = soup.find("p", string=re.compile("Ng√†y ƒëƒÉng"))
        if pt: data["Post_Time"] = pt.get_text().replace("Ng√†y ƒëƒÉng:", "").strip()

    except Exception as e:
        print(f"‚ùå L·ªói t·∫°i {url}: {e}")
    finally:
        driver.quit()
        with open(CRAWLED_LOG, "a") as f: f.write(f"{ad_id}\n")

    print(f"   [DONE] ID: {ad_id} | Area: {data['Area_m2']} | Dist: {data['District']}")
    sys.stdout.flush()
    return data

# ================= RUNNER (Gi·ªØ nguy√™n) =================
def main():
    if not os.path.exists(CRAWLED_LOG): open(CRAWLED_LOG, 'w').close()
    with open(CRAWLED_LOG, 'r') as f: crawled_ids = set(line.strip() for line in f)

    all_data = []
    if os.path.exists(OUTPUT_FINAL):
        all_data = pd.read_csv(OUTPUT_FINAL).to_dict('records')

    print(f"üöÄ RESUME: ƒê√£ c√≥ {len(all_data)} tin.")

    list_driver = init_driver()
    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"\n--- üåê Qu√©t Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(2)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            new_links = []
            for a in soup.select("a[href]"):
                h = a["href"]
                if "ho-chi-minh" in h and re.search(r"/\d{6,}$", h):
                    url = "https://meeyland.com" + h
                    if extract_id(url) not in crawled_ids:
                        new_links.append((url, p))

            if new_links:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    results = list(exe.map(lambda x: crawl_detail_task(*x), new_links))
                    all_data.extend(results)
                    for r in results: crawled_ids.add(r['id'])

                df = pd.DataFrame(all_data)
                f_cols = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Price_per_m2", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
                d_cols = [c for c in df.columns if c not in f_cols]
                df[f_cols + d_cols].to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
                print(f"üìä ƒê√£ l∆∞u Page {p}. T·ªïng: {len(df)} tin.")
            else:
                print(f"   (Trang {p} ƒë√£ xong)")

    except KeyboardInterrupt:
        print("\nüõë ƒêang l∆∞u d·ªØ li·ªáu v√† tho√°t...")
    finally:
        list_driver.quit()
        if all_data:
            pd.DataFrame(all_data).to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print("üíæ Xong.")

if __name__ == "__main__":
    main()

üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...
üöÄ RESUME: ƒê√£ c√≥ 0 tin.

--- üåê Qu√©t Trang 1 ---
   [DONE] ID: 304968743 | Area: 64 | Dist: N/A
   [DONE] ID: 305692543 | Area: 48 | Dist: N/A
   [DONE] ID: 305956515 | Area: 60 | Dist: N/A
   [DONE] ID: 306347404 | Area: 39 | Dist: N/A
   [DONE] ID: 303535973 | Area: 44 | Dist: N/A
   [DONE] ID: 306304982 | Area: 23 | Dist: N/A
   [DONE] ID: 305927385 | Area: 40 | Dist: N/A
   [DONE] ID: 306007029 | Area: 45 | Dist: N/A
   [DONE] ID: 305969829 | Area: 100 | Dist: N/A
   [DONE] ID: 306324127 | Area: 29.2 | Dist: N/A
   [DONE] ID: 306259824 | Area: 107 | Dist: N/A
   [DONE] ID: 305993437 | Area: 60 | Dist: N/A
   [DONE] ID: 306054829 | Area: 28.5 | Dist: N/A
   [DONE] ID: 306309421 | Area: 42 | Dist: N/A
   [DONE] ID: 105416876 | Area: 80 | Dist: N/A
   [DONE] ID: 304994749 | Area: 52 | Dist: N/A   [DONE] ID: 305996029 | Area: 85 | Dist: N/A

   [DONE] ID: 306038411 | Area: 57 | Dist: N/A
   [DONE] ID: 305258876 | Area: 160 | Dist: N/A
 

qu·∫≠n 6

In [5]:
import pandas as pd
import os
import re
import time
import sys
import json
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-quan-6-ho-chi-minh-c423?page={}"
START_PAGE = 1
END_PAGE = 20
MAX_WORKERS = 5
OUTPUT_FINAL = "../../data/raw/quan6.csv"
CRAWLED_LOG = "../../data/raw/crawled_ids6.txt"

print("üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...")
DRIVER_PATH = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1366,768")
    return webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

# ================= UTILS =================
def extract_id(link):
    m = re.search(r"/(\d{6,})$", link)
    return m.group(1) if m else None

def price_to_billion(text):
    if not text: return None
    t = text.lower().replace(",", ".")
    m = re.search(r"([\d\.]+)", t)
    if m:
        val = float(m.group(1))
        return val if "t·ª∑" in t else val / 1000
    return None

def extract_district(address):
    if not address or address == "N/A": return "N/A"
    addr = address.lower()
    if "th·ªß ƒë·ª©c" in addr: return "TP Th·ªß ƒê·ª©c"
    m = re.search(r"(qu·∫≠n\s+\d+|q\.\d+|q\s+\d+|h\.\s+[a-z√†-·ªπ\s]+|huy·ªán\s+[a-z√†-·ªπ\s]+|qu·∫≠n\s+[a-z√†-·ªπ\s]+)", addr)
    if m:
        return m.group(1).replace("q.", "Qu·∫≠n ").replace("q", "Qu·∫≠n ").replace("h.", "Huy·ªán ").strip().title()
    return "N/A"

# ================= CORE TASK =================
def crawl_detail_task(url, page_num):
    ad_id = extract_id(url)
    driver = init_driver()
    data = {
        "id": ad_id, "Page": page_num, "Title": "N/A", "Price_Raw": "N/A",
        "Price_Billion": None, "Price_per_m2": "N/A", "Area_m2": None,
        "District": "N/A", "Address": "N/A", "Bedrooms": None, "Toilets": None,
        "Post_Time": "N/A", "Link": url, "Description": "N/A"
    }

    try:
        driver.get(url)
        # 1. Ch·ªù render v√† bung n·ªôi dung ·∫©n
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "article-description")))
            driver.execute_script("""
                let btns = document.querySelectorAll('span');
                btns.forEach(b => { if(b.innerText.includes('Xem th√™m')) b.click(); });
            """)
            time.sleep(0.5)
        except: pass

        # 2. L·∫•y Address & District (Selector c·∫≠p nh·∫≠t)
        try:
            addr_el = driver.find_element(By.CSS_SELECTOR, "div.text-primary-600.line-clamp-1, div.text-fs-14.font-medium.text-primary-600")
            full_addr = driver.execute_script("return arguments[0].textContent;", addr_el).strip()
            data["Address"] = full_addr.replace("thu g·ªçn", "").strip()
            data["District"] = extract_district(data["Address"])
        except: pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 3. Ti√™u ƒë·ªÅ & Gi√° t·ªïng
        data["Title"] = soup.find("h1").get_text(strip=True) if soup.find("h1") else "N/A"
        p_tag = soup.select_one("h2.text-error-600")
        if p_tag:
            data["Price_Raw"] = p_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        # 4. X·ª¨ L√ù DI·ªÜN T√çCH (∆Øu ti√™n Tippy-content ƒë·ªÉ ch√≠nh x√°c 100%)
        # T√¨m div ch·ª©a icon di·ªán t√≠ch th√¥ng qua thu·ªôc t√≠nh data-tippy-content
        area_div = soup.find("div", attrs={"data-tippy-content": re.compile(r"Di·ªán t√≠ch", re.I)})
        if area_div:
            area_span = area_div.find("span", class_="text-fs-14")
            if area_span:
                v = re.search(r"([\d\.,]+)", area_span.get_text())
                if v: data["Area_m2"] = v.group(1).replace(",", ".")

        # Fallback: Qu√©t c√°c span n·∫øu Tippy th·∫•t b·∫°i
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text(strip=True).lower()
            if ("m2" in t or "m¬≤" in t) and "/" not in t and data["Area_m2"] is None:
                v = re.search(r"([\d\.,]+)", t)
                if v: data["Area_m2"] = v.group(1).replace(",", ".")
            elif ("m2" in t or "m¬≤" in t) and "/" in t:
                data["Price_per_m2"] = t

            if "pn" in t:
                v = re.search(r"\d+", t); data["Bedrooms"] = v.group() if v else data["Bedrooms"]
            if "wc" in t:
                v = re.search(r"\d+", t); data["Toilets"] = v.group() if v else data["Toilets"]

        # 5. M√î T·∫¢ (3 L·ªõp b·∫£o v·ªá)
        desc_text = None
        # L·ªõp 1: JSON Astro
        for astro in soup.find_all("astro-island"):
            props = astro.get("props")
            if props:
                try:
                    js = json.loads(html.unescape(props))
                    if "article" in js and js["article"].get("description"):
                        desc_text = BeautifulSoup(js["article"]["description"], "html.parser").get_text(" ", strip=True)
                        break
                except: continue

        # L·ªõp 2: Selector DOM
        if not desc_text:
            dt = soup.select_one("div.article-description div.break-words, div.article-description")
            if dt: desc_text = dt.get_text(" ", strip=True)

        # L·ªõp 3: Xpath d·ª± ph√≤ng
        if not desc_text:
            try:
                de = driver.find_element(By.XPATH, "//*[contains(text(),'M√¥ t·∫£')]/following-sibling::div")
                desc_text = de.text.strip()
            except: pass

        data["Description"] = desc_text if desc_text else "N/A"

        # 6. Qu√©t ƒë·ªông b·∫£ng thu·ªôc t√≠nh (Property)
        prop_box = soup.find("div", id="property")
        if prop_box:
            for it in prop_box.find_all("div", class_="flex items-start"):
                ss = it.find_all("span")
                if len(ss) >= 2: data[ss[0].get_text(strip=True)] = ss[1].get_text(strip=True)

        pt = soup.find("p", string=re.compile("Ng√†y ƒëƒÉng"))
        if pt: data["Post_Time"] = pt.get_text().replace("Ng√†y ƒëƒÉng:", "").strip()

    except Exception as e:
        print(f"‚ùå L·ªói t·∫°i {url}: {e}")
    finally:
        driver.quit()
        with open(CRAWLED_LOG, "a") as f: f.write(f"{ad_id}\n")

    print(f"   [DONE] ID: {ad_id} | Area: {data['Area_m2']} | Dist: {data['District']}")
    sys.stdout.flush()
    return data

# ================= RUNNER (Gi·ªØ nguy√™n) =================
def main():
    if not os.path.exists(CRAWLED_LOG): open(CRAWLED_LOG, 'w').close()
    with open(CRAWLED_LOG, 'r') as f: crawled_ids = set(line.strip() for line in f)

    all_data = []
    if os.path.exists(OUTPUT_FINAL):
        all_data = pd.read_csv(OUTPUT_FINAL).to_dict('records')

    print(f"üöÄ RESUME: ƒê√£ c√≥ {len(all_data)} tin.")

    list_driver = init_driver()
    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"\n--- üåê Qu√©t Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(2)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            new_links = []
            for a in soup.select("a[href]"):
                h = a["href"]
                if "ho-chi-minh" in h and re.search(r"/\d{6,}$", h):
                    url = "https://meeyland.com" + h
                    if extract_id(url) not in crawled_ids:
                        new_links.append((url, p))

            if new_links:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    results = list(exe.map(lambda x: crawl_detail_task(*x), new_links))
                    all_data.extend(results)
                    for r in results: crawled_ids.add(r['id'])

                df = pd.DataFrame(all_data)
                f_cols = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Price_per_m2", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
                d_cols = [c for c in df.columns if c not in f_cols]
                df[f_cols + d_cols].to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
                print(f"üìä ƒê√£ l∆∞u Page {p}. T·ªïng: {len(df)} tin.")
            else:
                print(f"   (Trang {p} ƒë√£ xong)")

    except KeyboardInterrupt:
        print("\nüõë ƒêang l∆∞u d·ªØ li·ªáu v√† tho√°t...")
    finally:
        list_driver.quit()
        if all_data:
            pd.DataFrame(all_data).to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print("üíæ Xong.")

if __name__ == "__main__":
    main()

üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...
üöÄ RESUME: ƒê√£ c√≥ 0 tin.

--- üåê Qu√©t Trang 1 ---
   [DONE] ID: 306329680 | Area: 256.2 | Dist: N/A
   [DONE] ID: 306309699 | Area: 12 | Dist: N/A
   [DONE] ID: 306309407 | Area: 40.8 | Dist: N/A
   [DONE] ID: 306347386 | Area: 50 | Dist: N/A
   [DONE] ID: 104445264 | Area: 220 | Dist: N/A
   [DONE] ID: 303492018 | Area: 60 | Dist: N/A
   [DONE] ID: 303442078 | Area: 1.200 | Dist: N/A
   [DONE] ID: 305927526 | Area: 40 | Dist: N/A
   [DONE] ID: 301706125 | Area: 160 | Dist: N/A
   [DONE] ID: 303536161 | Area: 60 | Dist: N/A
   [DONE] ID: 105595772 | Area: 48 | Dist: N/A
   [DONE] ID: 305983744 | Area: 28.8 | Dist: N/A
   [DONE] ID: 105338934 | Area: 90 | Dist: N/A
   [DONE] ID: 305971008 | Area: 100.8 | Dist: N/A   [DONE] ID: 105338972 | Area: 21 | Dist: N/A

   [DONE] ID: 303352048 | Area: 68 | Dist: N/A
   [DONE] ID: 305965252 | Area: 39 | Dist: N/A
   [DONE] ID: 306035970 | Area: 59.4 | Dist: N/A
   [DONE] ID: 306304971 | Area: 49.7 |

c·∫ßn gi·ªù

In [6]:
import pandas as pd
import os
import re
import time
import sys
import json
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-can-gio-ho-chi-minh-c423?page={}"
START_PAGE = 1
END_PAGE = 20
MAX_WORKERS = 5
OUTPUT_FINAL = "../../data/raw/quancangio.csv"
CRAWLED_LOG = "../../data/raw/crawled_idscangio.txt"

print("üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...")
DRIVER_PATH = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1366,768")
    return webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

# ================= UTILS =================
def extract_id(link):
    m = re.search(r"/(\d{6,})$", link)
    return m.group(1) if m else None

def price_to_billion(text):
    if not text: return None
    t = text.lower().replace(",", ".")
    m = re.search(r"([\d\.]+)", t)
    if m:
        val = float(m.group(1))
        return val if "t·ª∑" in t else val / 1000
    return None

def extract_district(address):
    if not address or address == "N/A": return "N/A"
    addr = address.lower()
    if "th·ªß ƒë·ª©c" in addr: return "TP Th·ªß ƒê·ª©c"
    m = re.search(r"(qu·∫≠n\s+\d+|q\.\d+|q\s+\d+|h\.\s+[a-z√†-·ªπ\s]+|huy·ªán\s+[a-z√†-·ªπ\s]+|qu·∫≠n\s+[a-z√†-·ªπ\s]+)", addr)
    if m:
        return m.group(1).replace("q.", "Qu·∫≠n ").replace("q", "Qu·∫≠n ").replace("h.", "Huy·ªán ").strip().title()
    return "N/A"

# ================= CORE TASK =================
def crawl_detail_task(url, page_num):
    ad_id = extract_id(url)
    driver = init_driver()
    data = {
        "id": ad_id, "Page": page_num, "Title": "N/A", "Price_Raw": "N/A",
        "Price_Billion": None, "Price_per_m2": "N/A", "Area_m2": None,
        "District": "N/A", "Address": "N/A", "Bedrooms": None, "Toilets": None,
        "Post_Time": "N/A", "Link": url, "Description": "N/A"
    }

    try:
        driver.get(url)
        # 1. Ch·ªù render v√† bung n·ªôi dung ·∫©n
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "article-description")))
            driver.execute_script("""
                let btns = document.querySelectorAll('span');
                btns.forEach(b => { if(b.innerText.includes('Xem th√™m')) b.click(); });
            """)
            time.sleep(0.5)
        except: pass

        # 2. L·∫•y Address & District (Selector c·∫≠p nh·∫≠t)
        try:
            addr_el = driver.find_element(By.CSS_SELECTOR, "div.text-primary-600.line-clamp-1, div.text-fs-14.font-medium.text-primary-600")
            full_addr = driver.execute_script("return arguments[0].textContent;", addr_el).strip()
            data["Address"] = full_addr.replace("thu g·ªçn", "").strip()
            data["District"] = extract_district(data["Address"])
        except: pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 3. Ti√™u ƒë·ªÅ & Gi√° t·ªïng
        data["Title"] = soup.find("h1").get_text(strip=True) if soup.find("h1") else "N/A"
        p_tag = soup.select_one("h2.text-error-600")
        if p_tag:
            data["Price_Raw"] = p_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        # 4. X·ª¨ L√ù DI·ªÜN T√çCH (∆Øu ti√™n Tippy-content ƒë·ªÉ ch√≠nh x√°c 100%)
        # T√¨m div ch·ª©a icon di·ªán t√≠ch th√¥ng qua thu·ªôc t√≠nh data-tippy-content
        area_div = soup.find("div", attrs={"data-tippy-content": re.compile(r"Di·ªán t√≠ch", re.I)})
        if area_div:
            area_span = area_div.find("span", class_="text-fs-14")
            if area_span:
                v = re.search(r"([\d\.,]+)", area_span.get_text())
                if v: data["Area_m2"] = v.group(1).replace(",", ".")

        # Fallback: Qu√©t c√°c span n·∫øu Tippy th·∫•t b·∫°i
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text(strip=True).lower()
            if ("m2" in t or "m¬≤" in t) and "/" not in t and data["Area_m2"] is None:
                v = re.search(r"([\d\.,]+)", t)
                if v: data["Area_m2"] = v.group(1).replace(",", ".")
            elif ("m2" in t or "m¬≤" in t) and "/" in t:
                data["Price_per_m2"] = t

            if "pn" in t:
                v = re.search(r"\d+", t); data["Bedrooms"] = v.group() if v else data["Bedrooms"]
            if "wc" in t:
                v = re.search(r"\d+", t); data["Toilets"] = v.group() if v else data["Toilets"]

        # 5. M√î T·∫¢ (3 L·ªõp b·∫£o v·ªá)
        desc_text = None
        # L·ªõp 1: JSON Astro
        for astro in soup.find_all("astro-island"):
            props = astro.get("props")
            if props:
                try:
                    js = json.loads(html.unescape(props))
                    if "article" in js and js["article"].get("description"):
                        desc_text = BeautifulSoup(js["article"]["description"], "html.parser").get_text(" ", strip=True)
                        break
                except: continue

        # L·ªõp 2: Selector DOM
        if not desc_text:
            dt = soup.select_one("div.article-description div.break-words, div.article-description")
            if dt: desc_text = dt.get_text(" ", strip=True)

        # L·ªõp 3: Xpath d·ª± ph√≤ng
        if not desc_text:
            try:
                de = driver.find_element(By.XPATH, "//*[contains(text(),'M√¥ t·∫£')]/following-sibling::div")
                desc_text = de.text.strip()
            except: pass

        data["Description"] = desc_text if desc_text else "N/A"

        # 6. Qu√©t ƒë·ªông b·∫£ng thu·ªôc t√≠nh (Property)
        prop_box = soup.find("div", id="property")
        if prop_box:
            for it in prop_box.find_all("div", class_="flex items-start"):
                ss = it.find_all("span")
                if len(ss) >= 2: data[ss[0].get_text(strip=True)] = ss[1].get_text(strip=True)

        pt = soup.find("p", string=re.compile("Ng√†y ƒëƒÉng"))
        if pt: data["Post_Time"] = pt.get_text().replace("Ng√†y ƒëƒÉng:", "").strip()

    except Exception as e:
        print(f"‚ùå L·ªói t·∫°i {url}: {e}")
    finally:
        driver.quit()
        with open(CRAWLED_LOG, "a") as f: f.write(f"{ad_id}\n")

    print(f"   [DONE] ID: {ad_id} | Area: {data['Area_m2']} | Dist: {data['District']}")
    sys.stdout.flush()
    return data

# ================= RUNNER (Gi·ªØ nguy√™n) =================
def main():
    if not os.path.exists(CRAWLED_LOG): open(CRAWLED_LOG, 'w').close()
    with open(CRAWLED_LOG, 'r') as f: crawled_ids = set(line.strip() for line in f)

    all_data = []
    if os.path.exists(OUTPUT_FINAL):
        all_data = pd.read_csv(OUTPUT_FINAL).to_dict('records')

    print(f"üöÄ RESUME: ƒê√£ c√≥ {len(all_data)} tin.")

    list_driver = init_driver()
    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"\n--- üåê Qu√©t Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(2)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            new_links = []
            for a in soup.select("a[href]"):
                h = a["href"]
                if "ho-chi-minh" in h and re.search(r"/\d{6,}$", h):
                    url = "https://meeyland.com" + h
                    if extract_id(url) not in crawled_ids:
                        new_links.append((url, p))

            if new_links:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    results = list(exe.map(lambda x: crawl_detail_task(*x), new_links))
                    all_data.extend(results)
                    for r in results: crawled_ids.add(r['id'])

                df = pd.DataFrame(all_data)
                f_cols = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Price_per_m2", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
                d_cols = [c for c in df.columns if c not in f_cols]
                df[f_cols + d_cols].to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
                print(f"üìä ƒê√£ l∆∞u Page {p}. T·ªïng: {len(df)} tin.")
            else:
                print(f"   (Trang {p} ƒë√£ xong)")

    except KeyboardInterrupt:
        print("\nüõë ƒêang l∆∞u d·ªØ li·ªáu v√† tho√°t...")
    finally:
        list_driver.quit()
        if all_data:
            pd.DataFrame(all_data).to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print("üíæ Xong.")

if __name__ == "__main__":
    main()

üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...
üöÄ RESUME: ƒê√£ c√≥ 0 tin.

--- üåê Qu√©t Trang 1 ---
   [DONE] ID: 305422881 | Area: 300 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306066720 | Area: 180 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 305610113 | Area: 112 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306042302 | Area: 520 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306145808 | Area: 809.1 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306071081 | Area: 205 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 305422895 | Area: 200 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306348376 | Area: None | Dist: N/A
   [DONE] ID: 306348110 | Area: None | Dist: N/A
   [DONE] ID: 306334878 | Area: 1.098 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306347488 | Area: None | Dist: N/A
   [DONE] ID: 306344463 | Area: 1.000 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306346084 | Area: 977 | Dist: Huy·ªán  C·∫ßn Gi·ªù
   [DONE] ID: 306339227 | Area: None | Dist: N/A
   [DONE] ID: 306337500 | Area: None | Dist: N/A


c·ªß chi

In [7]:
import pandas as pd
import os
import re
import time
import sys
import json
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

# ================= CONFIG =================
BASE_LIST_URL = "https://meeyland.com/mua-ban-nha-dat-cu-chi-ho-chi-minh-c423?page={}"
START_PAGE = 1
END_PAGE = 20
MAX_WORKERS = 5
OUTPUT_FINAL = "../../data/raw/quancuchi.csv"
CRAWLED_LOG = "../../data/raw/crawled_idscuchi.txt"

print("üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...")
DRIVER_PATH = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1366,768")
    return webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

# ================= UTILS =================
def extract_id(link):
    m = re.search(r"/(\d{6,})$", link)
    return m.group(1) if m else None

def price_to_billion(text):
    if not text: return None
    t = text.lower().replace(",", ".")
    m = re.search(r"([\d\.]+)", t)
    if m:
        val = float(m.group(1))
        return val if "t·ª∑" in t else val / 1000
    return None

def extract_district(address):
    if not address or address == "N/A": return "N/A"
    addr = address.lower()
    if "th·ªß ƒë·ª©c" in addr: return "TP Th·ªß ƒê·ª©c"
    m = re.search(r"(qu·∫≠n\s+\d+|q\.\d+|q\s+\d+|h\.\s+[a-z√†-·ªπ\s]+|huy·ªán\s+[a-z√†-·ªπ\s]+|qu·∫≠n\s+[a-z√†-·ªπ\s]+)", addr)
    if m:
        return m.group(1).replace("q.", "Qu·∫≠n ").replace("q", "Qu·∫≠n ").replace("h.", "Huy·ªán ").strip().title()
    return "N/A"

# ================= CORE TASK =================
def crawl_detail_task(url, page_num):
    ad_id = extract_id(url)
    driver = init_driver()
    data = {
        "id": ad_id, "Page": page_num, "Title": "N/A", "Price_Raw": "N/A",
        "Price_Billion": None, "Price_per_m2": "N/A", "Area_m2": None,
        "District": "N/A", "Address": "N/A", "Bedrooms": None, "Toilets": None,
        "Post_Time": "N/A", "Link": url, "Description": "N/A"
    }

    try:
        driver.get(url)
        # 1. Ch·ªù render v√† bung n·ªôi dung ·∫©n
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "article-description")))
            driver.execute_script("""
                let btns = document.querySelectorAll('span');
                btns.forEach(b => { if(b.innerText.includes('Xem th√™m')) b.click(); });
            """)
            time.sleep(0.5)
        except: pass

        # 2. L·∫•y Address & District (Selector c·∫≠p nh·∫≠t)
        try:
            addr_el = driver.find_element(By.CSS_SELECTOR, "div.text-primary-600.line-clamp-1, div.text-fs-14.font-medium.text-primary-600")
            full_addr = driver.execute_script("return arguments[0].textContent;", addr_el).strip()
            data["Address"] = full_addr.replace("thu g·ªçn", "").strip()
            data["District"] = extract_district(data["Address"])
        except: pass

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 3. Ti√™u ƒë·ªÅ & Gi√° t·ªïng
        data["Title"] = soup.find("h1").get_text(strip=True) if soup.find("h1") else "N/A"
        p_tag = soup.select_one("h2.text-error-600")
        if p_tag:
            data["Price_Raw"] = p_tag.get_text(strip=True)
            data["Price_Billion"] = price_to_billion(data["Price_Raw"])

        # 4. X·ª¨ L√ù DI·ªÜN T√çCH (∆Øu ti√™n Tippy-content ƒë·ªÉ ch√≠nh x√°c 100%)
        # T√¨m div ch·ª©a icon di·ªán t√≠ch th√¥ng qua thu·ªôc t√≠nh data-tippy-content
        area_div = soup.find("div", attrs={"data-tippy-content": re.compile(r"Di·ªán t√≠ch", re.I)})
        if area_div:
            area_span = area_div.find("span", class_="text-fs-14")
            if area_span:
                v = re.search(r"([\d\.,]+)", area_span.get_text())
                if v: data["Area_m2"] = v.group(1).replace(",", ".")

        # Fallback: Qu√©t c√°c span n·∫øu Tippy th·∫•t b·∫°i
        for sp in soup.select("span.text-fs-14"):
            t = sp.get_text(strip=True).lower()
            if ("m2" in t or "m¬≤" in t) and "/" not in t and data["Area_m2"] is None:
                v = re.search(r"([\d\.,]+)", t)
                if v: data["Area_m2"] = v.group(1).replace(",", ".")
            elif ("m2" in t or "m¬≤" in t) and "/" in t:
                data["Price_per_m2"] = t

            if "pn" in t:
                v = re.search(r"\d+", t); data["Bedrooms"] = v.group() if v else data["Bedrooms"]
            if "wc" in t:
                v = re.search(r"\d+", t); data["Toilets"] = v.group() if v else data["Toilets"]

        # 5. M√î T·∫¢ (3 L·ªõp b·∫£o v·ªá)
        desc_text = None
        # L·ªõp 1: JSON Astro
        for astro in soup.find_all("astro-island"):
            props = astro.get("props")
            if props:
                try:
                    js = json.loads(html.unescape(props))
                    if "article" in js and js["article"].get("description"):
                        desc_text = BeautifulSoup(js["article"]["description"], "html.parser").get_text(" ", strip=True)
                        break
                except: continue

        # L·ªõp 2: Selector DOM
        if not desc_text:
            dt = soup.select_one("div.article-description div.break-words, div.article-description")
            if dt: desc_text = dt.get_text(" ", strip=True)

        # L·ªõp 3: Xpath d·ª± ph√≤ng
        if not desc_text:
            try:
                de = driver.find_element(By.XPATH, "//*[contains(text(),'M√¥ t·∫£')]/following-sibling::div")
                desc_text = de.text.strip()
            except: pass

        data["Description"] = desc_text if desc_text else "N/A"

        # 6. Qu√©t ƒë·ªông b·∫£ng thu·ªôc t√≠nh (Property)
        prop_box = soup.find("div", id="property")
        if prop_box:
            for it in prop_box.find_all("div", class_="flex items-start"):
                ss = it.find_all("span")
                if len(ss) >= 2: data[ss[0].get_text(strip=True)] = ss[1].get_text(strip=True)

        pt = soup.find("p", string=re.compile("Ng√†y ƒëƒÉng"))
        if pt: data["Post_Time"] = pt.get_text().replace("Ng√†y ƒëƒÉng:", "").strip()

    except Exception as e:
        print(f"‚ùå L·ªói t·∫°i {url}: {e}")
    finally:
        driver.quit()
        with open(CRAWLED_LOG, "a") as f: f.write(f"{ad_id}\n")

    print(f"   [DONE] ID: {ad_id} | Area: {data['Area_m2']} | Dist: {data['District']}")
    sys.stdout.flush()
    return data

# ================= RUNNER (Gi·ªØ nguy√™n) =================
def main():
    if not os.path.exists(CRAWLED_LOG): open(CRAWLED_LOG, 'w').close()
    with open(CRAWLED_LOG, 'r') as f: crawled_ids = set(line.strip() for line in f)

    all_data = []
    if os.path.exists(OUTPUT_FINAL):
        all_data = pd.read_csv(OUTPUT_FINAL).to_dict('records')

    print(f"üöÄ RESUME: ƒê√£ c√≥ {len(all_data)} tin.")

    list_driver = init_driver()
    try:
        for p in range(START_PAGE, END_PAGE + 1):
            print(f"\n--- üåê Qu√©t Trang {p} ---")
            list_driver.get(BASE_LIST_URL.format(p))
            time.sleep(2)
            soup = BeautifulSoup(list_driver.page_source, "html.parser")

            new_links = []
            for a in soup.select("a[href]"):
                h = a["href"]
                if "ho-chi-minh" in h and re.search(r"/\d{6,}$", h):
                    url = "https://meeyland.com" + h
                    if extract_id(url) not in crawled_ids:
                        new_links.append((url, p))

            if new_links:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as exe:
                    results = list(exe.map(lambda x: crawl_detail_task(*x), new_links))
                    all_data.extend(results)
                    for r in results: crawled_ids.add(r['id'])

                df = pd.DataFrame(all_data)
                f_cols = ["id", "Page", "Title", "Price_Raw", "Price_Billion", "Price_per_m2", "Area_m2", "District", "Address", "Bedrooms", "Toilets", "Post_Time", "Link", "Description"]
                d_cols = [c for c in df.columns if c not in f_cols]
                df[f_cols + d_cols].to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
                print(f"üìä ƒê√£ l∆∞u Page {p}. T·ªïng: {len(df)} tin.")
            else:
                print(f"   (Trang {p} ƒë√£ xong)")

    except KeyboardInterrupt:
        print("\nüõë ƒêang l∆∞u d·ªØ li·ªáu v√† tho√°t...")
    finally:
        list_driver.quit()
        if all_data:
            pd.DataFrame(all_data).to_csv(OUTPUT_FINAL, index=False, encoding="utf-8-sig")
        print("üíæ Xong.")

if __name__ == "__main__":
    main()

üîç ƒêang kh·ªüi ƒë·ªông h·ªá th·ªëng...
üöÄ RESUME: ƒê√£ c√≥ 0 tin.

--- üåê Qu√©t Trang 1 ---
   [DONE] ID: 105183447 | Area: 133 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 303590276 | Area: 167 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305993439 | Area: 401.2 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 105157492 | Area: 108 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 306054444 | Area: 404.8 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305974493 | Area: 203.45 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 306028327 | Area: 1.500 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 105241637 | Area: 133 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305970963 | Area: 757 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305974499 | Area: 1.430 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305970827 | Area: 94.5 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305993756 | Area: 812 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305980972 | Area: 997 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 305993737 | Area: 1.581 | Dist: Huy·ªán  C·ªß Chi
   [DONE] ID: 30599