In [26]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException, TimeoutException
from datetime import datetime, timedelta
import time, random, csv, os, re, tempfile, shutil, unicodedata
import random as rnd

# ===== Scraper configuration =====
# Persistent Chrome profile directory, used as --user-data-dir when no
# temporary profile is requested.
# NOTE(review): this is a raw string, so each "\\" stays as two literal
# backslashes in the path - confirm that is intended.
PROFILE_DIR   = r"C:\\selenium_profiles\\meeyland_profile_hcm"
# Listing URL template; "{page}" is filled in with the page number.
BASE          = "https://meeyland.com/mua-ban-nha-dat-ho-chi-minh-b43?page={page}"
# CSV file that scraped rows are appended to.
OUTPUT_FILE   = "meeyland_raw.csv"
# Text file holding the page number used to resume a run.
CHECKPOINT    = "meeyland_checkpoint.txt"
# Seconds handed to WebDriverWait when waiting for listing cards.
WAIT_TIMEOUT  = 20
# Highest page number the main loop will request.
SAFETY_CAP    = 20000

# When True, Chrome is started with --headless=new and a fixed window size.
HEADLESS      = False
# Chrome major version passed to uc.Chrome(version_main=...).
CHROME_MAJOR  = 141
# When True, each run uses a throwaway profile directory instead of PROFILE_DIR.
USE_TEMP_PROFILE = True
# =====================

# Runs at import time, regardless of USE_TEMP_PROFILE.
os.makedirs(PROFILE_DIR, exist_ok=True)

def _make_options(tmp_profile_dir: str | None):
    """Build a ``uc.ChromeOptions`` object for one browser launch.

    Args:
        tmp_profile_dir: Directory to use as Chrome's user-data-dir. When
            falsy, the persistent ``PROFILE_DIR`` is used instead.

    Returns:
        The configured options object.
    """
    profile_dir = tmp_profile_dir if tmp_profile_dir else PROFILE_DIR
    flags = [
        f"--user-data-dir={profile_dir}",
        "--disable-blink-features=AutomationControlled",
        "--start-maximized",
        "--disable-dev-shm-usage",
        "--no-sandbox",
    ]
    if HEADLESS:
        # Headless runs also get an explicit window size.
        flags += ["--headless=new", "--window-size=1366,900"]

    chrome_options = uc.ChromeOptions()
    for flag in flags:
        chrome_options.add_argument(flag)
    return chrome_options

def init_driver():
    """Start an undetected-chromedriver Chrome session, retrying up to 3 times.

    When ``USE_TEMP_PROFILE`` is set, a throwaway profile directory is created
    and stored on the driver as ``_tmp_profile_dir`` so that ``safe_quit`` can
    remove it afterwards. If start-up fails for any reason, that directory is
    deleted before the error propagates.

    Returns:
        The started driver.

    Raises:
        WebDriverException: The last start-up error once every attempt failed.
    """
    max_attempts = 3
    tmp_profile = tempfile.mkdtemp(prefix="meey_uc_") if USE_TEMP_PROFILE else None
    last_err = None
    try:
        for attempt in range(1, max_attempts + 1):
            # Build a fresh options object for every attempt:
            # undetected_chromedriver rejects an options object that was
            # already handed to an earlier Chrome() call, so reusing one
            # would make every retry fail.
            opts = _make_options(tmp_profile)
            try:
                drv = uc.Chrome(options=opts, version_main=CHROME_MAJOR, use_subprocess=True, suppress_welcome=True)
            except WebDriverException as e:
                print(f"L·ªói kh·ªüi t·∫°o driver l·∫ßn {attempt}, th·ª≠ l·∫°i...")
                last_err = e
                if attempt < max_attempts:
                    # Linear back-off; pointless after the final attempt.
                    time.sleep(2 * attempt)
                continue
            drv._tmp_profile_dir = tmp_profile
            return drv
        raise last_err
    except BaseException:
        # Do not leave the throwaway profile behind when start-up fails.
        if tmp_profile:
            shutil.rmtree(tmp_profile, ignore_errors=True)
        raise

def safe_quit(driver):
    """Quit the browser and remove its temporary profile directory.

    Errors raised by ``driver.quit()`` are ignored (best effort). The
    directory recorded in ``driver._tmp_profile_dir``, if any, is deleted
    even when quitting fails.

    Args:
        driver: Object exposing ``quit()`` and, optionally, a
            ``_tmp_profile_dir`` attribute.
    """
    try:
        driver.quit()
    except Exception:
        # Best effort: the browser may already be gone. Exception (rather
        # than a bare except) lets Ctrl+C / SystemExit through.
        pass
    finally:
        tmp = getattr(driver, "_tmp_profile_dir", None)
        if tmp and os.path.isdir(tmp):
            shutil.rmtree(tmp, ignore_errors=True)

def read_checkpoint() -> int:
    if not os.path.exists(CHECKPOINT): return 1
    try:
        with open(CHECKPOINT, "r", encoding="utf-8") as f:
            return max(1, int(f.read().strip()))
    except:
        return 1

def write_checkpoint(page: int):
    with open(CHECKPOINT, "w", encoding="utf-8") as f:
        f.write(str(page))

def ensure_csv_header(path: str):
    """Write the CSV header row to ``path`` unless the file already has content.

    A missing or zero-byte file is (re)created with the header as its only
    row, encoded as UTF-8 with BOM. A non-empty file is left untouched.
    """
    already_populated = os.path.exists(path) and os.path.getsize(path) != 0
    if already_populated:
        return

    header = [
        "Ti√™u ƒë·ªÅ",
        "Gi√°",
        "Di·ªán t√≠ch",
        "Gi√°/m2",
        "S·ªë ph√≤ng ng·ªß",
        "S·ªë ph√≤ng t·∫Øm",
        "ƒê·ªãa ch·ªâ",
        "Ng√†y ƒëƒÉng",
    ]
    with open(path, "w", encoding="utf-8-sig", newline="") as out:
        csv.writer(out).writerow(header)

def text_or_na(ele, by, sel):
    """Return the stripped text of the first child matching ``(by, sel)``.

    Args:
        ele: Object exposing ``find_element(by, sel)``.
        by: Locator strategy passed through to ``find_element``.
        sel: Selector passed through to ``find_element``.

    Returns:
        The element's stripped text, or "N/A" when the lookup fails or the
        text is empty.
    """
    try:
        text = ele.find_element(by, sel).text.strip()
    except Exception:
        # Missing or stale element: report "N/A". Exception (not a bare
        # except) keeps Ctrl+C working while the scraper is running.
        return "N/A"
    return text or "N/A"

def vn_lower(s):
    """Return ``s`` NFC-normalized and lowercased; falsy input gives ""."""
    text = s if s else ""
    normalized = unicodedata.normalize("NFC", text)
    return normalized.lower()

def clean_spaces(s):
    """Trim ``s`` and collapse each whitespace run into one space.

    Falsy input gives "".
    """
    # split() with no argument drops leading/trailing whitespace and splits
    # on whitespace runs, so re-joining with one space collapses them.
    words = (s or "").split()
    return " ".join(words)

# ===== Commonly used regexes =====
# All four patterns are compiled case-insensitively (re.I).
# Area: a number (digits with optional "." / "," separated groups) followed
# by the square-metre unit; group 1 is the number.
RE_AREA      = re.compile(r"\b(\d+(?:[\.,]\d+)*)\s*m¬≤\b", re.I)
# Bedrooms: digits followed by "pn" or the bedroom phrase; group 1 is the count.
RE_PN        = re.compile(r"(\d+)\s*(?:pn|ph√≤ng ng·ªß)\b", re.I)
# Bathrooms: digits followed by "wc", the bathroom phrase, or "toilet";
# group 1 is the count.
RE_WC        = re.compile(r"(\d+)\s*(?:wc|ph√≤ng t·∫Øm|toilet)\b", re.I)
# Price per square metre: digits/separators followed by the "tr/m2"-style
# unit; group 1 is the whole match.
RE_GIAm2     = re.compile(r"([\d.,]+\s*tr/m¬≤)", re.I)

# ===== Normalize the posted date =====
def parse_date_label(label):
    """Normalize a posted-date label to ``dd/mm/YYYY``.

    Recognized forms, matched on the NFC-normalized, lowercased label:
      - the "today" and "yesterday" labels;
      - "<N> <unit> ago", where the unit is seconds, minutes, hours, days,
        weeks, or months (a month is approximated as 30 days).

    Returns:
        "N/A" for empty input, the formatted date for a recognized label,
        and the stripped label unchanged when nothing matches.
    """
    text = (label or "").strip()
    if not text:
        return "N/A"

    date_format = "%d/%m/%Y"
    lowered = unicodedata.normalize("NFC", text).lower()
    now = datetime.now()

    if "h√¥m nay" in lowered:
        return now.strftime(date_format)
    if "h√¥m qua" in lowered:
        return (now - timedelta(days=1)).strftime(date_format)

    match = re.search(r"(\d+)\s+(gi√¢y|ph√∫t|ti·∫øng|gi·ªù|ng√†y|tu·∫ßn|th√°ng)\s+tr∆∞·ªõc", lowered)
    if match is None:
        # No known pattern: hand the label back as-is.
        return text

    # Unit word -> (timedelta keyword, multiplier applied to the count).
    unit_table = {
        "gi√¢y": ("seconds", 1),
        "ph√∫t": ("minutes", 1),
        "ti·∫øng": ("hours", 1),
        "gi·ªù": ("hours", 1),
        "ng√†y": ("days", 1),
        "tu·∫ßn": ("weeks", 1),
        "th√°ng": ("days", 30),  # a month is treated as ~30 days
    }
    amount = int(match.group(1))
    keyword, factor = unit_table[match.group(2)]
    moment = now - timedelta(**{keyword: factor * amount})
    return moment.strftime(date_format)

# ====== Listing-card lookup ======
# CSS selector that get_cards() waits for and collects.
CARD_SELECTOR = "article.relative"

def get_cards(driver, wait):
    """Return the listing-card elements found on the current page.

    Waits until at least one element matching ``CARD_SELECTOR`` is present
    and returns ``[]`` if the wait times out. Otherwise scrolls to the
    bottom of the page three times, pausing one second after each scroll
    (presumably so lazily loaded cards can render - TODO confirm), and
    returns every matching element.
    """
    locator = (By.CSS_SELECTOR, CARD_SELECTOR)
    try:
        wait.until(EC.presence_of_element_located(locator))
    except TimeoutException:
        return []

    remaining_scrolls = 3
    while remaining_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        remaining_scrolls -= 1
    return driver.find_elements(*locator)

# ====== Field extractors ======
def get_title(card):
    """Return the listing title from a card, or "N/A".

    The title lives in an ``<h3>`` carrying a line-clamp or typography
    class. Each XPath is tried in order and the first non-empty text wins,
    with whitespace collapsed.
    """
    title_xpaths = (
        ".//h3[contains(@class,'line-clamp')]",
        ".//h3[contains(@class,'meey-sub') or contains(@class,'font-medium') or contains(@class,'text-fs-14')]",
    )
    for xp in title_xpaths:
        try:
            node = card.find_element(By.XPATH, xp)
            # textContent first: it is populated even when the visible text
            # is clipped; fall back to the rendered text.
            title = (node.get_attribute("textContent") or node.text or "").strip()
        except Exception:
            # Selector missed or element went stale: try the next XPath.
            # Exception (not a bare except) keeps Ctrl+C working.
            continue
        if title:
            return clean_spaces(title)
    return "N/A"

def _info_row(card):
    """Locate the card's info row (price per area, area, bedrooms, bathrooms).

    No exact class name is relied on. A candidate is any ``div`` whose
    class contains both ``flex`` and ``flex-wrap``; the first candidate
    whose text contains the square-metre marker is returned. When none
    qualifies, the card itself is returned so regex fallbacks can still
    run on its text.
    """
    candidates = card.find_elements(
        By.XPATH,
        ".//div[contains(@class,'flex') and contains(@class,'flex-wrap')]"
    )
    markers = ("m¬≤", "tr/m¬≤")
    for candidate in candidates:
        row_text = (candidate.text or "").strip()
        # The info row always shows an area or a price-per-area value.
        if any(marker in row_text for marker in markers):
            return candidate
    return card

def get_price(card):
    """Return the listing price text with whitespace collapsed, or "N/A".

    The price class varies with the site theme, so the primary selector is
    tried first and a group of alternative classes second.
    """
    selectors = (
        "div.text-error-600",
        ".text-error, .text-danger, .text-red-600, .text-primary-600",
    )
    for css in selectors:
        price = text_or_na(card, By.CSS_SELECTOR, css)
        if price != "N/A":
            break
    return clean_spaces(price)

def get_price_per_m2(card):
    """Return the price-per-square-metre text of a card, or "N/A".

    Three strategies are tried against the card's info row, in order:
      1. the first element whose text contains the unit marker;
      2. elements carrying one of the small-text classes, keeping the
         first whose text contains the marker;
      3. the shared ``RE_GIAm2`` regex applied to the row's text.
    """
    row = _info_row(card)

    # 1) Element inside the row whose text contains the unit marker.
    try:
        node = row.find_element(By.XPATH, ".//*[contains(normalize-space(.),'tr/m¬≤')]")
        val = (node.text or "").strip()
        if val:
            return val
    except Exception:
        # Not found or stale: fall through. Exception (not a bare except)
        # keeps Ctrl+C working.
        pass

    # 2) Small-text classes commonly used for this value.
    for css in [".meey-caption-10r", ".text-fs-12", ".text-secondary-600", ".qmd\\:text-fs-12"]:
        try:
            for node in row.find_elements(By.CSS_SELECTOR, css):
                text = (node.text or "").strip()
                if "tr/m¬≤" in text:
                    return text
        except Exception:
            continue

    # 3) Regex fallback, using the module-level compiled pattern instead of
    #    an inline copy of it.
    m = RE_GIAm2.search(row.text or "")
    return m.group(1) if m else "N/A"

def get_area(card):
    """Return the listing area, or "N/A".

    The tooltip-tagged element is preferred when it still exists.
    Otherwise ``RE_AREA`` is applied to the card's text and the captured
    number is returned with the unit appended.
    """
    area = text_or_na(card, By.CSS_SELECTOR, 'div[data-tippy-content="Di·ªán t√≠ch"] span')
    if area != "N/A":
        return area

    found = RE_AREA.search(card.text)
    if found is None:
        return "N/A"
    return found.group(1) + " m¬≤"

def get_bedrooms(card):
    """Return the bedroom count shown on a card, or "N/A".

    Strategies, in order:
      1. the ``span`` under the bedroom tooltip, preferring its ``title``
         attribute over its text;
      2. the node right after the bed icon;
      3. the shared ``RE_PN`` regex applied to the whole card text.
    """
    lookups = (
        (By.CSS_SELECTOR, '[data-tippy-content*="S·ªë ph√≤ng ng·ªß"] span'),
        (By.XPATH, ".//i[contains(@class,'bed')]/following-sibling::*[1]"),
    )
    for by, selector in lookups:
        try:
            node = card.find_element(by, selector)
            val = (node.get_attribute("title") or node.text or "").strip()
        except Exception:
            # Not found or stale: try the next strategy. Exception (not a
            # bare except) keeps Ctrl+C working.
            continue
        if val:
            return val

    # Regex fallback, using the module-level compiled pattern instead of an
    # inline copy of it.
    m = RE_PN.search(card.text or "")
    return m.group(1) if m else "N/A"


def get_bathrooms(card):
    """Return the bathroom count shown on a card, or "N/A".

    Strategies, in order:
      1. the ``span`` under the bathroom tooltip, preferring its ``title``
         attribute over its text;
      2. the node right after the bathtub icon;
      3. the shared ``RE_WC`` regex applied to the whole card text.
    """
    lookups = (
        (By.CSS_SELECTOR, '[data-tippy-content*="S·ªë ph√≤ng t·∫Øm"] span'),
        (By.XPATH, ".//i[contains(@class,'bathtub') or contains(@class,'bath')]/following-sibling::*[1]"),
    )
    for by, selector in lookups:
        try:
            node = card.find_element(by, selector)
            val = (node.get_attribute("title") or node.text or "").strip()
        except Exception:
            # Not found or stale: try the next strategy. Exception (not a
            # bare except) keeps Ctrl+C working.
            continue
        if val:
            return val

    # Regex fallback, using the module-level compiled pattern instead of an
    # inline copy of it.
    m = RE_WC.search(card.text or "")
    return m.group(1) if m else "N/A"



def get_address(card):
    """Return the listing address with whitespace collapsed, or "N/A".

    The ``<p>`` that holds a location icon is preferred. Otherwise the
    LAST line of the card text that mentions the city name is used.
    """
    try:
        node = card.find_element(By.XPATH, ".//p[i[contains(@class,'location') or contains(@class,'lml-location')]]")
        text = node.text.strip()
        if text:
            return clean_spaces(text)
    except Exception:
        # Not found or stale: use the text fallback. Exception (not a bare
        # except) keeps Ctrl+C working.
        pass

    lines = [clean_spaces(ln) for ln in card.text.split("\n") if ln.strip()]
    # The prefixed spellings of the city name all contain the bare name, so
    # one substring test covers them. Scanning from the end gives the last
    # matching line.
    for ln in reversed(lines):
        if "H·ªì Ch√≠ Minh" in ln:
            return ln
    return "N/A"

def get_posted_date(card):
    """Return the card's posted date normalized by ``parse_date_label``.

    XPaths are tried from most to least specific: a ``span`` containing the
    "ago" word, a ``span`` containing the today/yesterday labels, then any
    ``span``/``p``/``div`` containing one of them. The first non-empty text
    is parsed; "N/A" is returned when nothing is found.
    """
    date_xpaths = (
        ".//span[contains(.,'tr∆∞·ªõc')]",
        ".//span[contains(.,'H√¥m nay') or contains(.,'H√¥m qua')]",
        ".//*[self::span or self::p or self::div][contains(.,'tr∆∞·ªõc') or contains(.,'H√¥m nay') or contains(.,'H√¥m qua')]",
    )
    for xp in date_xpaths:
        try:
            raw = card.find_element(By.XPATH, xp).text.strip()
        except Exception:
            # Not found or stale: try the next XPath. Exception (not a bare
            # except) keeps Ctrl+C working.
            continue
        if raw:
            return parse_date_label(raw)
    return "N/A"

def extract_row(card):
    """Build one CSV row from a card.

    Column order: title, price, area, price per square metre, bedrooms,
    bathrooms, address, posted date.
    """
    extractors = (
        get_title,
        get_price,
        get_area,
        get_price_per_m2,
        get_bedrooms,
        get_bathrooms,
        get_address,
        get_posted_date,
    )
    return [extract(card) for extract in extractors]

# ===== MAIN ENTRY POINT =====
def main(max_load_retries: int = 5):
    """Scrape listing pages from the checkpoint onward into the CSV file.

    For each page up to ``SAFETY_CAP`` this loads the URL, collects the
    cards, appends one CSV row per card that has a title or a price, and
    then writes the checkpoint. The loop stops at the first page with no
    cards. Ctrl+C pauses the run; any other error is reported. The browser
    is always closed at the end.

    Args:
        max_load_retries: Number of consecutive ``driver.get`` failures
            tolerated before the run is aborted. Without a cap, a page that
            keeps failing to load would be retried forever.
    """
    driver = None
    try:
        driver = init_driver()
        wait = WebDriverWait(driver, WAIT_TIMEOUT)
        ensure_csv_header(OUTPUT_FILE)
        page = read_checkpoint()
        total_processed = 0
        load_failures = 0

        while page <= SAFETY_CAP:
            url = BASE.format(page=page)
            print(f"üöÄ ƒêang x·ª≠ l√Ω trang {page}: {url}")
            try:
                driver.get(url)
            except Exception as e:
                load_failures += 1
                if load_failures >= max_load_retries:
                    # Give up: the outer handler reports the error.
                    raise
                print(f"L·ªói khi t·∫£i URL: {e}. Th·ª≠ l·∫°i sau 5s...")
                time.sleep(5)
                continue
            load_failures = 0

            cards = get_cards(driver, wait)
            if not cards:
                if "R·∫•t ti·∫øc, trang b·∫°n t√¨m ki·∫øm kh√¥ng t·ªìn t·∫°i" in driver.page_source:
                    print("‚Ü≥ Trang kh√¥ng t·ªìn t·∫°i. ƒê√£ ƒë·∫øn trang cu·ªëi c√πng.")
                else:
                    print("‚Ü≥ Kh√¥ng t√¨m th·∫•y b√†i ƒëƒÉng n√†o. C√≥ th·ªÉ ƒë√£ h·∫øt d·ªØ li·ªáu ho·∫∑c b·ªã ch·∫∑n.")
                break

            print(f"   => Ph√°t hi·ªán {len(cards)} b√†i ƒëƒÉng.")
            newly_added = 0
            with open(OUTPUT_FILE, "a", encoding="utf-8-sig", newline="") as f:
                w = csv.writer(f)
                for card in cards:
                    row = extract_row(card)
                    # Keep the row only if it has a title or a price.
                    if row[0] != "N/A" or row[1] != "N/A":
                        w.writerow(row)
                        newly_added += 1
                # Push the page's rows to disk before the checkpoint moves.
                f.flush()
                try:
                    os.fsync(f.fileno())
                except OSError:
                    # fsync is best effort.
                    pass

            total_processed += newly_added
            print(f"   ‚úÖ ƒê√£ ghi {newly_added} d√≤ng m·ªõi. T·ªïng s·ªë: {total_processed}")
            # NOTE(review): the checkpoint stores the page just completed,
            # and a resumed run starts from the stored page, so that page is
            # scraped again and its rows may be appended twice - confirm
            # whether this overlap is intended.
            write_checkpoint(page)
            print(f"   üìù ƒê√£ l∆∞u checkpoint t·∫°i trang {page}.\n")
            page += 1
            # Random pause between pages.
            time.sleep(rnd.uniform(2.0, 3.5))

        print(f"üéâ Ho√†n t·∫•t! T·ªïng c·ªông ƒë√£ ghi {total_processed} d√≤ng v√†o file {OUTPUT_FILE}")
    except KeyboardInterrupt:
        print("\n‚è∏  T·∫°m d·ª´ng b·ªüi ng∆∞·ªùi d√πng. Ch·∫°y l·∫°i script ƒë·ªÉ ti·∫øp t·ª•c t·ª´ checkpoint.")
    except Exception as e:
        print(f"‚ùå ƒê√£ x·∫£y ra l·ªói nghi√™m tr·ªçng: {e}")
    finally:
        if driver:
            safe_quit(driver)

# Start the scraper only when this module is the entry point, not on import.
if __name__ == "__main__":
    main()


üöÄ ƒêang x·ª≠ l√Ω trang 400: https://meeyland.com/mua-ban-nha-dat-ho-chi-minh-b43?page=400
‚Ü≥ Kh√¥ng t√¨m th·∫•y b√†i ƒëƒÉng n√†o. C√≥ th·ªÉ ƒë√£ h·∫øt d·ªØ li·ªáu ho·∫∑c b·ªã ch·∫∑n.
üéâ Ho√†n t·∫•t! T·ªïng c·ªông ƒë√£ ghi 0 d√≤ng v√†o file meeyland_raw.csv
