In [7]:
import re
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

class NhatotMasterCrawler:
    def __init__(self, headless=False):
        self.base_url = "https://www.nhatot.com/tham-khao-gia-mua-ban-nha-dat-{}-tp-ho-chi-minh"
        self.headless = headless
        self.print_lock = Lock()

    def _init_driver(self):
        """Create and return a new Chrome WebDriver configured for crawling.

        The driver runs headless when ``self.headless`` is true, uses a
        1366x768 window, and has Chrome's automation switches disabled.

        Returns:
            A started ``webdriver.Chrome`` instance. The caller is responsible
            for calling ``quit()`` on it.

        Raises:
            Exception: Whatever the driver download, browser start-up or the
                CDP call raises; a driver that was already started is quit
                before the exception propagates.
        """
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument('--headless=new')

        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_argument('--window-size=1366,768')
        # Extra options that help avoid being blocked and improve rendering.
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            # execute_script() would only patch the start page, and the
            # override would be lost on the first navigation. Registering the
            # script through CDP injects it into every new document before the
            # page's own scripts run.
            driver.execute_cdp_cmd(
                "Page.addScriptToEvaluateOnNewDocument",
                {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
            )
        except Exception:
            # Do not leak a running browser if the set-up step fails.
            driver.quit()
            raise
        return driver

    def extract_numbers(self, text: str):
        # Lo·∫°i b·ªè d·∫•u ch·∫•m ngƒÉn c√°ch h√†ng ngh√¨n, ƒë·ªïi d·∫•u ph·∫©y th√†nh d·∫•u ch·∫•m th·∫≠p ph√¢n
        cleaned = text.replace('.', '').replace(',', '.')
        matches = re.findall(r"[-+]?\d*\.\d+|\d+", cleaned)
        return [float(m) for m in matches]

    def get_data_from_section(self, section_soup):
        price_values = []
        elements = section_soup.find_all(['div', 'span', 'text'])

        for el in elements:
            txt = el.get_text(strip=True)
            # Regex ki·ªÉm tra chu·ªói ch·ªâ ch·ª©a s·ªë v√† k√Ω t·ª± ph√¢n t√°ch
            if re.match(r'^[\d.,]+$', txt):
                val = self.extract_numbers(txt)
                if val: price_values.append(val[0])

        # L·ªçc s·ªë trong kho·∫£ng gi√° nh√† ƒë·∫•t h·ª£p l√Ω
        valid_prices = sorted(list(set([p for p in price_values if 10 < p < 3000])))

        avg, low, high = None, None, None
        if len(valid_prices) >= 3:
            low = valid_prices[0] # Gi√° th·∫•p nh·∫•t b√™n tr√°i
            high = valid_prices[-1] # Gi√° cao nh·∫•t b√™n ph·∫£i
            # L·∫•y s·ªë n·∫±m gi·ªØa (gi√° trung b√¨nh l∆° l·ª≠ng tr√™n bar)
            remaining = [p for p in valid_prices if p != low and p != high]
            avg = remaining[0] if remaining else (low + high) / 2
        elif len(valid_prices) == 1:
            avg = valid_prices[0]

        return avg, low, high

    def crawl_single_district(self, district_tuple):
        district_name, slug = district_tuple
        driver = self._init_driver()
        results = []

        try:
            url = self.base_url.format(slug)
            driver.get(url)

            # ƒê·ª£i trang t·∫£i s∆° b·ªô
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            # Cu·ªôn trang t·ª´ t·ª´ ƒë·ªÉ bi·ªÉu ƒë·ªì load (quan tr·ªçng)
            for i in range(3):
                driver.execute_script(f"window.scrollTo(0, {300 * (i+1)});")
                time.sleep(1.5)

            # ƒê·ª£i c·ª• th·ªÉ m·ªôt con s·ªë gi√° xu·∫•t hi·ªán tr√™n chart
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            sections = [
                {"title": "Gi√° Nh√† ·ªü", "type": "nha_o", "hem": 0, "mt": 0},
                {"title": "Gi√° Nh√† m·∫∑t ph·ªë", "type": "nha_mat_pho", "hem": 0, "mt": 1},
                {"title": "Gi√° Nh√† ng√µ, h·∫ªm", "type": "nha_ng√µ_h·∫ªm", "hem": 1, "mt": 0}
            ]

            for sec in sections:
                # T√¨m header ch·ª©a ti√™u ƒë·ªÅ lo·∫°i nh√†
                header = soup.find(['h2', 'h3', 'h4'], string=re.compile(sec["title"], re.I))
                if header:
                    container = header.find_parent('div')
                    avg, low, high = self.get_data_from_section(container)

                    if avg or low: # Ch·ªâ add n·∫øu l·∫•y ƒë∆∞·ª£c d·ªØ li·ªáu
                        results.append({
                            "district": district_name,
                            "type_house": sec["type"],
                            "is_hem": sec["hem"],
                            "is_mattien": sec["mt"],
                            "price_per_m2_avg": avg,
                            "price_per_m2_low_avg": low,
                            "price_per_m2_high_avg": high
                        })

            with self.print_lock:
                status = "‚úÖ" if results else "‚ö†Ô∏è Kh√¥ng l·∫•y ƒë∆∞·ª£c data"
                print(f"{status}: {district_name}")

        except Exception as e:
            with self.print_lock:
                print(f"‚ùå L·ªói t·∫°i {district_name}: {str(e)}")
        finally:
            driver.quit()

        return results

    def run(self, districts_dict, workers=2): # Gi·∫£m worker xu·ªëng 2 ƒë·ªÉ tr√°nh lag m√°y khi m·ªü Chrome
        print(f"üöÄ B·∫Øt ƒë·∫ßu qu√©t ƒëa lu·ªìng ({workers} lu·ªìng - HI·ªÜN TR√åNH DUY·ªÜT)...")
        all_data = []

        with ThreadPoolExecutor(max_workers=workers) as executor:
            future_results = list(executor.map(self.crawl_single_district, districts_dict.items()))

            for res in future_results:
                if res:
                    all_data.extend(res)

        return pd.DataFrame(all_data)

if __name__ == "__main__":
    # Display name -> URL slug of each Ho Chi Minh City district to crawl.
    # The names end up in the "district" column of the CSV, so they are stored
    # as proper Vietnamese text rather than mis-decoded bytes.
    districts = {
        "Quận 1": "quan-1",
        "Quận 3": "quan-3",
        "Thành phố Thủ Đức": "thanh-pho-thu-duc",
        "Quận 4": "quan-4",
        "Quận 5": "quan-5",
        "Quận 6": "quan-6",
        "Quận 7": "quan-7",
        "Quận 8": "quan-8",
        "Quận 10": "quan-10",
        "Quận 11": "quan-11",
        "Quận 12": "quan-12",
        "Quận Bình Tân": "quan-binh-tan",
        "Quận Bình Thạnh": "quan-binh-thanh",
        "Quận Gò Vấp": "quan-go-vap",
        "Quận Phú Nhuận": "quan-phu-nhuan",
        "Quận Tân Bình": "quan-tan-binh",
        "Quận Tân Phú": "quan-tan-phu",
        "Huyện Bình Chánh": "huyen-binh-chanh",
        "Huyện Cần Giờ": "huyen-can-gio",
        "Huyện Củ Chi": "huyen-cu-chi",
        "Huyện Hóc Môn": "huyen-hoc-mon"
    }

    # headless=False so the Chrome windows are visible while the crawl runs.
    # workers=2 so the machine is not overloaded by many Chrome instances.
    crawler = NhatotMasterCrawler(headless=False)
    df_final = crawler.run(districts, workers=2)

    if not df_final.empty:
        print("\n" + "="*50)
        print("DỮ LIỆU ĐÃ THU THẬP")
        print("="*50)
        print(df_final.to_string(index=False))
        # utf-8-sig writes a BOM at the start of the file.
        df_final.to_csv("Bench.csv", index=False, encoding="utf-8-sig")
        print(f"\n💾 Lưu thành công {len(df_final)} dòng.")
    else:
        print("\n❌ Vẫn không lấy được dữ liệu. Hãy kiểm tra kết nối mạng hoặc thử tăng time.sleep.")

🚀 Bắt đầu quét đa luồng (2 luồng - HIỆN TRÌNH DUYỆT)...
✅: Quận 3
✅: Quận 1
✅: Thành phố Thủ Đức
✅: Quận 4
✅: Quận 5
✅: Quận 6
✅: Quận 7
✅: Quận 8
✅: Quận 10
✅: Quận 11
✅: Quận 12
✅: Quận Bình Tân
✅: Quận Bình Thạnh
✅: Quận Gò Vấp
✅: Quận Tân Bình
✅: Quận Phú Nhuận
✅: Quận Tân Phú
✅: Huyện Bình Chánh
✅: Huyện Cần Giờ
✅: Huyện Củ Chi
✅: Huyện Hóc Môn

DỮ LIỆU ĐÃ THU THẬP
         district  type_house  is_hem  is_mattien  price_per_m2_avg  price_per_m2_low_avg  price_per_m2_high_avg
           Quận 1       nha_o       0           0            366.67                261.38                 727.27
           Quận 1 nha_mat_pho       0           1            450.82                347.22                 882.35
           Quận 1 nha_ngõ_hẻm       1           0            230.00                185.71                 320.00
           Qu·∫≠n 3     