In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

In [3]:
# Setup Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode (no GUI)
chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # Bypass detection
chrome_options.add_argument("--window-size=1100,500")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36")  # User-agent string

# Disable automation controls so that websites don't detect Selenium
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

# Crawl links

### Crawl links mua bán nhà ở from nhatot.com

In [4]:
# Initialize the array to store the links
arr = []
start_page = 1
end_page = 500

# Retry logic
MAX_RETRIES = 3

for page in range(start_page, end_page + 1):
    for attempt in range(MAX_RETRIES):
        try:
            # Initialize WebDriver for each page
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            url = f"https://www.nhatot.com/mua-ban-nha-dat-tp-ho-chi-minh?page={page}"
            driver.get(url)

            # Use explicit wait for better control
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'AdItem_adItem__gDDQT')))

            # Parse the page source with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find all the links with the specified class
            a_tags = soup.find_all("a", class_='AdItem_adItem__gDDQT')

            # Append the full link to the array
            for a in a_tags:
                full_link = "https://www.nhatot.com" + a['href']
                arr.append(full_link)

            driver.quit()
            break  # Break if the request was successful
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            driver.quit()
            if attempt + 1 == MAX_RETRIES:
                print("Max retries reached. Skipping this page.")
            time.sleep(5)  # Wait before retrying

# Create a DataFrame from the array
df = pd.DataFrame(arr, columns=["Links"])

# Export the DataFrame to a CSV file
df.to_csv('nhaOLinks.csv', index=True)

print("done")

Attempt 1 failed: Message: 
Stacktrace:
	GetHandleVerifier [0x011A83E3+25571]
	(No symbol) [0x0113A684]
	(No symbol) [0x01032113]
	(No symbol) [0x01076FB2]
	(No symbol) [0x010771FB]
	(No symbol) [0x010B7822]
	(No symbol) [0x0109AC54]
	(No symbol) [0x010B5349]
	(No symbol) [0x0109A9A6]
	(No symbol) [0x0106BAB6]
	(No symbol) [0x0106C50D]
	GetHandleVerifier [0x0147C4A3+2991267]
	GetHandleVerifier [0x014CD2C9+3322569]
	GetHandleVerifier [0x012384D2+615634]
	GetHandleVerifier [0x0123FBFC+646140]
	(No symbol) [0x0114327D]
	(No symbol) [0x01140188]
	(No symbol) [0x01140325]
	(No symbol) [0x01132826]
	BaseThreadInitThunk [0x76F9FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77E080CE+286]
	RtlGetAppContainerNamedObjectPath [0x77E0809E+238]

Attempt 2 failed: Could not reach host. Are you offline?
Attempt 3 failed: Could not reach host. Are you offline?
Max retries reached. Skipping this page.
Attempt 1 failed: Could not reach host. Are you offline?
Attempt 2 failed: Could not reach host. Are yo

# Lấy chi tiết data

### Lấy chi tiết data mua bán nhà ở

In [5]:
from tenacity import retry, wait_fixed, stop_after_attempt
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException, ElementClickInterceptedException

def click_load_more_button(driver):
    try:

        layer = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "aw__s1cdo2zu"))
        )

        if layer:
            driver.execute_script("arguments[0].scrollIntoView(true);", layer[0])
            time.sleep(1)  # Add a small delay after scrolling
            layer[0].click()
            
        # Find all buttons with the class "styles_button__SVZnw"
        xem_them_buttons = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "styles_button__SVZnw"))
        )

        property_details_div = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "AdParam_adParamTitle__bU__w"))
        )
        
        # Ensure there's at least one button found
        if xem_them_buttons:
            driver.execute_script("arguments[0].scrollIntoView();", property_details_div)
            
            retry_count = 0
            while retry_count < 5:
                try:
                    # Give some time before clicking
                    time.sleep(2)
                    
                    # Click the first button
                    xem_them_buttons[0].click()
                    return True
                except (ElementClickInterceptedException, StaleElementReferenceException):
                    # Retry finding the buttons again in case of issues (without scrolling down)
                    xem_them_buttons = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located((By.CLASS_NAME, "styles_button__SVZnw"))
                    )
                    retry_count += 1
            return False
        else:
            return False
    except TimeoutException:
        return False

# Function to extract data from a single page
@retry(wait=wait_fixed(2), stop=stop_after_attempt(10))  # Wait 2 seconds between retries, retry 3 times
def extract_data(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.set_window_position(-2000, 0)

    try:
        driver.get(url)

        load_more_success = click_load_more_button(driver)
        if load_more_success:
            # Wait for the page to load more data after clicking
            time.sleep(5)  
        
        # Wait until the price element is available or timeout after 10 seconds
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'pyhk1dv')))
        
        # Tiếp tục xử lý nội dung sau khi click vào nút "Xem thêm"
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        
        try:
            title = soup.find("title").text
        except AttributeError:
            title = None

        try:
            price = soup.find("b", class_='pyhk1dv').text
        except AttributeError:
            price = None

        try:
            address = soup.find("span", class_="flex-1").text
        except (IndexError, AttributeError):
            address = None

        try:
            price_m2 = soup.find("span", itemprop="price_m2").text
        except AttributeError:
            price_m2 = None

        try:
            rooms = soup.find("span", itemprop="rooms").text
        except AttributeError:
            rooms = None

        try:
            toilets = soup.find("span", itemprop="toilets").text
        except AttributeError:
            toilets = None

        try:
            direction = soup.find("span", itemprop="direction").text
        except AttributeError:
            direction = None

        try:
            floors = soup.find("span", itemprop="floors").text
        except AttributeError:
            floors = None

        try:
            property_legal_document = soup.find("span", itemprop="property_legal_document").text
        except AttributeError:
            property_legal_document = None

        try:
            house_type = soup.find("span", itemprop="house_type").text
        except AttributeError:
            house_type = None

        try:
            furnishing_sell = soup.find("span", itemprop="furnishing_sell").text
        except AttributeError:
            furnishing_sell = None

        try:
            width = soup.find("span", itemprop="width").text
        except AttributeError:
            width = None

        try:
            length = soup.find("span", itemprop="length").text
        except AttributeError:
            length = None

        try:
            living_size = soup.find("span", itemprop="living_size").text
        except AttributeError:
            living_size = None

        try:
            size = soup.find("span", itemprop="size").text
        except AttributeError:
            size = None

        try:
            pty_characteristics = soup.find("span", itemprop="pty_characteristics").text
        except AttributeError:
            pty_characteristics = None
        
        driver.quit()

        return {
            "title": title,
            "price": price,
            "address": address,
            "price_m2": price_m2,
            "rooms": rooms,
            "toilets": toilets,
            "direction": direction,
            "floors": floors,
            "property_legal_document": property_legal_document,
            "house_type": house_type,
            "furnishing_sell": furnishing_sell,
            "width": width,
            "length": length,
            "living_size": living_size,
            "size": size,
            "pty_characteristics": pty_characteristics,
        }
        
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        driver.quit()
        return None
    finally:
        driver.quit() 

# Function to save data to CSV
def save_to_csv(data_list, filename='muaBanNhaDat1.csv'):
    if not data_list:
        print("No data to save.")
        return
    keys = data_list[0].keys()  # Use the keys from the first dictionary as headers
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data_list)

# Main function to process all URLs and save results
def main():
    # Replace this with your actual DataFrame or list of URLs
    frame = pd.read_csv('nhaOLinks - Copy.csv')  # Assuming you already have the 'Links' data in CSV
    all_data = []
    i = 0
    for url in frame['Links']:
        data = extract_data(url)
        i+=1
        if data:
            all_data.append(data)
        print(f"Processed URL {i}: {url}")

    save_to_csv(all_data)

if __name__ == "__main__":
    main()

Processed URL 1: https://www.nhatot.com/mua-ban-nha-dat-quan-tan-binh-tp-ho-chi-minh/119601468.htm
Processed URL 2: https://www.nhatot.com/mua-ban-nha-dat-quan-go-vap-tp-ho-chi-minh/118739899.htm
Processed URL 3: https://www.nhatot.com/mua-ban-nha-dat-thanh-pho-thu-duc-tp-ho-chi-minh/119588307.htm
Processed URL 4: https://www.nhatot.com/mua-ban-nha-dat-quan-12-tp-ho-chi-minh/112776242.htm
Processed URL 5: https://www.nhatot.com/mua-ban-nha-dat-quan-12-tp-ho-chi-minh/119601447.htm#px=SR-special_display_ad-[PO-19][PL-default]
Processed URL 6: https://www.nhatot.com/mua-ban-nha-dat-quan-12-tp-ho-chi-minh/119310623.htm
Processed URL 7: https://www.nhatot.com/mua-ban-nha-dat-quan-12-tp-ho-chi-minh/119310623.htm
Processed URL 8: https://www.nhatot.com/mua-ban-nha-dat-quan-tan-phu-tp-ho-chi-minh/119601411.htm
Processed URL 9: https://www.nhatot.com/mua-ban-nha-dat-quan-12-tp-ho-chi-minh/119448559.htm#px=SR-special_display_ad-[PO-3][PL-default]
Processed URL 10: https://www.nhatot.com/mua-ban-

In [None]:
frame1 = pd.read_csv('muaBanNhaDat.csv')
frame1.head()