In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

In [2]:
# Shared Chrome configuration, reused by every WebDriver started in this notebook.
chrome_options = Options()
for _chrome_arg in (
    "--headless",               # run without a visible browser window
    "--disable-gpu",            # disable GPU acceleration
    "--no-sandbox",             # bypass the OS security model
    "--disable-dev-shm-usage",  # overcome limited resource problems
    # Present a desktop Chrome user-agent string to the site
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
):
    chrome_options.add_argument(_chrome_arg)

# Crawl links

### Crawl links mua bán nhà ở from nhatot.com

In [None]:
# Crawl the paginated listing index and collect the absolute URL of every ad.
# Results accumulate in `arr` (crawl order) and are written to
# 'muaBanNhaOLinks.csv' once all pages have been visited.
arr = []
start_page = 1
end_page = 1000

# Maximum number of attempts per index page before the page is skipped
MAX_RETRIES = 3

for page in range(start_page, end_page + 1):
    url = f"https://www.nhatot.com/mua-ban-nha-dat?page={page}"
    for attempt in range(MAX_RETRIES):
        # Reset on every attempt: if Chrome fails to start, the cleanup below
        # must not hit an undefined name or re-quit a driver from an earlier attempt.
        driver = None
        try:
            # A fresh WebDriver is started for each attempt
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            driver.get(url)

            # Block (up to 15 s) until at least one ad element is present in the DOM
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'AdItem_adItem__gDDQT')))

            # Parse the rendered page and pick out the ad anchors
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            a_tags = soup.find_all("a", class_='AdItem_adItem__gDDQT')

            # Build the page's links in one step before touching `arr`, so an
            # error half-way through followed by a retry cannot leave duplicates.
            page_links = ["https://www.nhatot.com" + a['href'] for a in a_tags]
            arr.extend(page_links)
            break  # Page handled successfully; stop retrying
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt + 1 == MAX_RETRIES:
                print("Max retries reached. Skipping this page.")
            else:
                time.sleep(5)  # Back off only when another attempt will follow
        finally:
            # Runs on success (before the break takes effect) and on failure,
            # so the browser process is always released exactly once.
            if driver is not None:
                driver.quit()

# Create a DataFrame from the collected links
df = pd.DataFrame(arr, columns=["Links"])

# Export the DataFrame to a CSV file (row index included as the first column)
df.to_csv('muaBanNhaOLinks.csv', index=True)

print("done")

# Lấy chi tiết data

### Lấy chi tiết data mua bán nhà ở

In [None]:
from tenacity import retry, wait_fixed, stop_after_attempt

# itemprop values of the <span> elements read from a listing page, in the order
# they appear in the returned record (after "price" and "address").
_ITEMPROP_FIELDS = (
    "price_m2",
    "rooms",
    "toilets",
    "direction",
    "floors",
    "property_legal_document",
    "house_type",
    "furnishing_sell",
    "width",
    "length",
    "living_size",
    "size",
    "pty_characteristics",
)


def _text_or_none(tag):
    """Return ``tag.text``, or None when the lookup found no element."""
    return tag.text if tag is not None else None


# Wait 2 seconds between attempts, try at most 3 times, then re-raise the last error
@retry(wait=wait_fixed(2), stop=stop_after_attempt(3), reraise=True)
def _scrape_listing(url):
    """Load one listing page and return its fields as a dict.

    Any failure (browser start-up, navigation, wait timeout) propagates out of
    this function so that the ``@retry`` decorator can actually retry it.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    try:
        driver.get(url)

        # Wait until the price element is present, or time out after 10 seconds
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'pyhk1dv')))

        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # The page source has been captured (or the attempt failed); either way
        # the browser is no longer needed.
        driver.quit()

    record = {"price": _text_or_none(soup.find("b", class_='pyhk1dv'))}

    # The address is taken from the fifth span carrying this class; None if there are fewer
    address_spans = soup.find_all("span", class_="bwq0cbs")
    record["address"] = address_spans[4].text if len(address_spans) > 4 else None

    for itemprop_name in _ITEMPROP_FIELDS:
        record[itemprop_name] = _text_or_none(soup.find("span", itemprop=itemprop_name))

    return record


# Function to extract data from a single page
def extract_data(url):
    """Scrape one listing page.

    Args:
        url: Address of the listing detail page.

    Returns:
        A dict with the keys "price", "address" and every name in
        ``_ITEMPROP_FIELDS``; a field is None when its element is not found.
        Returns None if the page could not be scraped after all retry attempts.
    """
    try:
        return _scrape_listing(url)
    except Exception as e:
        # Reached only after the retried helper has exhausted its attempts
        print(f"Error processing URL {url}: {e}")
        return None

# Write scraped records to a CSV file
def save_to_csv(data_list, filename='muaBanNhaDat3.csv'):
    """Write a list of dict records to ``filename`` as UTF-8 CSV.

    The header row is taken from the keys of the first record. When
    ``data_list`` is empty, a message is printed and no file is written.
    """
    if data_list:
        header = data_list[0].keys()
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=header)
            writer.writeheader()
            for record in data_list:
                writer.writerow(record)
        return
    print("No data to save.")

# Entry point: scrape every listed URL, then persist the collected records
def main():
    """Read listing URLs from 'nhaDatLinks1.csv', scrape each, and save the results.

    URLs for which ``extract_data`` returns a falsy value are left out of the
    saved output; progress is printed after every URL.
    """
    # Replace this with your actual DataFrame or list of URLs
    links_frame = pd.read_csv('nhaDatLinks1.csv')  # expects a 'Links' column
    records = []
    for position, link in enumerate(links_frame['Links'], start=1):
        record = extract_data(link)
        if record:
            records.append(record)
        print(f"Processed URL {position}: {link}")

    save_to_csv(records)

if __name__ == "__main__":
    main()

In [None]:
# Load a CSV into a DataFrame and preview its first rows.
# NOTE(review): save_to_csv defaults to 'muaBanNhaDat3.csv', whereas this cell
# reads 'muaBanNhaDat.csv' — presumably an earlier export; confirm the intended file.
frame1 = pd.read_csv('muaBanNhaDat.csv')
frame1.head()