In [10]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

In [11]:
# Shared Chrome configuration used by every WebDriver started in this notebook
chrome_options = Options()

# Command-line switches, applied in the order listed
for _switch in (
    "--headless",  # run without a visible browser window
    "--disable-gpu",  # turn off GPU acceleration
    "--no-sandbox",  # bypass the OS security model
    "--disable-dev-shm-usage",  # work around limited shared-memory resources
    "--disable-blink-features=AutomationControlled",  # reduce automation detection
    "--window-size=1,1",  # NOTE(review): a 1x1 window is unusually small; confirm it is intended
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",  # fixed user-agent string
):
    chrome_options.add_argument(_switch)

# Experimental options that hide the usual Selenium automation markers from websites
for _option_name, _option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ('useAutomationExtension', False),
):
    chrome_options.add_experimental_option(_option_name, _option_value)

# Crawl links

### Crawl links mua bán căn hộ chung cư from nhatot.com

In [3]:
# Listing URLs collected across all crawled result pages
arr = []
start_page = 1
end_page = 1

# Maximum number of attempts per result page
MAX_RETRIES = 3

for page in range(start_page, end_page + 1):
    for attempt in range(MAX_RETRIES):
        # Reset per attempt so cleanup never touches an unbound name or a
        # driver that an earlier attempt already closed.
        driver = None
        page_loaded = False
        try:
            # Start a fresh browser for each attempt
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            url = f"https://www.nhatot.com/mua-ban-can-ho-chung-cu-tp-ho-chi-minh?page={page}"
            driver.get(url)

            # Block until at least one listing card is present in the DOM
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'AdItem_adItem__gDDQT')))

            # Parse the rendered page and pick out the listing anchors
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            a_tags = soup.find_all("a", class_='AdItem_adItem__gDDQT')

            # Stage the links locally and only commit them once the whole page
            # has been processed, so a retry cannot append duplicates.
            # Anchors without an href are skipped instead of failing the page.
            page_links = ["https://www.nhatot.com" + a['href'] for a in a_tags if a.get('href')]
            arr.extend(page_links)
            page_loaded = True
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
        finally:
            # Always release the browser, whether the attempt succeeded or not
            if driver is not None:
                driver.quit()

        if page_loaded:
            break  # Page done; move on to the next one
        if attempt + 1 == MAX_RETRIES:
            print("Max retries reached. Skipping this page.")
        else:
            time.sleep(5)  # Back off only when another attempt will follow

# Create a DataFrame from the collected links
df = pd.DataFrame(arr, columns=["Links"])

# Export to the file name that the detail-scraping step reads back
df.to_csv('canHoLinks.csv', index=True)

print("done")

done


# Lấy chi tiết data

### Lấy chi tiết data mua bán căn hộ chung cư

In [12]:
# itemprop attribute values read from <span> elements, in output-column order
_ITEMPROP_FIELDS = (
    "price_m2",
    "rooms",
    "toilets",
    "direction",
    "property_status",
    "balconydirection",
    "property_legal_document",
    "apartment_type",
    "furnishing_sell",
    "size",
    "apartment_feature",
)


def _text_or_none(soup, tag_name, **attrs):
    """Return the text of the first tag matching ``tag_name``/``attrs``, or None if absent."""
    node = soup.find(tag_name, **attrs)
    return None if node is None else node.text


def extract_data(url):
    """Load one listing page in headless Chrome and scrape its detail fields.

    Args:
        url: Address of the listing detail page to open.

    Returns:
        A dict with the keys ``title``, ``price``, ``address`` followed by the
        names in ``_ITEMPROP_FIELDS``; each value is the element's text, or
        None when the element is not found on the page. Returns None when
        loading or parsing the page raises an exception (the error is printed).
    """
    # Initialize WebDriver with the shared Chrome options
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        driver.get(url)

        # Wait until an element with the expected class is present
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'cd9gm5n')))

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        record = {
            "title": _text_or_none(soup, "title"),
            "price": _text_or_none(soup, "b", class_='pyhk1dv'),
            "address": _text_or_none(soup, "span", class_="flex-1"),
        }
        for field in _ITEMPROP_FIELDS:
            record[field] = _text_or_none(soup, "span", itemprop=field)
        return record

    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return None

    finally:
        # Runs on every exit path, including a kernel interrupt, so the Chrome
        # process is not left behind. A failure to close is reported but does
        # not discard data that was already scraped.
        try:
            driver.quit()
        except Exception as quit_error:
            print(f"Failed to close browser for {url}: {quit_error}")

# Function to save data to CSV
def save_to_csv(data_list, filename='muaBanCanHo.csv'):
    """Write a list of row dictionaries to ``filename`` as a UTF-8 CSV file.

    The header row is taken from the keys of the first dictionary. When
    ``data_list`` is empty (or otherwise falsy) a message is printed and no
    file is created.

    Args:
        data_list: Sequence of dicts, one per output row.
        filename: Destination path of the CSV file.
    """
    if data_list:
        header = data_list[0].keys()  # column order follows the first record
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=header)
            writer.writeheader()
            for row in data_list:
                writer.writerow(row)
    else:
        print("No data to save.")

# Main function to process all URLs and save results
def main():
    """Scrape every URL listed in 'canHoLinks.csv' and save the results to CSV.

    Reads the 'Links' column of the links file, calls ``extract_data`` for each
    URL, keeps the truthy results, prints a progress line per URL, and finally
    hands the collected records to ``save_to_csv``.
    """
    links_frame = pd.read_csv('canHoLinks.csv')  # expects a 'Links' column
    records = []
    for position, url in enumerate(links_frame['Links'], start=1):
        scraped = extract_data(url)
        if scraped:
            records.append(scraped)
        print(f"Processed URL {position}: {url}")

    save_to_csv(records)


if __name__ == "__main__":
    main()

Processed URL 1: https://www.nhatot.com/mua-ban-can-ho-chung-cu-thanh-pho-thu-duc-tp-ho-chi-minh/119555664.htm#px=SR-stickyad-[PO-1][PL-top]
Processed URL 2: https://www.nhatot.com/mua-ban-can-ho-chung-cu-quan-7-tp-ho-chi-minh/119593013.htm#px=SR-stickyad-[PO-2][PL-top]
Processed URL 3: https://www.nhatot.com/mua-ban-can-ho-chung-cu-thanh-pho-thu-duc-tp-ho-chi-minh/119269431.htm#px=SR-stickyad-[PO-3][PL-top]
Processed URL 4: https://www.nhatot.com/mua-ban-can-ho-chung-cu-quan-8-tp-ho-chi-minh/118270241.htm
Processed URL 5: https://www.nhatot.com/mua-ban-can-ho-chung-cu-huyen-binh-chanh-tp-ho-chi-minh/119663108.htm#px=SR-special_display_ad-[PO-5][PL-default]
Processed URL 6: https://www.nhatot.com/mua-ban-can-ho-chung-cu-quan-11-tp-ho-chi-minh/119662996.htm


In [None]:
# Load the scraped listing details and preview the first five rows
frame1 = pd.read_csv(filepath_or_buffer='muaBanCanHo.csv')
frame1.head(n=5)