In [1]:
import csv
import json
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import Driver

In [2]:
def initialize_driver(pagenum):
    """Start a browser and open one page of the Tangerang residential listing index.

    Args:
        pagenum: Value substituted into the ``page`` query parameter of the
            listing URL.

    Returns:
        The ``Driver`` instance, already navigated to the requested page. The
        caller is responsible for calling ``quit()`` on it.

    Raises:
        Whatever ``driver.get`` raises. In that case the browser is closed
        before the exception propagates.
    """
    driver = Driver(uc=True)
    try:
        driver.get(f"https://www.rumah123.com/jual/tangerang/residensial/?page={pagenum}")
    except BaseException:
        # On failure the caller never receives the handle, so this is the only
        # place the browser can be shut down. BaseException also covers a
        # kernel interrupt; the exception is re-raised unchanged.
        driver.quit()
        raise
    return driver

In [3]:
def scrape_links_from_page(driver, wait_seconds=5):
    """Collect the property-detail URLs shown on the page currently loaded in ``driver``.

    Args:
        driver: Browser handle whose ``page_source`` is parsed.
        wait_seconds: Fixed pause before reading the page source, giving the
            page time to render. Defaults to the original 5 seconds.

    Returns:
        A list of absolute URLs, one per listing card that contains an anchor
        with an ``href``. Cards without such an anchor are skipped.
    """
    time.sleep(wait_seconds)
    page_soup = BeautifulSoup(driver.page_source, "html.parser")

    nav_links = []
    for card in page_soup.find_all('div', class_='card-featured__middle-section'):
        anchor = card.find('a', href=True)
        if anchor is None:
            # No link to follow. Emitting a placeholder here would later be
            # handed to driver.get() as if it were a real URL.
            continue
        # urljoin handles site-relative, path-relative and already-absolute hrefs.
        nav_links.append(urljoin('https://www.rumah123.com', anchor['href']))

    return nav_links

In [4]:
def _text_or_unknown(node):
    """Return the stripped text of a parsed element, or "unknown" if the element is missing."""
    return node.text.strip() if node is not None else "unknown"


def scrape_property_details(driver, link, wait_seconds=5):
    """Open one property page and collect its details into a flat dict.

    The function is best-effort: each section (header, specifications,
    coordinates) is scraped independently, and a failure in one section is
    printed and does not prevent the others from being collected.

    Args:
        driver: Browser handle used to load and query the page.
        link: URL of the property page.
        wait_seconds: Fixed pause after navigation before scraping starts.
            Defaults to the original 5 seconds.

    Returns:
        A dict that always contains ``'url'``. Depending on what could be
        scraped it also contains ``'title'``, ``'price_currency'``,
        ``'price_value'``, ``'price_unit'``, ``'address'``, one key per
        specification label found on the page, and ``'latitude'`` /
        ``'longitude'``. Header fields whose element is absent are set to
        ``"unknown"``.
    """
    details = {'url': link}

    try:
        driver.get(link)
    except Exception as e:
        # Keep navigation failures best-effort like the sections below, so one
        # unreachable page does not abort the whole crawl.
        print(f"An error occurred while loading {link}: {e}")
        return details
    time.sleep(wait_seconds)

    try:
        # The target carries two classes, so it is addressed with an explicit
        # compound CSS selector rather than a single class name.
        header_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.ui-container.ui-property-page__main-container')
            )
        )
        soup_header = BeautifulSoup(header_element.get_attribute('innerHTML'), 'html.parser')

        # Each field is read on its own, so one missing element does not
        # discard the fields that are present.
        price_text = _text_or_unknown(soup_header.select_one('.r123-listing-summary__price'))
        try:
            # The price is expected to be exactly three whitespace-separated tokens.
            currency, price, price_unit = price_text.split()
        except ValueError:
            currency, price, price_unit = "unknown", "unknown", "unknown"

        # Compile header information
        details.update({
            'title': _text_or_unknown(
                soup_header.select_one('.r123-listing-summary__header-container-title')
            ),
            'price_currency': currency,
            'price_value': price,
            'price_unit': price_unit,
            'address': _text_or_unknown(
                soup_header.select_one('.r123-listing-summary__header-container-address')
            ),
        })

    except AttributeError as e:
        print(f"An error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    try:
        # Scrape the specifications
        specifications_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'listing-specification-v2__content'))
        )
        soup_specifications = BeautifulSoup(
            specifications_element.get_attribute('innerHTML'), 'html.parser'
        )

        for item in soup_specifications.select('.listing-specification-v2__item'):
            label_node = item.find(class_='listing-specification-v2__item-label')
            if label_node is None:
                # Without a label there is no key to store the value under;
                # skip this item but keep processing the remaining ones.
                continue
            value_node = item.find(class_='listing-specification-v2__item-value')
            details[label_node.text.strip()] = _text_or_unknown(value_node)

    except AttributeError as e:
        print(f"An error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    try:
        # Extract lat and lon from the JSON embedded in the script tag
        script_tag = driver.find_element(By.ID, '99-root-props')
        script_json = json.loads(script_tag.get_attribute('innerHTML'))

        # Walk property -> data -> gallery -> map -> location, treating absent keys as empty.
        location = script_json
        for key in ('property', 'data', 'gallery', 'map', 'location'):
            location = location.get(key, {})

        details.update({
            'latitude': location.get('lat', 'unknown'),
            'longitude': location.get('lng', 'unknown')
        })
    except Exception as e:
        print(f"An error occurred while extracting lat/lon: {e}")

    return details

In [6]:
# Listing-index pages to crawl, as range(PAGE_START, PAGE_STOP): PAGE_STOP is exclusive.
PAGE_START = 21
PAGE_STOP = 22

# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
OUTPUT_CSV = '/Users/salmadanu/Desktop/Bangkit/HouseSpot/HouseSpot-ML/Rumah123_Scrapping/CSV_Files/tangerang-2.csv'

# Initialize the WebDriver and scrape links from multiple pages
all_nav_links = []
for page_num in range(PAGE_START, PAGE_STOP):
    driver = initialize_driver(page_num)
    try:
        page_links = scrape_links_from_page(driver)
        all_nav_links.extend(page_links)
    finally:
        # Always close the browser, even if scraping the page raised.
        driver.quit()

# Scrape property details from each link, using a fresh browser per property
all_data = []
for link in all_nav_links:
    driver = Driver(uc=True)
    try:
        details = scrape_property_details(driver, link)
        all_data.append(details)
    finally:
        driver.quit()

# The CSV header is the sorted union of every key seen in any scraped record;
# records that lack a key get an empty cell for it.
header = sorted({key for record in all_data for key in record})

# Write data to CSV file
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=header)
    writer.writeheader()
    writer.writerows(all_data)

# Report only the file name, derived from the path so the two cannot drift apart.
print(f"Data has been written to {OUTPUT_CSV.rsplit('/', 1)[-1]}")



*** chromedriver to download = 125.0.6422.141 (Latest Stable) 

Downloading chromedriver-mac-x64.zip from:
https://storage.googleapis.com/chrome-for-testing-public/125.0.6422.141/mac-x64/chromedriver-mac-x64.zip ...
Download Complete!

Extracting ['chromedriver'] from chromedriver-mac-x64.zip ...
Unzip Complete!

The file [uc_driver] was saved to:
/Users/salmadanu/Desktop/Bangkit/HouseSpot/HouseSpot-ML/.venv/lib/python3.9/site-packages/seleniumbase/drivers/uc_driver

Making [uc_driver 125.0.6422.141] executable ...
[uc_driver 125.0.6422.141] is now ready for use!

Data has been written to tangerang-2.csv
