In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import concurrent.futures
import time

In [2]:
# Max 1000 property links per suburb

def get_property_links(suburb, postcode, max_pages=50, timeout=30):
    """Collect rental listing links for one suburb from Domain search results.

    Requests the paginated search results (sorted by descending price) one
    page at a time and gathers the ``href`` of every ``<a class="address">``
    element. Paging stops at the first page that cannot be fetched, that
    returns a non-200 status, or that contains no listings; the links gathered
    up to that point are still returned.

    Args:
        suburb: Suburb name as it should appear in the search URL.
        postcode: Postcode as it should appear in the search URL.
        max_pages: Highest page number to request (pages start at 1).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list: Listing hrefs in the order they were encountered.
    """
    base_url = f'https://www.domain.com.au/rent/{suburb}-vic-{postcode}/?sort=price-desc&page='
    headers = {'User-Agent': 'PostmanRuntime/7.6.0'}
    property_links = []

    for page in range(1, max_pages + 1):
        try:
            # Without a timeout a stalled connection would block this call forever
            response = requests.get(base_url + str(page), headers=headers, timeout=timeout)
        except requests.RequestException as error:
            # A network failure must not discard the links already gathered
            print(f"REQUEST_ERROR: Failed to retrieve page {page}, {suburb}, {postcode}. Error: {error}. Exiting loop.")
            break

        if response.status_code == 403:
            print(f"RATE_LIMITING: Failed to retrieve page {page}, {suburb}, {postcode}. Status code: {response.status_code}. Exiting loop.")
            break
        elif response.status_code == 404:
            print(f"SUBURB_NOT_FOUND: Failed to retrieve page {page}, {suburb}, {postcode} likely due to incorrect suburb name. Status code: {response.status_code}. Exiting loop.")
            break
        elif response.status_code != 200:
            print(f"Failed to retrieve page {page}, {suburb}, {postcode}. Status code: {response.status_code}. Exiting loop.")
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        listings = soup.find_all('a', class_='address')

        # An empty page means we have run past the last page of results
        if not listings:
            print(f"NO LISTINGS: No listings found on page {page}, {suburb}, {postcode}. Exiting loop.")
            break

        for listing in listings:
            link = listing.get('href')
            if link:
                property_links.append(link)

        # Pause between pages to keep the request rate down
        time.sleep(1)

    return property_links

In [3]:
def collect_all_property_links(postcode_mapping):
    """Sequentially collect listing links for every suburb in the mapping.

    Args:
        postcode_mapping: Mapping whose keys are postcodes and whose values are
            iterables of suburb names.

    Returns:
        list: One flat list of links across all suburbs, in mapping order.
    """
    all_property_links = []

    for postcode, suburbs in postcode_mapping.items():
        for suburb in suburbs:
            # extend, not append: the result must be a flat list of links so it
            # matches collect_all_property_links_threaded and can be loaded into
            # a single-column DataFrame.
            # Suburb and postcode could be stored alongside each link if needed.
            all_property_links.extend(get_property_links(suburb, postcode))

    return all_property_links

In [4]:
def extract_property_details(soup):
    """Extract listing attributes from the ``__NEXT_DATA__`` JSON embedded in a page.

    Args:
        soup: Parsed page exposing ``find(name, id=...)`` (e.g. a BeautifulSoup
            object).

    Returns:
        dict: One flat record of listing attributes. Attributes missing from
        the payload are ``None`` (or an empty list for ``structured_features``
        and ``nearby_schools``).

    Raises:
        ValueError: If the page has no ``__NEXT_DATA__`` script or it is empty
            or not valid JSON.
        KeyError: If the JSON lacks the expected ``props`` / ``pageProps`` /
            ``layoutProps`` / ``componentProps`` structure.
    """
    # Locate the JSON data embedded in the HTML
    script_tag = soup.find('script', id='__NEXT_DATA__')
    if script_tag is None or not script_tag.string:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError("Page does not contain a '__NEXT_DATA__' JSON payload")
    json_data = json.loads(script_tag.string)

    # Extract necessary details
    page_props = json_data['props']['pageProps']
    layout_props = page_props['layoutProps']
    property_details = layout_props["digitalData"]["page"]["pageInfo"]["property"]
    component_props = page_props['componentProps']

    # `or {}` also covers a key that is present but null, where a .get default
    # alone would hand back None and crash on the chained .get
    school_catchment = component_props.get('schoolCatchment') or {}

    data = {
        'title': layout_props.get('title'),
        'description': layout_props.get('description'),
        # Key spelling kept as-is: these keys become DataFrame column names
        'street_adress': property_details.get('address'),
        'suburb': property_details.get('suburb'),
        'postcode': property_details.get('postcode'),
        'price': property_details.get('price'),
        'bedrooms': property_details.get('bedrooms'),
        'bathrooms': property_details.get('bathrooms'),
        'parking': property_details.get('parking'),
        'primary_property_type': property_details.get('primaryPropertyType'),
        'property_features': property_details.get('propertyFeatures'),
        'structured_features': property_details.get('structuredFeatures', []),
        'video_count': property_details.get('videoCount'),
        'photo_count': property_details.get('photoCount'),
        'date_listed': property_details.get('dateListed'),
        'days_listed': property_details.get('daysListed'),
        'floor_plans_count': property_details.get('floorPlansCount'),
        'virtual_tour': property_details.get('virtualTour'),
        'nbn_details': layout_props.get('nbnDetails'),
        'nearby_schools': school_catchment.get('schools', [])
    }

    return data

In [5]:
def scrape_properties(property_links, timeout=30):
    """Fetch each listing page and return the extracted details as a DataFrame.

    A listing that cannot be fetched (network error or non-200 status) or
    parsed is reported and skipped, so one bad URL does not abort the batch or
    discard rows already scraped. A rate-limiting response (403 or 429) stops
    the loop, and the rows gathered so far are returned.

    Args:
        property_links: Iterable of listing URLs.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        pandas.DataFrame: One row per successfully scraped listing (empty when
        nothing was scraped).
    """
    headers = {'User-Agent': 'PostmanRuntime/7.6.0'}
    all_properties = []

    for url in property_links:
        try:
            # Without a timeout a stalled connection would block this call forever
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as error:
            print(f"REQUEST_ERROR: Failed to retrieve page {url}. Error: {error}. Skipping.")
            time.sleep(0.5)
            continue

        # Further requests would also be rejected once we are rate limited
        if response.status_code in (403, 429):
            print(f"RATE_LIMITING: Failed to retrieve page {url}. Status code: {response.status_code}. Exiting loop.")
            break

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            try:
                all_properties.append(extract_property_details(soup))
            except (AttributeError, KeyError, TypeError, ValueError) as error:
                # Page did not have the expected embedded JSON structure
                print(f"PARSE_ERROR: Could not extract details from {url}. Error: {error!r}. Skipping.")
        else:
            # e.g. a listing that no longer exists; the remaining URLs are independent
            print(f"Failed to retrieve page {url}. Status code: {response.status_code}. Skipping.")

        # Pause between requests to keep the request rate down
        time.sleep(0.5)

    # Create a DataFrame from the list of dictionaries
    return pd.DataFrame(all_properties)

In [6]:
# Load the postcode mapping from the JSON file (iterated later as
# postcode -> suburb names). The encoding is stated explicitly so decoding
# does not depend on the platform's default.
with open('../../data/landing/postcode_mapping.json', 'r', encoding='utf-8') as json_file:
    postcode_mapping = json.load(json_file)

In [7]:
def collect_all_property_links_threaded(postcode_mapping, max_workers=5):
    """Collect listing links for every suburb in the mapping using a thread pool.

    One ``get_property_links`` task is submitted per (suburb, postcode) pair,
    with a short pause between submissions. A task that raises is reported and
    skipped so a single failing suburb does not discard the links gathered for
    all the others.

    Args:
        postcode_mapping: Mapping whose keys are postcodes and whose values are
            iterables of suburb names.
        max_workers: Maximum number of worker threads.

    Returns:
        list: One flat list of links across all suburbs, in task-completion
        order (not mapping order).
    """
    all_property_links = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which suburb each future belongs to for error reporting
        future_to_target = {}
        for postcode, suburbs in postcode_mapping.items():
            for suburb in suburbs:
                future = executor.submit(get_property_links, suburb, postcode)
                future_to_target[future] = (suburb, postcode)
                # Stagger submissions so the workers do not all start at once
                time.sleep(0.5)

        for future in concurrent.futures.as_completed(future_to_target):
            suburb, postcode = future_to_target[future]
            try:
                all_property_links.extend(future.result())
            except Exception as error:
                # result() re-raises whatever the worker raised; report it and
                # keep the results from the other suburbs
                print(f"WORKER_ERROR: Failed to collect links for {suburb}, {postcode}. Error: {error!r}. Skipping.")

    return all_property_links

In [8]:
def scrape_properties_threaded(property_links, max_workers=5):
    """Scrape listing details for many URLs using a thread pool.

    One ``scrape_properties`` task is submitted per URL, with a short pause
    between submissions. A task that raises is reported and skipped so a
    single failure does not discard the rows already scraped.

    Args:
        property_links: Iterable of listing URLs (a list or pandas Series).
        max_workers: Maximum number of worker threads.

    Returns:
        pandas.DataFrame: All scraped rows in task-completion order with a
        fresh 0..n-1 index; an empty DataFrame when nothing was scraped.
    """
    frames = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which URL each future belongs to for error reporting
        future_to_url = {}
        for url in property_links:
            future_to_url[executor.submit(scrape_properties, [url])] = url
            # Stagger submissions so the workers do not all start at once
            time.sleep(0.5)

        for future in concurrent.futures.as_completed(future_to_url):
            try:
                frames.append(future.result())
            except Exception as error:
                # result() re-raises whatever the worker raised; report it and
                # keep the rows from the other URLs
                print(f"WORKER_ERROR: Failed to scrape {future_to_url[future]}. Error: {error!r}. Skipping.")

    # Drop empty per-URL frames; pd.concat raises on an empty list
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame()

    # Each per-URL frame is indexed from 0, so rebuild a unique index
    return pd.concat(non_empty, ignore_index=True)

In [None]:
# Warning: this cell is slow -- expect it to run for hours.
# 404 responses are expected (duplicate suburb names) and can be ignored.
# 403 responses mean rate limiting; slow the requests down if they appear.

property_links = collect_all_property_links_threaded(postcode_mapping, max_workers=50)

In [10]:
# Convert to a single-column DataFrame, drop repeated links and save as Parquet
links_df = pd.DataFrame(property_links, columns=['property_link']).drop_duplicates()
links_df.to_parquet('../../data/raw/property_links.parquet', index=False)

In [11]:
# Reload the saved links so the detail scrape can start from this cell
links_df = pd.read_parquet('../../data/raw/property_links.parquet').drop_duplicates()

In [12]:
# Scrape the details of every saved listing link with a pool of 100 threads
property_df = scrape_properties_threaded(links_df['property_link'], max_workers=100)

KeyboardInterrupt: 

In [None]:
# Save the scraped listing details (plain string: the path has no placeholders)
property_df.to_parquet('../../data/raw/property_details.parquet', index=False)