### This file is dedicated to scrape and download data from domain.com.au

Yuecheng Wang Aug 30, modify from ./scripts/scrape.py

In [3]:
import os
import re
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd
import requests

In [4]:
# Constants
BASE_URL = "https://www.domain.com.au"

In [5]:
# Save scraped data directly to CSV files

property_metadata = []
# Grab property data via post code
postcodes = [list(range(3000,4000)), list(range(8000, 9000))]


for postcode in postcodes:
    # maximum 2 page for each post code
    N_PAGES = range(1, 3)
    url_links = []
    # Generate list of URLs to visit
    for page in N_PAGES:
        
        url = BASE_URL + f"/rent/?postcode={postcode}&sort=dateupdated-desc&page={page}"
        print(f"Visiting {url}")
        try:
            bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "html.parser")
        except Exception as e:
            print(f"Failed to load page {url}: {e}")
            continue
    
        # Check if the property listing section exists
        result_section = bs_object.find("ul", {"data-testid": "results"})
        if not result_section:
            print(f"No results found on page {url}")
            continue
        
        # Find property listing links
        index_links = result_section.findAll("a", href=re.compile(f"{BASE_URL}/*"))
    
        for link in index_links:
            # Check if link has 'class' attribute and contains 'address'
            if link.get('class') and 'address' in link['class']:
                url_links.append(link['href'])
    
    # For each URL, scrape the property metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0
    for property_url in pbar:
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "html.parser")
        except Exception as e:
            print(f"Failed to load property page {property_url}: {e}")
            continue
        
        total_count += 1
        
        try: 
            # Extract property details
            address = bs_object.find("h1", {"class": "css-164r41r"})
            address = address.text if address else "N/A"

            cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"})
            cost_text = cost_text.text if cost_text else "N/A"
            
            # Extract rooms and parking details
            rooms_section = bs_object.find("div", {"data-testid": "property-features"})
            if rooms_section:
                rooms = rooms_section.findAll("span", {"data-testid": "property-features-text-container"})
                room_details = ', '.join([re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Bed' in feature.text or 'Bath' in feature.text])
                parking_details = ', '.join([re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Parking' in feature.text])
            else:
                room_details = "N/A"
                parking_details = "N/A"

            # Attempt to find the link that contains the coordinates
            try:
                href = bs_object.find("a", {"target": "_blank", 'rel': "noopener noreferrer"})
                href = href.attrs['href'] if href else ""
                # Extract coordinates from the URL using regex
                coordinates = [float(coord) for coord in re.findall(r'destination=([-\s,\d\.]+)', href)[0].split(',')] if href else [None, None]
            except (AttributeError, IndexError, ValueError):
                coordinates = [None, None]

            # Append the flattened property data to the list
            property_metadata.append({
                'Address': address,
                'Cost': cost_text,
                'Rooms': room_details,
                'Parking': parking_details,
                'Coordinates': coordinates,
                'Postcode': postcode
            })
            success_count += 1
            
        except AttributeError as e:
            print(f"Issue with {property_url}: {e}")
    
        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

# Save to CSV
output_relative_dir = '../../data/raw/domain/'
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)

csv_file_path = f'{output_relative_dir}all_postcode.csv'
df = pd.DataFrame(property_metadata)
df.to_csv(csv_file_path, index=False)
print(f"Data saved to {csv_file_path}")


Visiting https://www.domain.com.au/rent/?postcode=[3000, 3001, 3002, 3003, 3004, 3005, 3006, 3007, 3008, 3009, 3010, 3011, 3012, 3013, 3014, 3015, 3016, 3017, 3018, 3019, 3020, 3021, 3022, 3023, 3024, 3025, 3026, 3027, 3028, 3029, 3030, 3031, 3032, 3033, 3034, 3035, 3036, 3037, 3038, 3039, 3040, 3041, 3042, 3043, 3044, 3045, 3046, 3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056, 3057, 3058, 3059, 3060, 3061, 3062, 3063, 3064, 3065, 3066, 3067, 3068, 3069, 3070, 3071, 3072, 3073, 3074, 3075, 3076, 3077, 3078, 3079, 3080, 3081, 3082, 3083, 3084, 3085, 3086, 3087, 3088, 3089, 3090, 3091, 3092, 3093, 3094, 3095, 3096, 3097, 3098, 3099, 3100, 3101, 3102, 3103, 3104, 3105, 3106, 3107, 3108, 3109, 3110, 3111, 3112, 3113, 3114, 3115, 3116, 3117, 3118, 3119, 3120, 3121, 3122, 3123, 3124, 3125, 3126, 3127, 3128, 3129, 3130, 3131, 3132, 3133, 3134, 3135, 3136, 3137, 3138, 3139, 3140, 3141, 3142, 3143, 3144, 3145, 3146, 3147, 3148, 3149, 3150, 3151, 3152, 3153, 3154, 3155, 3156, 3157, 3

0it [00:00, ?it/s]


Visiting https://www.domain.com.au/rent/?postcode=[8000, 8001, 8002, 8003, 8004, 8005, 8006, 8007, 8008, 8009, 8010, 8011, 8012, 8013, 8014, 8015, 8016, 8017, 8018, 8019, 8020, 8021, 8022, 8023, 8024, 8025, 8026, 8027, 8028, 8029, 8030, 8031, 8032, 8033, 8034, 8035, 8036, 8037, 8038, 8039, 8040, 8041, 8042, 8043, 8044, 8045, 8046, 8047, 8048, 8049, 8050, 8051, 8052, 8053, 8054, 8055, 8056, 8057, 8058, 8059, 8060, 8061, 8062, 8063, 8064, 8065, 8066, 8067, 8068, 8069, 8070, 8071, 8072, 8073, 8074, 8075, 8076, 8077, 8078, 8079, 8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8088, 8089, 8090, 8091, 8092, 8093, 8094, 8095, 8096, 8097, 8098, 8099, 8100, 8101, 8102, 8103, 8104, 8105, 8106, 8107, 8108, 8109, 8110, 8111, 8112, 8113, 8114, 8115, 8116, 8117, 8118, 8119, 8120, 8121, 8122, 8123, 8124, 8125, 8126, 8127, 8128, 8129, 8130, 8131, 8132, 8133, 8134, 8135, 8136, 8137, 8138, 8139, 8140, 8141, 8142, 8143, 8144, 8145, 8146, 8147, 8148, 8149, 8150, 8151, 8152, 8153, 8154, 8155, 8156, 8157, 8

0it [00:00, ?it/s]

Data saved to ../../data/raw/domain/all_postcode.csv





In [6]:
# download past data
url = "https://www.dffh.vic.gov.au/moving-annual-rents-suburb-march-quarter-2023-excel"

output_directory = "../../data/raw/domain/"
file_name = "past_data.xlsx"
output_path = os.path.join(output_directory, file_name)

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Write the content of the response to the specified path
    with open(output_path, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded successfully and saved as {output_path}")
else:
    print(f"Failed to download the file. Status code: {response.status_code}")

File downloaded successfully and saved as ../../data/raw/domain/past_data.xlsx
