### This file scrapes and downloads rental listing data from domain.com.au

Yuecheng Wang, Aug 30 — modified from ./scripts/scrape.py

In [1]:
# built-in imports
import pandas as pd
import re
from json import dump
from tqdm import tqdm

from collections import defaultdict
import urllib.request

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import os

In [2]:
# constants
# Root of the site being scraped: the search paths in `properties` are appended
# to it to build result-page URLs, and it is also used to filter the links
# found on each result page.
BASE_URL = "https://www.domain.com.au"

In [3]:
# Property searches to scrape.
# Every search shares one query path; only the property type and the bedroom
# count vary between entries.
_SEARCH_PATH = "/rent/?ptype={ptype}&bedrooms={bedrooms}&price=50-any&sort=default-desc&state=vic"

# One row per search: (label, property type, bedrooms, number of result pages).
_SEARCHES = [
    # ("flat1b", "apartment", 1, 50),
    # ("flat2b", "apartment", 2, 20),
    ("flat3b", "apartment", 3, 20),
    ("house2b", "house", 2, 20),
    ("house3b", "house", 3, 20),
    ("house4b", "house", 4, 20),
]

# Each entry is (label, search path relative to the site root, pages to visit).
properties = [
    (label, _SEARCH_PATH.format(ptype=kind, bedrooms=beds), pages)
    for label, kind, beds, pages in _SEARCHES
]

In [4]:
# Save scraped data directly to CSV files

# Headers sent with every request.
REQUEST_HEADERS = {'User-Agent': "PostmanRuntime/7.6.0"}
# Seconds to wait on a single request before giving up, so that one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Patterns are compiled once here instead of on every loop iteration.
LISTING_HREF_PATTERN = re.compile(re.escape(BASE_URL))
ROOM_PATTERN = re.compile(r'\d+\s[A-Za-z]+')
PARKING_PATTERN = re.compile(r'\S+\s[A-Za-z]+')
COORDINATE_PATTERN = re.compile(r'destination=([-\s,\d\.]+)')

# Column order of the output CSV; also guarantees a header row when nothing
# was scraped successfully.
CSV_COLUMNS = ['Address', 'Cost', 'Rooms', 'Parking', 'Coordinates']


def _fetch_soup(url):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    The HTTP response is closed before returning.

    Raises:
        OSError: if the request fails (``urllib.error.URLError`` and
            ``HTTPError`` are ``OSError`` subclasses, as are socket timeouts).
    """
    request = Request(url, headers=REQUEST_HEADERS)
    with urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        return BeautifulSoup(response.read(), "html.parser")


def _listing_urls(soup):
    """Return the hrefs of the property-address links on a results page.

    Returns an empty list when the page has no results container.
    """
    results = soup.find("ul", {"data-testid": "results"})
    if results is None:
        return []
    return [
        link['href']
        for link in results.find_all("a", href=LISTING_HREF_PATTERN)
        # Anchors without a class attribute are skipped instead of raising.
        if 'address' in link.get('class', [])
    ]


def _feature_summary(features, keywords, pattern):
    """Join the first *pattern* match of every feature mentioning a keyword.

    Features whose text contains a keyword but does not match *pattern* are
    skipped rather than aborting the whole listing.
    """
    matches = []
    for feature in features:
        text = feature.text
        if not any(keyword in text for keyword in keywords):
            continue
        match = pattern.search(text)
        if match:
            matches.append(match.group(0))
    return ', '.join(matches)


def _extract_coordinates(soup):
    """Return the floats parsed from the ``destination=`` part of the outbound link.

    Returns ``[None, None]`` when the link, its href, or the coordinates
    cannot be found or parsed.
    """
    try:
        href = soup.find("a", {"target": "_blank", 'rel': "noopener noreferrer"}).attrs['href']
        return [float(coord) for coord in COORDINATE_PATTERN.findall(href)[0].split(',')]
    except (AttributeError, KeyError, IndexError, ValueError):
        return [None, None]


def _parse_property(soup):
    """Build one flat metadata record from a property page.

    Raises:
        AttributeError: if the address, price, or features element is missing.
    """
    address = soup.find("h1", {"class": "css-164r41r"}).text
    cost_text = soup.find("div", {"data-testid": "listing-details__summary-title"}).text
    rooms = soup.find("div", {"data-testid": "property-features"}).find_all(
        "span", {"data-testid": "property-features-text-container"}
    )
    return {
        'Address': address,
        'Cost': cost_text,
        'Rooms': _feature_summary(rooms, ('Bed', 'Bath'), ROOM_PATTERN),
        'Parking': _feature_summary(rooms, ('Parking',), PARKING_PATTERN),
        'Coordinates': _extract_coordinates(soup),
    }


for ptype, purl, ppage in properties:
    url_links = []
    property_metadata = []
    N_PAGES = range(1, ppage + 1)

    # Generate list of URLs to visit
    for page in N_PAGES:
        url = BASE_URL + f"{purl}&page={page}"
        print(f"Visiting {url}")
        try:
            bs_object = _fetch_soup(url)
        except OSError as error:
            # A single failed results page should not abort the whole run.
            print(f"Could not load {url}: {error}")
            continue
        url_links.extend(_listing_urls(bs_object))

    # For each URL, scrape the property metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0
    for property_url in pbar:
        total_count += 1
        try:
            bs_object = _fetch_soup(property_url)
            property_metadata.append(_parse_property(bs_object))
        except (OSError, AttributeError):
            # Download failures and pages missing a required element are
            # reported and counted as unsuccessful.
            print(f"Issue with {property_url}")
        else:
            success_count += 1

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

    # Save to CSV with one row per property
    output_relative_dir = '../../data/raw/domain/'
    os.makedirs(output_relative_dir, exist_ok=True)

    csv_file_path = f'{output_relative_dir}{ptype}.csv'
    df = pd.DataFrame(property_metadata, columns=CSV_COLUMNS)
    df.to_csv(csv_file_path, index=False)
    print(f"Data saved to {csv_file_path}")

Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=1
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=2
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=3
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=4
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=5
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=6
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=7
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=8
Visiting https://www.domain.com.au/rent/?ptype=apartment&bedrooms=3&pric

100% successful: 100%|████████████████████████████████████████████████████████████████| 401/401 [04:24<00:00,  1.51it/s]


Data saved to ../../data/raw/domain/flat3b.csv
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=1
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=2
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=3
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=4
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=5
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=6
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=7
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=2&price=50-any&sort=default-desc&state=vic&page=8
Visiting https://www.domain.com.au/rent/?ptype=house&bedr

100% successful: 100%|████████████████████████████████████████████████████████████████| 401/401 [04:25<00:00,  1.51it/s]


Data saved to ../../data/raw/domain/house2b.csv
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=1
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=2
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=3
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=4
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=5
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=6
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=7
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=3&price=50-any&sort=default-desc&state=vic&page=8
Visiting https://www.domain.com.au/rent/?ptype=house&bed

100% successful: 100%|████████████████████████████████████████████████████████████████| 401/401 [04:18<00:00,  1.55it/s]


Data saved to ../../data/raw/domain/house3b.csv
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=1
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=2
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=3
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=4
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=5
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=6
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=7
Visiting https://www.domain.com.au/rent/?ptype=house&bedrooms=4&price=50-any&sort=default-desc&state=vic&page=8
Visiting https://www.domain.com.au/rent/?ptype=house&bed

100% successful: 100%|████████████████████████████████████████████████████████████████| 401/401 [04:24<00:00,  1.52it/s]

Data saved to ../../data/raw/domain/house4b.csv



