In [2]:
import json
import pandas as pd
from pathlib import Path
import asyncio
import jmespath
import os
from httpx import AsyncClient, Response
from parsel import Selector
from typing import List, Dict
import random
import time
import requests

In [4]:
# Approach adapted from:
#   https://scrapfly.io/blog/how-to-scrape-domain-com-au-real-estate-property-data/

# Shared async HTTP client used for every search-page request in this notebook.
client = AsyncClient(
    # enable HTTP/2
    # NOTE(review): httpx needs its optional HTTP/2 extra for this — confirm it is installed
    http2=True,
    # add basic browser headers to minimize the chance of being blocked
    headers={
        # Must appear only once: a repeated key in a dict literal silently keeps
        # the LAST value, which is how a malformed "en-US;en;q=0.9" ended up
        # being the header actually sent.
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # NOTE(review): advertising "br" presumably requires a brotli decoder
        # package to be installed alongside httpx — confirm
        "accept-encoding": "gzip, deflate, br",
    }
)

def parse_search_page(data):
    """Reduce a search page's hidden data to a list of listing records.

    Args:
        data: mapping that holds a ``listingsMap`` entry, itself a mapping of
            listing key -> listing dict. May be ``None`` or empty.

    Returns:
        A list with one ``{"id", "listingType", "listingModel"}`` dict per
        listing; fields absent from a listing are ``None``. An empty list is
        returned when ``data`` is empty or ``None`` so that callers can always
        ``extend`` with the result.

    Raises:
        KeyError: if ``data`` is non-empty but has no ``listingsMap`` key.

    Note:
        ``skeletonImages`` is removed from each ``listingModel`` in place, so
        the dicts inside ``data`` are modified as well.
    """
    if not data:
        return []
    wanted_fields = ("id", "listingType", "listingModel")
    result = []
    # iterate over card items in the search data
    for item in data["listingsMap"].values():
        if not isinstance(item, dict):
            # nothing to project from a null / malformed card
            continue
        parsed_data = {field: item.get(field) for field in wanted_fields}
        listing_model = parsed_data["listingModel"]
        # exclude the skeletonImages key; tolerate listings that lack it
        # (or that have no listingModel at all)
        if isinstance(listing_model, dict):
            listing_model.pop("skeletonImages", None)
        result.append(parsed_data)
    return result


def parse_hidden_data(response: Response):
    """Extract the page's embedded JSON state from its ``__NEXT_DATA__`` script tag.

    Args:
        response: HTTP response whose ``text`` is the HTML of a search page.

    Returns:
        The ``props.pageProps.componentProps`` object of the embedded JSON.

    Raises:
        ValueError: if the HTML has no ``__NEXT_DATA__`` script tag.
        KeyError: if the embedded JSON lacks the expected nesting.

    Side effects:
        Writes the full decoded JSON to ``newdata.json`` in the working
        directory, overwriting the file on every call.
    """
    selector = Selector(response.text)
    script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
    if script is None:
        # Without this guard json.loads(None) fails with an opaque TypeError.
        raise ValueError(
            f"no __NEXT_DATA__ script tag found in response from {response.url}"
        )
    data = json.loads(script)
    # debug dump of the most recently parsed page
    with open("newdata.json", "w") as file:
        json.dump(data, file, indent=2)
    return data["props"]["pageProps"]["componentProps"]


async def scrape_search(url: str, max_scrape_pages: int = None) -> List[Dict]:
    """Scrape property listings from a search URL and its pagination.

    Args:
        url: first search-results page to fetch.
        max_scrape_pages: upper bound on the number of pages to scrape
            (including the first). ``None`` or ``0`` means every page the
            site reports via ``totalPages``.

    Returns:
        The parsed listing records of all scraped pages. Records of pages
        2..N arrive in completion order, not page order.

    Raises:
        AssertionError: if any page responds with a non-200 status.
    """
    first_page = await client.get(url)
    print(f"scraping search page {url}")
    # fail early with the same message the paginated requests use
    assert first_page.status_code == 200, "request has been blocked"
    data = parse_hidden_data(first_page)
    # `or []` keeps the accumulator a list even if a page parses to nothing
    search_data = parse_search_page(data) or []
    # get the number of maximum search pages
    max_search_pages = data["totalPages"]
    # scrape all available pages unless a smaller limit was requested
    if max_scrape_pages and max_scrape_pages < max_search_pages:
        page_limit = max_scrape_pages
    else:
        page_limit = max_search_pages
    print(f"scraping search pagination, remaining ({max(page_limit - 1, 0)} more pages)")
    # build the remaining page URLs without clobbering an existing query string
    base_url = str(first_page.url)
    separator = "&" if "?" in base_url else "?"
    other_pages = [
        client.get(f"{base_url}{separator}page={page}")
        for page in range(2, page_limit + 1)
    ]
    # scrape the remaining search pages concurrently
    for pending in asyncio.as_completed(other_pages):
        response = await pending
        assert response.status_code == 200, "request has been blocked"
        # parse the data from the script tag
        data = parse_hidden_data(response)
        # append the refined data to the list
        search_data.extend(parse_search_page(data) or [])
    print(f"scraped ({len(search_data)}) from {url}")
    return search_data



In [7]:
# Suburb searches to scrape, as (listing mode, suburb slug) pairs
_SUBURB_SEARCHES = (
    ("rent", "melbourne-vic-3000"),
    # NOTE(review): the only "sale" search among "rent" ones — confirm intended
    ("sale", "carlton-vic-3053"),
    ("rent", "west-melbourne-vic-3003"),
    ("rent", "south-yarra-vic-3141"),
    ("rent", "hawthorn-vic-3122"),
    ("rent", "footscray-vic-3011"),
    ("rent", "brunswick-vic-3056"),
    ("rent", "richmond-vic-3121"),
    ("rent", "box-hill-vic-3128"),
    ("rent", "werribee-vic-3030"),
    ("rent", "coburg-vic-3058"),
    ("rent", "geelong-vic-3220"),
)

# List of suburb URLs (one search-results page per pair above, same order)
suburb_urls = [
    f"https://www.domain.com.au/{mode}/{slug}/" for mode, slug in _SUBURB_SEARCHES
]

async def scrape_and_save(url: str, max_scrape_pages: int):
    """Scrape property data from a given URL and save it to a JSON file.

    Args:
        url: suburb search URL, e.g. ``https://www.domain.com.au/rent/melbourne-vic-3000/``.
        max_scrape_pages: page limit forwarded to ``scrape_search``.

    Side effects:
        Creates ``../data/raw/property_data_json`` (relative to the working
        directory) if needed and writes ``property_data_<suburb>.json`` there,
        overwriting any existing file of that name.
    """
    data = await scrape_search(url=url, max_scrape_pages=max_scrape_pages)

    # Suburb slug = last path segment, e.g. 'melbourne-vic-3000'. Dropping any
    # query/fragment and the trailing slash first makes this independent of
    # whether the URL ends with '/'.
    url_path = url.split('?', 1)[0].split('#', 1)[0]
    suburb_name = url_path.rstrip('/').split('/')[-1]

    # Directory that collects the per-suburb JSON files
    directory_path = Path("../data/raw/property_data_json")

    # Create the directory if it doesn't exist
    directory_path.mkdir(parents=True, exist_ok=True)

    # One file per suburb, named after its slug
    file_path = directory_path / f"property_data_{suburb_name}.json"

    # Save the data to a JSON file at the specified path
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

    print(f"Scraped and saved data for {suburb_name} to {file_path}")


async def run():
    """Start one scrape_and_save job per entry in suburb_urls and wait for all of them.

    Every job is capped at 50 search pages; the jobs run concurrently.
    """
    jobs = []
    for suburb_url in suburb_urls:
        jobs.append(scrape_and_save(suburb_url, max_scrape_pages=50))
    await asyncio.gather(*jobs)

# Start the scraping process.
# Top-level `await` is valid here only because the notebook kernel executes
# cells inside a running event loop; a plain script would need
# `asyncio.run(run())` instead.
await run()

scraping search page {} https://www.domain.com.au/rent/werribee-vic-3030/
scraping search pagination, remaining (44 more pages)
scraping search page {} https://www.domain.com.au/rent/box-hill-vic-3128/
scraping search pagination, remaining (13 more pages)
scraping search page {} https://www.domain.com.au/rent/coburg-vic-3058/
scraping search pagination, remaining (22 more pages)
scraping search page {} https://www.domain.com.au/rent/geelong-vic-3220/
scraping search pagination, remaining (12 more pages)
scraping search page {} https://www.domain.com.au/rent/melbourne-vic-3000/
scraping search pagination, remaining (49 more pages)
scraping search page {} https://www.domain.com.au/rent/footscray-vic-3011/
scraping search pagination, remaining (28 more pages)
scraping search page {} https://www.domain.com.au/rent/richmond-vic-3121/
scraping search pagination, remaining (49 more pages)
scraping search page {} https://www.domain.com.au/rent/west-melbourne-vic-3003/
scraping search paginatio

In [8]:
# Directory holding the per-suburb JSON files written by the scraping cell
RAW_JSON_DIR = Path('../data/raw/property_data_json')


def _load_suburb_listings(suburb_slug):
    """Load the scraped listing records saved for one suburb.

    Args:
        suburb_slug: suburb part of the file name, e.g. ``'carlton-vic-3053'``.

    Returns:
        The decoded content of ``property_data_<suburb_slug>.json``.

    Raises:
        FileNotFoundError: if that suburb has not been scraped and saved yet.
    """
    json_path = RAW_JSON_DIR / f"property_data_{suburb_slug}.json"
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)


# One variable per suburb, loaded in the same order as before
west_melbourne = _load_suburb_listings('west-melbourne-vic-3003')
south_yarra = _load_suburb_listings('south-yarra-vic-3141')
melbourne = _load_suburb_listings('melbourne-vic-3000')
hawthorn = _load_suburb_listings('hawthorn-vic-3122')
footscray = _load_suburb_listings('footscray-vic-3011')
carlton = _load_suburb_listings('carlton-vic-3053')
brunswick = _load_suburb_listings('brunswick-vic-3056')
box_hill = _load_suburb_listings('box-hill-vic-3128')
coburg = _load_suburb_listings('coburg-vic-3058')
geelong = _load_suburb_listings('geelong-vic-3220')
richmond = _load_suburb_listings('richmond-vic-3121')
werribee = _load_suburb_listings('werribee-vic-3030')



# Extract features from 'listingModel'
def extract_listing_features(json_data):
    """Flatten scraped listing records into a DataFrame of selected features.

    Args:
        json_data: iterable of listing dicts, each optionally carrying a
            ``listingModel`` dict with ``branding``, ``address`` and
            ``features`` sub-dicts.

    Returns:
        A DataFrame with one row per listing that has a ``listingModel`` dict.
        Fields that are absent come through as missing values. Listings whose
        ``listingModel`` is absent or null are skipped.
    """
    data = []
    for item in json_data:
        listing_model = item.get('listingModel')
        if not isinstance(listing_model, dict):
            continue
        # `or {}` rather than a .get default: a section that is present but
        # null must behave like a missing one instead of raising AttributeError
        branding = listing_model.get('branding') or {}
        address = listing_model.get('address') or {}
        property_features = listing_model.get('features') or {}
        data.append({
            'id': item.get('id'),
            'promoType': listing_model.get('promoType'),
            'price': listing_model.get('price'),
            'hasVideo': listing_model.get('hasVideo'),
            'agentNames': branding.get('agentNames'),
            'brandName': branding.get('brandName'),
            'addressStreet': address.get('street'),
            'addressSuburb': address.get('suburb'),
            'addressState': address.get('state'),
            'addressPostcode': address.get('postcode'),
            'addressLat': address.get('lat'),
            'addressLng': address.get('lng'),
            'num_bath': property_features.get('baths'),
            'type': property_features.get('propertyType'),
            'formatted': property_features.get('propertyTypeFormatted'),
            'isRural': property_features.get('isRural'),
            'landSize': property_features.get('landSize'),
            'Retirement': property_features.get('isRetirement'),
        })
    return pd.DataFrame(data)

# Convert each suburb's raw listing records into its own DataFrame
suburb_frames = [
    extract_listing_features(listings)
    for listings in (
        west_melbourne, south_yarra, melbourne, hawthorn, footscray, carlton,
        brunswick, box_hill, coburg, geelong, richmond, werribee,
    )
]

# Keep one named frame per suburb (same order as the tuple above)
(
    west_melbourne_df,
    south_yarra_df,
    melbourne_df,
    hawthorn_df,
    footscray_df,
    carlton_df,
    brunswick_df,
    box_hill_df,
    coburg_df,
    geelong_df,
    richmond_df,
    werribee_df,
) = suburb_frames

# Stack all suburbs into a single frame with a fresh 0..n-1 index
result = pd.concat(suburb_frames, ignore_index=True)
# Write the combined table to CSV, leaving out the index column
result.to_csv('../data/raw/domain.csv', index=False)