In [1]:
import json
import asyncio
import jmespath
import os
from httpx import AsyncClient, Response
from parsel import Selector
from typing import List, Dict
import random
import time
import requests

In [4]:
# one single property list
client = AsyncClient(
    # enable http2
    http2=True,
    # add basic browser headers to mimize blocking chancesd
    headers={
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-language": "en-US;en;q=0.9",
        "accept-encoding": "gzip, deflate, br",
    }
)

def parse_property_page(data: Dict) -> Dict:
    if not data:
        return
    result = jmespath.search(
        """{
    listingId: listingId,
    listingUrl: listingUrl,
    unitNumber: unitNumber,
    streetNumber: streetNumber,
    street: street,
    suburb: suburb,
    postcode: postcode,
    createdOn: createdOn,
    propertyType: propertyType,
    beds: beds,
    phone: phone,
    agencyName: agencyName,
    propertyDeveloperName: propertyDeveloperName,
    agencyProfileUrl: agencyProfileUrl,
    propertyDeveloperUrl: propertyDeveloperUrl,
    description: description,
    loanfinder: loanfinder,
    schools: schoolCatchment.schools,
    suburbInsights: suburbInsights,
    gallery: gallery,
    listingSummary: listingSummary,
    agents: agents,
    features: features,
    structuredFeatures: structuredFeatures,
    faqs: faqs
    }""",
        data,
    )
    return result

def parse_hidden_data(response: Response):
    selector = Selector(response.text)
    script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
    data = json.loads(script)
    return data["props"]["pageProps"]["componentProps"]

async def scrape_properties(urls: List[str]) -> List[Dict]:
    to_scrape = [client.get(url) for url in urls]
    properties = []
    for response in asyncio.as_completed(to_scrape):
        response = await response
        if response.status_code != 200:
            print(f"Request blocked with status code {response.status_code}. Retrying...")
            continue
        data = parse_hidden_data(response)
        data = parse_property_page(data)
        properties.append(data)
        # Introduce a random delay between requests to avoid blocking
        time.sleep(random.uniform(1, 3))
    print(f"scraped {len(properties)} property listings")
    return properties 


async def run():
    data = await scrape_properties(
        urls = [
            "https://www.domain.com.au/906-238-flinders-street-melbourne-vic-3000-14781843"
        ]
    )
    print(json.dumps(data, indent=2))

await run()

scraped 1 property listings
[
  {
    "listingId": 14781843,
    "listingUrl": "https://www.domain.com.au/906-238-flinders-street-melbourne-vic-3000-14781843",
    "unitNumber": "906",
    "streetNumber": "238",
    "street": "Flinders Street",
    "suburb": "Melbourne",
    "postcode": "3000",
    "createdOn": "2020-12-20T20:35:58.89",
    "propertyType": "Apartment / Unit / Flat",
    "beds": 1,
    "phone": "0433104343",
    "agencyName": "Melbourne Residential Real Estate",
    "propertyDeveloperName": "Melbourne Residential Real Estate",
    "agencyProfileUrl": "https://www.domain.com.au/real-estate-agencies/melbourneresidentialrealestate-22398/",
    "propertyDeveloperUrl": "https://www.domain.com.au/real-estate-agencies/melbourneresidentialrealestate-22398/",
    "description": [
      "This furnished large size studio has a lot to offer.",
      "",
      " Features include a fully equipped kitchen with gas cooking stove, new fridge, a good size open dining/ living area, wall t

In [2]:
# search page
client = AsyncClient(
    # enable http2
    http2=True,
    # add basic browser headers to mimize blocking chancesd
    headers={
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-language": "en-US;en;q=0.9",
        "accept-encoding": "gzip, deflate, br",
    }
)

def parse_search_page(data):
    """refine search pages data"""
    if not data:
        return    
    data = data["listingsMap"]
    result = []
    # iterate over card items in the search data
    for key in data.keys():
        item = data[key]
        parsed_data = jmespath.search(
            """{
        id: id,
        listingType: listingType,
        listingModel: listingModel
      }""",
        item,
        )
        # execulde the skeletonImages key from the data
        parsed_data["listingModel"].pop("skeletonImages")
        result.append(parsed_data)
    return result


def parse_hidden_data(response: Response):
    """parse json data from script tags"""
    selector = Selector(response.text)
    script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
    data = json.loads(script)
    with open("newdata.json", "w") as file:
        json.dump(data, file, indent=2)
    return data["props"]["pageProps"]["componentProps"]


async def scrape_search(url: str, max_scrape_pages: int = None) -> List[Dict]:
    """scrape property listings from search pages"""
    first_page = await client.get(url)
    print("scraping search page {}", url)
    data = parse_hidden_data(first_page)
    search_data = parse_search_page(data)
    # get the number of maximum search pages
    max_search_pages = data["totalPages"]
    # scrape all available pages if not max_scrape_pages or max_scrape_pages >= max_search_pages
    if max_scrape_pages and max_scrape_pages < max_search_pages:
        max_scrape_pages = max_scrape_pages
    else:
        max_scrape_pages = max_search_pages
    print(f"scraping search pagination, remaining ({max_scrape_pages - 1} more pages)")
    # add the remaining search pages to a scraping list
    other_pages = [client.get(str(first_page.url) + f"?page={page}") for page in range(2, max_scrape_pages + 1)]
    # scrape the remaining search pages concurrently
    for response in asyncio.as_completed(other_pages):
        response = await response
        assert response.status_code == 200, "request has been blocked"        
        # parse the data from script tag        
        data = parse_hidden_data(response)
        # aappend the data to the list after refining        
        search_data.extend(parse_search_page(data))
    print(f"scraped ({len(search_data)}) from {url}")
    return search_data



In [8]:
async def run():
    data = await scrape_search(
        url = "https://www.domain.com.au/rent/melbourne-vic-3000/",
        max_scrape_pages = 500
    )
    file_name = "property_data_VIC.json"
    with open(file_name, 'w') as json_file:
        json.dump(data, json_file, indent=4)

await run()    

scraping search page {} https://www.domain.com.au/rent/vic/
scraping search pagination, remaining (49 more pages)
scraped (1001) from https://www.domain.com.au/rent/vic/


In [9]:
# List of suburb URLs
suburb_urls = [
    "https://www.domain.com.au/rent/melbourne-vic-3000/",
    "https://www.domain.com.au/sale/carlton-vic-3053/",
    "https://www.domain.com.au/sale/west-melbourne-vic-3003/",
    "https://www.domain.com.au/sale/south-yarra-vic-3141/",
    "https://www.domain.com.au/sale/hawthorn-vic-3122/",
    "https://www.domain.com.au/sale/footscray-vic-3011/",
    "https://www.domain.com.au/sale/hawthorn-vic-3122/",
    "https://www.domain.com.au/sale/brunswick-vic-3056/"
    
]

async def scrape_and_save(url: str, max_scrape_pages: int):
    """Scrape property data from a given URL and save it to a JSON file."""
    data = await scrape_search(url=url, max_scrape_pages=max_scrape_pages)
    
    # Generate a file name based on the suburb name
    suburb_name = url.split('/')[-2]  # Extracts 'melbourne-vic-3000' from URL
    file_name = f"property_data_{suburb_name}.json"
    
    # Save the data to a JSON file
    with open(file_name, 'w') as json_file:
        json.dump(data, json_file, indent=4)
    
    print(f"Scraped and saved data for {suburb_name}")

async def run():
    """Scrape data for multiple suburbs concurrently."""
    tasks = [scrape_and_save(url, max_scrape_pages=50) for url in suburb_urls]
    await asyncio.gather(*tasks)

# Start the scraping process
await run()

scraping search page {} https://www.domain.com.au/rent/melbourne-vic-3000/
scraping search pagination, remaining (49 more pages)
scraping search page {} https://www.domain.com.au/sale/carlton-vic-3053/
scraping search pagination, remaining (49 more pages)
scraping search page {} https://www.domain.com.au/sale/south-yarra-vic-3141/
scraping search pagination, remaining (49 more pages)
scraping search page {} https://www.domain.com.au/sale/west-melbourne-vic-3003/
scraping search pagination, remaining (49 more pages)
scraping search page {} https://www.domain.com.au/sale/footscray-vic-3011/
scraping search pagination, remaining (43 more pages)
scraping search page {} https://www.domain.com.au/sale/brunswick-vic-3056/
scraping search pagination, remaining (24 more pages)
scraping search page {} https://www.domain.com.au/sale/hawthorn-vic-3122/
scraping search pagination, remaining (39 more pages)
scraping search page {} https://www.domain.com.au/sale/hawthorn-vic-3122/
scraping search pag

In [3]:
with open('property_data_west-melbourne-vic-3003.json', 'r') as file:
    json_files = json.load(file)
print(json_files)

[{'id': 2018553653, 'listingType': 'listing', 'listingModel': {'promoType': 'standardpp', 'url': '/504-112-adderley-street-west-melbourne-vic-3003-2018553653', 'images': ['https://rimh2.domainstatic.com.au/v7igJMTY9x0rA6C67vvp7znqNRY=/660x440/filters:format(jpeg):quality(80):no_upscale()/2018553653_8_1_230614_090013-w3839-h2560', 'https://rimh2.domainstatic.com.au/O1zepasDBkKcrYnVaQT5xl2c9vw=/660x440/filters:format(jpeg):quality(80):no_upscale()/2018553653_1_1_230614_090013-w3839-h2560', 'https://rimh2.domainstatic.com.au/xUpDQlx2rbOY-cCrecSWiv1gxNI=/660x440/filters:format(jpeg):quality(80):no_upscale()/2018553653_1_1_230526_124615-w3839-h2560', 'https://rimh2.domainstatic.com.au/wLmM3g74hSm6EOkzWzq9gdj7JDE=/660x440/filters:format(jpeg):quality(80):no_upscale()/2018553653_19_1_230526_124615-w3839-h2560', 'https://rimh2.domainstatic.com.au/TMyuyTOsQvbxCVtvp9O6MJKbMQI=/660x440/filters:format(jpeg):quality(80):no_upscale()/2018553653_5_1_230614_090013-w3839-h2560', 'https://rimh2.domainst

In [18]:
import pandas as pd
import json

with open('property_data_west-melbourne-vic-3003.json', 'r') as file:
    west_melbourne = json.load(file)

with open('property_data_south-yarra-vic-3141.json', 'r') as file:
    south_yarra = json.load(file)

with open('property_data_melbourne-vic-3000.json', 'r') as file:
    melbourne = json.load(file)

with open('property_data_hawthorn-vic-3122.json', 'r') as file:
    hawthorn = json.load(file)

with open('property_data_footscray-vic-3011.json', 'r') as file:
    footscray = json.load(file)

with open('property_data_carlton-vic-3053.json', 'r') as file:
    carlton = json.load(file)

with open('property_data_brunswick-vic-3056.json', 'r') as file:
    brunswick = json.load(file)


# Extract features from 'listingModel'
def extract_listing_features(json_data):
    data = []
    for item in json_data:
        if 'listingModel' in item:
            listing_model = item['listingModel']
            # Extracting top-level features
            features = {
                'id': item.get('id'),
                'promoType': listing_model.get('promoType'),
                'price':listing_model.get('price'),
                'hasVideo':listing_model.get('hasVideo'),
                'agentNames':listing_model.get('branding', {}).get('agentNames'),
                'brandName':listing_model.get('branding', {}).get('brandName'),
                'addressStreet': listing_model.get('address', {}).get('street'),
                'addressSuburb': listing_model.get('address', {}).get('suburb'),
                'addressState': listing_model.get('address', {}).get('state'),
                'addressPostcode': listing_model.get('address', {}).get('postcode'),
                'addressLat': listing_model.get('address', {}).get('lat'),
                'addressLng': listing_model.get('address', {}).get('lng'),
                'num_bath': listing_model.get('features',{}).get('baths'),
                'type': listing_model.get('features', {}).get('propertyType'),
                'formatted': listing_model.get('features', {}).get('propertyTypeFormatted'),
                'isRural': listing_model.get('features', {}).get('isRural'),
                'landSize': listing_model.get('features', {}).get('landSize'),
                'Retirement': listing_model.get('features', {}).get('isRetirement')
            }
            data.append(features)
    return pd.DataFrame(data)

# Convert JSON to DataFrame
west_melbourne_df = extract_listing_features(west_melbourne)
south_yarra_df = extract_listing_features(south_yarra)
melbourne_df = extract_listing_features(melbourne)
hawthorn_df = extract_listing_features(hawthorn)
footscray_df = extract_listing_features(footscray)
carlton_df = extract_listing_features(carlton)
brunswick_df = extract_listing_features(brunswick)


result = pd.concat([west_melbourne_df, south_yarra_df,melbourne_df,hawthorn_df,footscray_df,carlton_df,
                    brunswick_df], ignore_index=True)
# Display DataFrame
result.to_csv('../data/raw/domain.csv', index=False)

In [26]:
result['addressPostcode'].nunique()

35