In [29]:
import json
import asyncio
import jmespath
from httpx import AsyncClient, Response
from parsel import Selector
from typing import List, Dict

# Module-level async HTTP client; scrape_properties() issues its GET requests through it.
client = AsyncClient(
    # enable http2
    http2=True,
    # add basic browser headers to minimize blocking chances
    headers={
        # Listed exactly once: a repeated key in a dict literal silently keeps
        # only the last value, so a duplicate entry would override this one.
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
    },
)


def parse_property_page(data: Dict) -> Dict:
    """Project a raw property payload onto the listing fields of interest.

    Args:
        data: the property data mapping to refine.

    Returns:
        The result of the JMESPath projection below, or None when `data`
        is empty or missing.
    """
    if data:
        # JMESPath multiselect hash: each output key is read from the same-named
        # input key, except `schools`, which comes from schoolCatchment.schools.
        listing_fields = """{
    listingId: listingId,
    listingUrl: listingUrl,
    unitNumber: unitNumber,
    streetNumber: streetNumber,
    street: street,
    suburb: suburb,
    postcode: postcode,
    createdOn: createdOn,
    propertyType: propertyType,
    beds: beds,
    phone: phone,
    agencyName: agencyName,
    propertyDeveloperName: propertyDeveloperName,
    agencyProfileUrl: agencyProfileUrl,
    propertyDeveloperUrl: propertyDeveloperUrl,
    description: description,
    loanfinder: loanfinder,
    schools: schoolCatchment.schools,
    suburbInsights: suburbInsights,
    gallery: gallery,
    listingSummary: listingSummary,
    agents: agents,
    features: features,
    structuredFeatures: structuredFeatures,
    faqs: faqs
    }"""
        return jmespath.search(listing_fields, data)
    # nothing to refine
    return None


def parse_hidden_data(response: Response):
    """Extract the JSON payload embedded in the page's `__NEXT_DATA__` script tag.

    Args:
        response: the HTTP response whose `.text` holds the page HTML.

    Returns:
        The value stored under props -> pageProps -> componentProps of the
        decoded JSON document.
    """
    next_data_script = (
        Selector(response.text)
        .xpath("//script[@id='__NEXT_DATA__']/text()")
        .get()
    )
    payload = json.loads(next_data_script)
    page_props = payload["props"]["pageProps"]
    return page_props["componentProps"]


async def scrape_properties(urls: List[str]) -> List[Dict]:
    """Fetch each property page in `urls` and return the parsed listings.

    Args:
        urls: property page URLs to request.

    Returns:
        One parsed entry per URL, collected in the order the responses
        complete (which may differ from the order of `urls`).

    Raises:
        AssertionError: if any response has a status code other than 200.
    """
    # one pending GET per property page
    pending_requests = [client.get(page_url) for page_url in urls]
    listings = []
    # handle each page as soon as its response is available
    for next_done in asyncio.as_completed(pending_requests):
        page = await next_done
        assert page.status_code == 200, "request has been blocked"
        listings.append(parse_property_page(parse_hidden_data(page)))
    scraped_count = len(listings)
    print(f"scraped {scraped_count} property listings")
    return listings

In [30]:
# property page URLs passed to scrape_properties() by run()
urls = [
    "https://www.domain.com.au/906-238-flinders-street-melbourne-vic-3000-14781843",
    "https://www.domain.com.au/715-39-lonsdale-st-melbourne-vic-3000-15483048",
    "https://www.domain.com.au/car-park-228-la-trobe-st-melbourne-vic-3000-17190068",
]

In [31]:
async def run():
    """Scrape the pages listed in `urls` and print the result as indented JSON."""
    listings = await scrape_properties(urls=urls)
    # print the data in JSON format
    print(json.dumps(listings, indent=2))

In [32]:
# Top-level `await` relies on the notebook kernel running cells inside an event
# loop; outside a notebook this would presumably need asyncio.run(run()) instead.
await run()

scraped 3 property listings
[
  {
    "listingId": 14781843,
    "listingUrl": "https://www.domain.com.au/906-238-flinders-street-melbourne-vic-3000-14781843",
    "unitNumber": "906",
    "streetNumber": "238",
    "street": "Flinders Street",
    "suburb": "Melbourne",
    "postcode": "3000",
    "createdOn": "2020-12-20T20:35:58.89",
    "propertyType": "Apartment / Unit / Flat",
    "beds": 1,
    "phone": "0433104343",
    "agencyName": "Melbourne Residential Real Estate",
    "propertyDeveloperName": "Melbourne Residential Real Estate",
    "agencyProfileUrl": "https://www.domain.com.au/real-estate-agencies/melbourneresidentialrealestate-22398/",
    "propertyDeveloperUrl": "https://www.domain.com.au/real-estate-agencies/melbourneresidentialrealestate-22398/",
    "description": [
      "This furnished large size studio has a lot to offer.",
      "",
      " Features include a fully equipped kitchen with gas cooking stove, new fridge, a good size open dining/ living area, wall t