In [1]:
import json
import asyncio
import jmespath
from httpx import AsyncClient, Response
from parsel import Selector
from typing import List, Dict
import random
import time
import requests
from bs4 import BeautifulSoup

In [4]:
client = AsyncClient(
    http2=True,
    follow_redirects=True,  # Enable following redirects
    headers={
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    }
)


def parse_property_page(data: Dict) -> Dict:
    if not data:
        return
    result = jmespath.search(
        """{
    listingId: listingId,
    listingUrl: listingUrl,
    unitNumber: unitNumber,
    streetNumber: streetNumber,
    street: street,
    suburb: suburb,
    postcode: postcode,
    createdOn: createdOn,
    propertyType: propertyType,
    beds: beds,
    phone: phone,
    agencyName: agencyName,
    propertyDeveloperName: propertyDeveloperName,
    agencyProfileUrl: agencyProfileUrl,
    propertyDeveloperUrl: propertyDeveloperUrl,
    description: description,
    loanfinder: loanfinder,
    schools: schoolCatchment.schools,
    suburbInsights: suburbInsights,
    gallery: gallery,
    listingSummary: listingSummary,
    agents: agents,
    features: features,
    structuredFeatures: structuredFeatures,
    faqs: faqs
    }""",
        data,
    )
    return result

def parse_hidden_data(response: Response):
    selector = Selector(response.text)
    script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
    data = json.loads(script)
    return data["props"]["pageProps"]["componentProps"]

async def scrape_properties(urls: List[str]) -> List[Dict]:
    to_scrape = [client.get(url) for url in urls]
    properties = []
    for response in asyncio.as_completed(to_scrape):
        response = await response
        if response.status_code != 200:
            print(f"Request blocked with status code {response.status_code}. Retrying...")
            continue
        data = parse_hidden_data(response)
        data = parse_property_page(data)
        properties.append(data)
        # Introduce a random delay between requests to avoid blocking
        time.sleep(random.uniform(1, 3))
    print(f"scraped {len(properties)} property listings")
    return properties 

async def run():
    data = await scrape_properties(
        urls = [
            "https://www.domain.com.au/610-399-bourke-street-melbourne-vic-3000-2018835548"
        ]
    )
    print(json.dumps(data, indent=2))

await run()

KeyError: 'componentProps'

In [5]:
api_key = 'key_be7428b4085e9bf2e77104d67ab79c2c'

# API endpoint
url = 'https://api.domain.com.au/v1/listings/residential/_search'

# Headers
headers = {
    'Authorization': f'Bearer {api_key}',
    'Content-Type': 'application/json'
}

# Payload for properties in Victoria
payload = {
    "listingType": "Sale",
    "propertyTypes": ["House", "ApartmentUnitFlat"],
    "locations": [
        {
            "state": "VIC",  # Victoria state code
            "region": "",
            "area": "",
            "suburb": "",  # Can be left empty to fetch properties from all suburbs in Victoria
            "postCode": ""  # Can be specified if looking for a particular postcode
        }
    ],
    "minBedrooms": 2,
    "maxBedrooms": 4,
    "pageSize": 50,  # Number of listings per page
    "pageNumber": 1
}

# Function to fetch and process the data
def fetch_properties_victoria():
    properties = []
    page_number = 1
    while True:
        payload['pageNumber'] = page_number
        response = requests.post(url, headers=headers, json=payload)
        if response.status_code == 200:
            listings = response.json()
            if not listings:  # Break if no more listings
                break

            for listing in listings:
                property_info = {
                    'Listing ID': listing.get('listingId', 'N/A'),
                    'Price': listing.get('priceDetails', {}).get('displayPrice', 'N/A'),
                    'Address': listing.get('addressParts', {}).get('displayAddress', 'N/A'),
                    'Bedrooms': listing.get('bedrooms', 'N/A'),
                    'Bathrooms': listing.get('bathrooms', 'N/A'),
                    'Car Spaces': listing.get('carspaces', 'N/A'),
                }
                properties.append(property_info)

            page_number += 1
        else:
            print(f"Failed to fetch data: {response.status_code} - {response.text}")
            break
    return properties

# Fetch data
properties_vic = fetch_properties_victoria()

# Convert to DataFrame
df = pd.DataFrame(properties_vic)

# Save to CSV
df.to_csv('domain_victoria_properties.csv', index=False)

print("Data saved to 'domain_victoria_properties.csv'")


NameError: name 'requests' is not defined