In [1]:
import json
import os
import re

import requests

## Get URLs from XML Files

Due to restrictions on the Domain website, the XML files below were manually downloaded from [here](https://www.domain.com.au/sitemap-listings-rent.xml).
<br> There are 10 rent listing files, dated between **Sept 2023 and Aug 2024**.

In [5]:
def extract_property_urls_from_files(file_paths, base_dir='../data/raw/internal/rent_listing/'):
    '''
        Extract the property urls from sitemap xml files.

        Parameters:
            file_paths: iterable of xml file names, relative to `base_dir`.
            base_dir: directory that contains the files. Defaults to the
                raw rent listing folder used by this notebook.

        Returns:
            A list with every matching url, in the order of `file_paths`
            and then in order of appearance inside each file.
            Duplicates are kept.

        Raises:
            OSError: if a file cannot be opened (propagated from `open`).
    '''

    # Pattern to match urls: the path must start with digits and a hyphen,
    # followed by lowercase letters, digits or hyphens.
    # Compiled once because it is the same for every file.
    url_pattern = re.compile(r"https://www\.domain\.com\.au/\d+-[a-z0-9-]+")

    all_urls = []

    for file_path in file_paths:
        # Read the file; os.path.join works whether or not base_dir ends
        # with a path separator
        with open(os.path.join(base_dir, file_path), 'r', encoding='utf-8') as file:
            content = file.read()

        # Add every url found in this file to the combined list
        all_urls.extend(url_pattern.findall(content))

    return all_urls

# Names of the rent listing sitemap files that were downloaded manually
# (one per line, newest first).
file_paths = [
    'sitemap-listings-rent-2024080105.xml',
    'sitemap-listings-rent-2024061102.xml',
    'sitemap-listings-rent-2024051404.xml',
    'sitemap-listings-rent-2024041604.xml',
    'sitemap-listings-rent-2024031803.xml',
    'sitemap-listings-rent-2024021823.xml',
    'sitemap-listings-rent-2023121920.xml',
    'sitemap-listings-rent-2023112202.xml',
    'sitemap-listings-rent-2023102603.xml',
    'sitemap-listings-rent-2023092702.xml',
]
property_urls = extract_property_urls_from_files(file_paths)

# Show how many urls were found, then a small sample of them
print(len(property_urls), property_urls[:5], sep='\n')

10376
['https://www.domain.com.au/322-peats-ferry-road-hornsby-nsw-2077-17151663', 'https://www.domain.com.au/9-5-17-queen-street-newtown-nsw-2042-17151662', 'https://www.domain.com.au/18-liney-st-clemton-park-campsie-nsw-2194-17151658', 'https://www.domain.com.au/18-liney-street-clemton-park-nsw-2206-17151656', 'https://www.domain.com.au/14-trebbiano-dr-cessnock-nsw-2325-17151654']


In [6]:
# Keep only the urls whose lowercased text contains the "-vic-" marker
# (Victoria state)
vic_property_urls = list(
    filter(lambda listing_url: '-vic-' in listing_url.lower(), property_urls)
)
print("Rent Properties in VIC: ",len(vic_property_urls))

Rent Properties in VIC:  3277


In [7]:
def extract_property_id(url):
    '''
        Pull the property ID out of a listing url.

        The ID is the run of digits that follows the last hyphen at the
        end of the url, and it is returned as a string. When the url does
        not end that way, the url is printed and None is returned.
    '''

    # A trailing "-<digits>" is the ID
    id_match = re.search(r'-(\d+)$', url)

    if id_match is None:
        # Print the url that has no ID so it can be inspected
        print(url)
        return None

    return id_match.group(1)
    
# Extract the property ID of every VIC url.
# extract_property_id returns None (after printing the url) when a url has
# no trailing ID, so those entries are dropped here: the count printed
# below is then really the number of valid IDs, and no None is passed on
# to the API calls later.
property_ids = [
    property_id
    for property_id in map(extract_property_id, vic_property_urls)
    if property_id is not None
]

print('Valid property ID: ',len(property_ids)) 
print(property_ids[:5]) 

Valid property ID:  3277
['17151652', '17151645', '17151640', '17151631', '15741275']


## Get Rent Information From Domain

In [8]:
# Ref https://www.youtube.com/watch?v=_OJBOy00IJ0  

def get_property_info(property_ids,client_id,client_secret,timeout=30):
    ''' 
        Get the information of each property from the Domain listings API.

        A token is requested once with client_id and client_secret, then
        one GET request is made per property ID.

        Parameters:
            property_ids: iterable of property IDs. None entries are
                skipped; other values are converted to text in the url.
            client_id: client ID of the Domain API project.
            client_secret: client secret of the Domain API project.
            timeout: seconds to wait for each HTTP request before giving
                up, so a stalled connection cannot hang the whole run.

        Returns:
            A list with the decoded json response of every property that
            was fetched successfully. Failed properties are reported with
            a printed message and left out of the list.

        Raises:
            requests.exceptions.RequestException: if the token request
                fails or returns an HTTP error status.
            KeyError: if the token response has no 'access_token'.
    '''

    # Scope to get data
    scopes = ['api_listings_read']
    auth_url = 'https://auth.domain.com.au/v1/connect/token'
    url_endpoint = 'https://api.domain.com.au/sandbox/v1/listings/'

    # Drop missing IDs up front; concatenating None into the url would fail
    property_ids = [property_id for property_id in property_ids if property_id is not None]

    # Nothing to fetch: do not spend a token request
    if not property_ids:
        return []

    property_data = []

    # One session for all calls so the connection is reused
    with requests.Session() as session:
        # Make request using client_id and client_secret
        response = session.post(auth_url, data = {
                            'client_id':client_id,
                            'client_secret':client_secret,
                            'grant_type':'client_credentials',
                            # OAuth scopes are sent as one space-delimited string
                            'scope':' '.join(scopes),
                            # NOTE(review): this is sent as a form field, not
                            # as an HTTP header. Kept so the token request is
                            # unchanged - confirm whether it can be dropped.
                            'Content-Type':'text/json'
                            }, timeout=timeout)

        # Fail with a clear HTTP error instead of a KeyError further down
        response.raise_for_status()

        # Get access_token
        json_res = response.json()
        access_token = json_res['access_token']
        auth = {'Authorization':'Bearer ' + access_token}

        # Make a call for each property and get the property information
        for property_id in property_ids:
            url = f'{url_endpoint}{property_id}'
            try:
                res = session.get(url, headers=auth, timeout=timeout)
                res.raise_for_status()  
                property_data.append(res.json())
            # ValueError covers a response body that is not valid json, so
            # one bad response cannot discard the data collected so far
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"Request failed for property ID {property_id}: {e}")

    return property_data


<div class="alert alert-block alert-danger">
<b>ATTENTION:</b> Please do not run the cell below more than once per day, <br>because each project allows only a limited number of API calls per day (a maximum of 500 requests per client_id and client_secret).
</div>

In [9]:
# Runs for more than 10 minutes !!!
# The API credentials are read from the environment so they are never saved
# inside the notebook. Set both variables before running, as comma-separated
# lists in matching order, for example:
#   DOMAIN_CLIENT_IDS="client_aaa,client_bbb"
#   DOMAIN_CLIENT_SECRETS="secret_aaa,secret_bbb"
# NOTE: credentials that were previously hard-coded in this cell should be
# treated as exposed and rotated.
client_ids = [value.strip() for value in os.environ.get('DOMAIN_CLIENT_IDS', '').split(',') if value.strip()]
secrets = [value.strip() for value in os.environ.get('DOMAIN_CLIENT_SECRETS', '').split(',') if value.strip()]

if len(client_ids) != len(secrets):
    raise ValueError(
        f'DOMAIN_CLIENT_IDS has {len(client_ids)} entries but '
        f'DOMAIN_CLIENT_SECRETS has {len(secrets)}; they must match one to one.'
    )

property_data = []
LENGTH_PER_CALL = 500

# Split ids into parts of at most LENGTH_PER_CALL, one part per credential
# pair, however many ids there are (the last part may be shorter).
batch_starts = range(0, len(property_ids), LENGTH_PER_CALL)

if len(batch_starts) > len(client_ids):
    raise ValueError(
        f'{len(property_ids)} property ids need {len(batch_starts)} credential '
        f'pairs ({LENGTH_PER_CALL} calls each), but only {len(client_ids)} were provided.'
    )

for credential_index, start in enumerate(batch_starts):
    batch_ids = property_ids[start : start + LENGTH_PER_CALL]
    batch_data = get_property_info(batch_ids, client_ids[credential_index], secrets[credential_index])
    # Row range is inclusive on both ends
    print(f'Successfully retirve rows {start} - {start + len(batch_ids) - 1}, with length {len(batch_data)}')
    property_data.extend(batch_data)


Successfully retirve rows 0 - 499, with length 500
Successfully retirve rows 500 - 999, with length 500
Successfully retirve rows 1000 - 1499, with length 500
Successfully retirve rows 1500 - 1999, with length 500
Successfully retirve rows 2000 - 2499, with length 500
Successfully retirve rows 2500 - 2999, with length 500
Successfully retirve rows 3000-3277, with length 277


In [12]:
# Save the raw API responses as indented json for the later cleaning step
with open("../data/raw/internal/rent_info.json", 'w') as json_sink:
    json_sink.write(json.dumps(property_data, indent=4))

# Read the file back and show how many records it holds
with open('../data/raw/internal/rent_info.json') as json_source:
    rent_info = json.load(json_source)
len(rent_info)

3277