# Import libraries

In [1]:
# standard-library imports (plus pandas, which is third-party)
import re
import time
import pandas as pd
from json import dump
from collections import defaultdict

# third-party packages
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Get the postcodes of Victoria (VIC)

In [19]:
# load the national postcode table and keep only the Victorian rows'
# postcode column, re-indexed from zero
df_postcode = pd.read_csv("../data/raw/australian_postcodes.csv")
vic_postcode = df_postcode.loc[df_postcode["state"] == "VIC", "postcode"].reset_index(drop=True)

# one entry per distinct postcode, again with a fresh zero-based index
unique_postcode = vic_postcode.drop_duplicates().reset_index(drop=True)

# Scrape rental listing data from the Domain website

In [47]:
# constants
BASE_URL = "https://www.domain.com.au"
NUM_PROPERTY_PER_PAGE = 20

# begin code
url_links = []
property_metadata = defaultdict(dict)

# the request headers never change, so build them once before the loop;
# they are reused by every request below
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# pattern for hrefs that point at the base website; the url is escaped so
# that `.` only matches a literal dot
listing_href = re.compile(re.escape(BASE_URL))

# generate list of urls to visit according to the postcode
for postcode in unique_postcode:
    url = BASE_URL + f"/rent/?postcode={postcode}"
    bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    print(postcode)

    # get the number of rent property for each postcode; `find` returns
    # None when an element is absent, so skip the postcode instead of
    # crashing the whole run on an unexpected page
    summary = bs_object.find("h1", {"data-testid": "summary"})
    count_tag = summary.find("strong") if summary is not None else None
    if count_tag is None:
        continue

    # remove thousands separators first so a count written as "1,234" is
    # read as 1234 rather than 234
    count_match = re.search(r"\d+", count_tag.text.replace(",", ""))
    if count_match is None:
        continue
    property_num = int(count_match.group())

    # calculate the total number of pages for each postcode (ceiling
    # division); zero properties gives zero pages, so empty postcodes are
    # not visited
    num_page = (property_num + NUM_PROPERTY_PER_PAGE - 1) // NUM_PROPERTY_PER_PAGE

    for page in range(1, num_page + 1):
        url_page = url + f"&page={page}"
        bs_object_sub = BeautifulSoup(requests.get(url_page, headers=headers).text, "html.parser")

        # find the unordered list (ul) element which holds the results
        results = bs_object_sub.find("ul", {"data-testid": "results"})
        if results is None:
            continue

        # find all href (a) tags that are from the base_url website
        index_links = results.findAll("a", href=listing_href)

        # if its a property address, add it to the list; `.get` covers
        # anchors that carry no class attribute at all
        url_links.extend(
            link["href"]
            for link in index_links
            if "address" in link.get("class", [])
        )



postcode: 3000


In [103]:
# for each url, scrape some basic metadata
# NOTE(review): the [1:] slice skips the first collected url - confirm that
# dropping it is intended
for property_url in url_links[1:]:
    bs_object = BeautifulSoup(requests.get(property_url, headers=headers).text, "html.parser")

    # locate every element a record depends on before reading any `.text`;
    # `find` returns None for a missing element
    # header class holding the property name
    name_tag = bs_object.find("h1", {"class": "css-164r41r"})
    # div containing a summary title for cost
    cost_tag = bs_object.find("div", {"data-testid": "listing-details__summary-title"})
    # div containing the number of bed, bathroom and parking area
    features_tag = bs_object.find("div", {"data-testid": "property-features-wrapper"})
    # div containing the type of property
    type_tag = bs_object.find("div", {"data-testid": "listing-summary-property-type"})
    # hyperlink whose href carries the coordinates
    map_link = bs_object.find("a", {"target": "_blank", "rel": "noopener noreferer"})

    # skip listings whose page lacks any of these elements, so that every
    # stored record is complete and no half-filled entry is left behind
    required_tags = (name_tag, cost_tag, features_tag, type_tag, map_link)
    if any(tag is None for tag in required_tags):
        continue

    # extract coordinates from the hyperlink provided
    destination = re.search(r"destination=([-\s,\d\.]+)", map_link.attrs.get("href", ""))
    if destination is None:
        continue
    try:
        coordinates = [float(coord) for coord in destination.group(1).split(",")]
    except ValueError:
        # the captured text was not a clean comma-separated list of numbers
        continue

    # looks for the div containing the list of property features
    feature_list = bs_object.find("div", {"id": "property-features"})

    # looks for the div containing the description of property
    property_desc = bs_object.find("div", {"data-testid": "listing-details__description"})

    # join whichever of the two texts exist; with neither present the text
    # is empty and every facility below falls back to "No"
    information = " ".join(
        tag.text for tag in (feature_list, property_desc) if tag is not None
    ).lower()

    # "unfurnished" contains "furnished", so it has to be ruled out first
    has_furniture = (
        "unfurnished" not in information
        and ("furnished" in information or "furnitured" in information)
    )

    # get the description of properties: inner html of the first paragraph
    # with line-break tags turned into newlines. The inner html is taken
    # directly because stripping the characters of "</p >" from the ends
    # also removes real leading/trailing "p" letters of the text.
    first_paragraph = bs_object.find("p")
    if first_paragraph is None:
        description = ""
    else:
        description = re.sub(r"<br\/>", "\n", first_paragraph.decode_contents()).strip()

    property_metadata[property_url] = {
        "name": name_tag.text,
        "cost_text": cost_tag.text,
        "features": features_tag.text,
        "type": type_tag.text,
        "furnitured": "Yes" if has_furniture else "No",
        "pool": "Yes" if "pool" in information else "No",
        "gym": "Yes" if "gym" in information else "No",
        "coordinates": coordinates,
        "desc": description,
    }

# output to property json in data/raw/
with open("../data/raw/property.json", "w") as f:
    dump(property_metadata, f)

# Scrape suburb data based on the postcode

In [None]:
# constant
POSTCODE_BASE_URL = "https://auspost.com.au/postcode/"

# initialise the postcode dictionary
postcode_metadata = defaultdict(dict)

# pattern capturing the text between a ">" and a following ", <word>" in the
# rendered table cells; written as a raw string so the backslashes reach
# the regex engine untouched, and compiled once outside the loop
suburb_pattern = re.compile(r">([\w\s]*),\s\w+")

for postcode in unique_postcode:

    # only find the suburb for the postcode less than 4000; `continue`
    # (not `break`) so the result does not depend on the postcodes being
    # sorted in ascending order
    if postcode >= 4000:
        continue

    # get the url for each postcode
    url = f"{POSTCODE_BASE_URL}{postcode}"

    # give a 2 second delay
    time.sleep(2)

    # scrape the suburb data based on postcode
    bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    info = bs_object.findAll("td", {"class": "second"})

    # ignore the postcode with no suburbs
    if not info:
        continue

    # find and store the suburbs
    postcode_metadata[postcode] = suburb_pattern.findall(str(info))

# output to postcode_match_suburb json in data/raw/
with open('../data/raw/postcode_match_suburb.json', 'w') as f:
    dump(postcode_metadata, f)