# Scraping data

This notebook scrapes rental property data from domain.com.au. Part of the code was written by a University of Melbourne lecturer and tutor.

In [2]:
import re
import pandas as pd
from json import dump
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

In [3]:
# Load the postcode CSV from the raw data directory into a DataFrame.
postcode = pd.read_csv(
    filepath_or_buffer='../data/raw/Australian-Postcode-Data-master/au_postcodes.csv',
)

In [4]:
# Show how many rows the postcode table holds for each state code.
postcode.loc[:, 'state_code'].value_counts()

NSW    5124
QLD    3456
VIC    3260
WA     1903
SA     1783
TAS     831
NT      364
ACT     154
Name: state_code, dtype: int64

In [5]:
# Keep only the rows whose state code is VIC.
code_vic = postcode[postcode['state_code'] == 'VIC']
# Filter on accuracy using a mask built from `code_vic` itself. Building the
# mask from the unfiltered `postcode` frame (as before) selects the same rows
# but forces pandas to reindex the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
code_vic = code_vic[code_vic['accuracy'] == 4]
# One row per postcode, then the plain list of postcodes to scrape.
code_vic = code_vic.drop_duplicates(subset=['postcode'])
codes = code_vic['postcode'].tolist()
len(codes)

  code_vic = code_vic[postcode['accuracy'] == 4]


730

In [None]:
# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1,10) # update this to your liking
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30

# Compiled once instead of on every page. Note that in a regex `/*` means
# "zero or more slashes" (it is not a glob wildcard), so this pattern simply
# selects hrefs that contain BASE_URL.
LISTING_HREF_RE = re.compile(f"{BASE_URL}/*")
# Captures the comma-separated numbers that follow "destination=" in a link.
COORDINATES_RE = re.compile(r'destination=([-\s,\d\.]+)')
# Matches "<digit> <word>" pairs inside a property-feature label.
ROOM_RE = re.compile(r'\d\s[A-Za-z]+')


def _fetch_soup(url):
    """Download `url` and parse it as HTML.

    Returns the parsed BeautifulSoup object, or None when the request fails
    (connection error, timeout, ...). The failure is printed so it is not
    silent, and the caller decides how to skip the page.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"skipping {url}: {error}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def _text_or_none(element):
    """Return the text of a parsed element, or None if it was not found."""
    return None if element is None else element.text


def _text_at(elements, index):
    """Return the text of `elements[index]`, or None if that index is absent."""
    try:
        return elements[index].text
    except IndexError:
        return None


def _listing_urls(code):
    """Collect the property-address links from the rent search pages of `code`."""
    urls = []
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?postcode={code}&page={page}"
        soup = _fetch_soup(url)
        if soup is None:
            # A failed request says nothing about later pages, so try them.
            continue

        # find the unordered list (ul) element which holds the results;
        # a page without it means there are no further results.
        results = soup.find("ul", {"data-testid": "results"})
        if results is None:
            break

        for link in results.find_all("a", href=LISTING_HREF_RE):
            # if its a property address, add it to the list
            # (.get avoids a KeyError for anchors without a class attribute)
            if 'address' in link.get('class', []):
                urls.append(link['href'])
    return urls


def _coordinates(soup):
    """Return the numbers after "destination=" in the external link as floats.

    Returns None when the link, its href, or a parsable number list is missing.
    """
    # presumably this is the map/directions link of the listing - TODO confirm
    link = soup.find("a", {"target": "_blank", 'rel': "noopener noreferer"})
    if link is None:
        return None
    matches = COORDINATES_RE.findall(link.attrs.get('href', ''))
    if not matches:
        return None
    try:
        return [float(coord) for coord in matches[0].split(',')]
    except ValueError:
        return None


def _rooms(soup):
    """Return one list of "<digit> <word>" matches per property feature.

    Returns None when the page has no property-features container.
    """
    features = soup.find("div", {"data-testid": "property-features"})
    if features is None:
        return None
    return [
        ROOM_RE.findall(feature.text)
        for feature in features.find_all(
            "span", {"data-testid": "property-features-text-container"}
        )
    ]


def _description(soup):
    """Return the inner HTML of the first <p>, with <br/> turned into newlines.

    Returns None when the page has no <p> element.
    """
    paragraph = soup.find("p")
    if paragraph is None:
        return None
    html = re.sub(r'<br\/>', '\n', str(paragraph))
    # Remove exactly the wrapping tags. str.strip('</p>') strips a *set of
    # characters*, so it also ate leading/trailing 'p', '/', '<' and '>' that
    # belonged to the description text itself.
    html = re.sub(r'^<p\b[^>]*>', '', html)
    return re.sub(r'</p>$', '', html)


def _property_record(soup, code):
    """Build the metadata dict for one property page.

    Every key is always present; a value is None when the corresponding
    element could not be found on the page.
    """
    record = {
        # looks for the header class to get property name
        'name': _text_or_none(soup.find("h1", {"class": "css-164r41r"})),
        # looks for rent cost
        'cost_text': _text_or_none(
            soup.find("div", {"data-testid": "listing-details__summary-title"})
        ),
        # looks for property type
        'type': _text_or_none(
            soup.find("div", {"data-testid": "listing-summary-property-type"})
        ),
    }

    # looks for nearest school title and distance; the distance is only read
    # when a school title exists, and may itself be missing.
    school_title = soup.find("h5", {"data-testid": "fe-co-school-catchment-school-title"})
    record['school_title'] = _text_or_none(school_title)
    record['school_distance'] = None
    if school_title is not None:
        record['school_distance'] = _text_or_none(
            soup.find("div", {"data-testid": "fe-co-school-catchment-schoolDistance"})
        )

    # looks for suburb info on domain; each selector is queried once and the
    # positions below are looked up defensively so a partial page yields None
    # instead of an IndexError.
    insights = soup.find_all("div", {"data-testid": "suburb-insights__data-point-value"})
    left_values = soup.find_all("span", {"data-testid": "left-value"})
    right_values = soup.find_all("span", {"data-testid": "right-value"})
    suburb_fields = (
        ('suburb_sold', insights, 2),
        ('suburb_avg_day', insights, 3),
        ('suburb_population', insights, 4),
        ('suburb_age', insights, 5),
        # NOTE(review): 'suburb_ower' looks like a typo for "owner"; the key is
        # kept unchanged because it is part of the saved JSON schema.
        ('suburb_ower', left_values, -2),
        ('suburb_renter', right_values, -2),
        ('suburb_family', left_values, -1),
        ('suburb_single', right_values, -1),
    )
    for key, elements, index in suburb_fields:
        # Without any insight data points all suburb fields stay None.
        record[key] = _text_at(elements, index) if insights else None

    record['coordinates'] = _coordinates(soup)
    record['rooms'] = _rooms(soup)
    record['desc'] = _description(soup)
    record['postcode'] = code
    return record


# begin code
property_metadata = defaultdict(dict)

for code in codes:
    print(code)

    # generate list of urls to visit
    url_links = _listing_urls(code)

    # for each url, scrape some basic metadata
    # NOTE(review): the first collected link of every postcode is skipped here,
    # as in the original code; confirm whether that is intentional.
    for property_url in url_links[1:]:
        bs_object = _fetch_soup(property_url)
        if bs_object is None:
            continue
        property_metadata[property_url] = _property_record(bs_object, code)

with open('../data/curated/domain.json', 'w') as f:
    dump(property_metadata, f)