# Scraping from domain.com.au
Anthony He 1133985

This part is adapted from the sample code provided.

In [1]:
# import packages (sorted alphabetically)
!pip install tqdm # please skip this line if it is already installed
from bs4 import BeautifulSoup
from collections import defaultdict
from random import random
from time import sleep
from tqdm import tqdm
import csv
import json
import os
import pandas as pd
import re
import requests



In [2]:
# all files will be stored in the property_meta folder
property_files = '../data/raw/property_meta'
# exist_ok=True creates the folder (and any missing parent folders) only when
# it is not there yet, without a separate existence check that another process
# could invalidate between the check and the creation
os.makedirs(property_files, exist_ok=True)

In [3]:
# read the urls recorded by earlier runs
# if a url has been scraped, then it will not be scraped again
# (the list stays empty when the url file does not exist yet)
scraped_url = []
if os.path.exists(f'{property_files}/property_url.csv'):
    with open(f'{property_files}/property_url.csv', newline='') as inputfile:
        # csv.reader yields an empty list for a blank line; skip those rows
        # instead of failing with IndexError on row[0]
        scraped_url = [row[0] for row in csv.reader(inputfile) if row]

In [4]:
# request headers passed to every requests.get call (a browser-like User-Agent)
# NOTE(review): "AppelWebKit" looks like a misspelling of "AppleWebKit"; the
# string is kept exactly as it was because it is what gets sent to the site
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) "
        "AppelWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.141 Safari/537.36"
    ),
}

# constants
BASE_URL = "https://www.domain.com.au"
# values substituted into the `sort` query parameter of the search url
sort_methods = [
    "default-desc",
    "dateupdated-desc",
    "price-asc",
    "price-desc",
    "suburb-asc",
]
# result pages 1 to 50 inclusive; update this to your liking
N_PAGES = range(1, 51)

In [5]:
# initialise variables
# property_metadata maps a key to a dict of fields; a missing key starts out
# as an empty dict, so fields can be assigned without creating the entry first
property_metadata = defaultdict(dict)
# url_links starts as an empty list
url_links = list()

In [8]:
# gather all links that should be scraped
# compile the href filter once, outside the loops; re.escape keeps the dots in
# BASE_URL literal (in f"{BASE_URL}/*" every "." matched any character and the
# trailing "/*" only meant "zero or more slashes")
listing_href = re.compile(re.escape(BASE_URL))
for sort_method in tqdm(sort_methods):
    for page in N_PAGES:
        url = BASE_URL + f"/rent/melbourne-region-vic/?sort={sort_method}&page={page}"
        try:
            # a timeout stops one stalled request from hanging the whole run
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as err:
            # keep the links gathered so far and move on to the next page
            print(f"could not fetch {url}: {err!r}")
            continue
        bs_object = BeautifulSoup(response.text, "html.parser")
        # find the unordered list (ul) element which holds the results; it is
        # None when the page has no result list, and that page is then skipped
        results = bs_object.find("ul", {"data-testid": "results"})
        if results is not None:
            # find all href (a) tags that are from the base_url website
            for link in results.find_all("a", href=listing_href):
                # if its a property address, add it to the list
                # (.get avoids a KeyError on anchors without a class attribute)
                if 'address' in link.get('class', []):
                    url_links.append(link['href'])
        # short random pause between page requests
        sleep(round(random(), 2))

100%|██████████| 5/5 [02:48<00:00, 33.60s/it]


In [9]:
# check the number of urls gathered so far
# (as the last expression of the cell, the value is shown as the cell output)
len(url_links)

5323

In [10]:
# only retain the urls that were not scraped in an earlier run
# a set gives constant-time membership tests; testing against the scraped_url
# list directly rescans the whole list for every gathered url
already_scraped = set(scraped_url)
url_links = [link for link in url_links if link not in already_scraped]
# check the number of urls left for this run
# (repeated urls, if any, are still counted here)
len(url_links)

870

In [11]:
# drop repeated urls
# dict.fromkeys keeps the first occurrence of each url in its original
# position, so the order is the same on every run; list(set(...)) returned
# the urls in an arbitrary order
url_links = list(dict.fromkeys(url_links))
num_url = len(url_links)

In [12]:
# urls are stored in a csv file
# as property information are updated real time and scraped over a week
# future scraping should be compared to this to avoid duplication
# NOTE(review): the file is rewritten before the metadata of the new urls is
# scraped, so a url whose scrape later fails is still recorded here - confirm
# that this is intended
# newline='' is what the csv module asks for on files it writes; without it
# an extra blank line follows every row on platforms that use \r\n line endings
with open(f'{property_files}/property_url.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # one url per row: the urls of this run first, then those of earlier runs
    csvwriter.writerows([x] for x in url_links + scraped_url)

In [13]:
# for each url, scrape some basic metadata
# this segment of code may need to be modified if domain.com.au makes any changes
# this code is valid at the time of 12 September
failed_urls = []  # urls whose page could not be fetched or parsed in this run
for property_url in tqdm(url_links):
    try:
        # a timeout stops one stalled request from hanging the whole run
        response = requests.get(property_url, headers=headers, timeout=30)
        bs_object = BeautifulSoup(response.text, "html.parser")
        # collect the fields locally first, so that a page which fails half-way
        # never leaves a partial entry in property_metadata
        record = {}
        # looks for the header class to get property name
        record['name'] = bs_object.find("h1", {"class": "css-164r41r"}).text
        record['type'] = bs_object.find("div", {"data-testid": "listing-summary-property-type"}).text
        # looks for the div containing a summary title for cost
        record['cost_text'] = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text
        # extract coordinates from the hyperlink provided
        directions_href = bs_object.find(
            "a",
            {"target": "_blank", 'rel': "noopener noreferer"}
        ).attrs['href']
        record['coordinates'] = [
            float(coord) for coord in re.findall(
                r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
                directions_href
            )[0].split(',')
        ]
        # \d+ keeps multi-digit counts whole; a single \d turned "10 Beds"
        # into "0 Beds"
        record['rooms'] = [
            re.findall(r'\d+\s[A-Za-z]+', feature.text) for feature in bs_object
                .find("div", {"data-testid": "property-features"})
                .find_all("span", {"data-testid": "property-features-text-container"})
        ]
        # description: markup of the first <p> without its own opening and
        # closing tag, with <br/> turned into newlines; stays empty when the
        # page has no <p> at all
        # (str.strip('</p>') removed any of the characters < / p > from both
        # ends, which also ate text starting or ending with the letter "p")
        desc_tag = bs_object.find("p")
        if desc_tag is None:
            record['desc'] = ''
        else:
            inner_html = re.sub(r'^<p\b[^>]*>|</p>$', '', str(desc_tag))
            record['desc'] = re.sub(r'<br\/>', '\n', inner_html)
    except (AttributeError, IndexError, KeyError, ValueError, requests.RequestException) as err:
        # an expected element is missing, the coordinates cannot be parsed, or
        # the request failed: skip this listing instead of aborting the run and
        # losing everything scraped so far
        failed_urls.append(property_url)
        print(f"skipped {property_url}: {err!r}")
    else:
        property_metadata[property_url].update(record)
    finally:
        # random pause of up to 3 seconds between listing requests
        sleep(round(3*random(),2))

100%|██████████| 539/539 [04:29<00:00,  2.00it/s]


Save the results of scraping

In [14]:
# merge json file with the data scraped this time
# every field saved by earlier runs is copied into property_metadata; when a
# url is in both, the saved value replaces the one held in memory
metadata_json = f'{property_files}/property_metadata.json'
if os.path.exists(metadata_json):
    with open(metadata_json, 'r') as f:
        data = json.load(f)
    for key, fields in data.items():
        for subkey in fields:
            property_metadata[key][subkey] = fields[subkey]

In [15]:
# write the new json file (replaces the previous file with the full content
# of property_metadata)
metadata_json_out = f'{property_files}/property_metadata.json'
with open(metadata_json_out, 'w') as out_file:
    out_file.write(json.dumps(property_metadata))

In [16]:
# merge csv file with the data scraped this time
# step 1: load the csv saved earlier, if there is one (df is only defined
# when the file exists)
saved_csv_path = f'{property_files}/property_metadata.csv'
if os.path.exists(saved_csv_path):
    df = pd.read_csv(saved_csv_path)

In [17]:
# turn the metadata held in memory into a Pandas dataframe so it can be saved
# as a csv file: the outer keys (urls) become columns, the transpose turns
# them into rows, and the former index is exposed as a regular 'url' column
df2 = (
    pd.DataFrame(property_metadata)
    .T
    .reset_index()
    .rename(columns={'index': 'url'})
)

In [18]:
# combine the rows of the saved csv (df) with the rows built from memory (df2)
# property_metadata has already been merged with the saved json file, so df2
# repeats urls that are in the saved csv as well; plain concatenation wrote
# those rows again on every run. Keep one row per url, preferring the df2 row
# (keep='last', since df2 comes second).
# assumes the saved csv has the 'url' column written by this notebook
if os.path.exists(f'{property_files}/property_metadata.csv'):
    save_csv = (
        pd.concat([df, df2])
        .drop_duplicates(subset='url', keep='last')
    )
else:
    # nothing saved yet: the rows built from memory are the whole table
    save_csv = df2

In [19]:
# write the combined table to the csv file, replacing the previous file;
# index = False leaves the dataframe's row index out of the output
save_csv.to_csv(f'{property_files}/property_metadata.csv', index = False)