In [1]:
"""
A very simple and basic web scraping script. Feel free to
use this as a source of inspiration, but, make sure to attribute
it if you do so.

This is by no means production code.
"""
# built-in imports
import re
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

In [2]:
# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 6) # update this to your liking

# begin code
url_links = []
property_metadata = defaultdict(dict)

# generate list of urls to visit
for page in N_PAGES:
    headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=price-desc&page={page}"
    bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")

    # find the unordered list (ul) elements which are the results, then
    # find all href (a) tags that are from the base_url website.
    index_links = bs_object \
        .find(
            "ul",
            {"data-testid": "results"}
        ) \
        .findAll(
            "a",
            href=re.compile(f"{BASE_URL}/*") # the `*` denotes wildcard any
        )

    for link in index_links:
        # if its a property address, add it to the list
        if 'address' in link['class']:
            url_links.append(link['href'])


In [3]:
url_links

['https://www.domain.com.au/667-glenhuntly-road-caulfield-vic-3162-11598047',
 'https://www.domain.com.au/upstairs-2c-staley-street-brunswick-vic-3056-13168913',
 'https://www.domain.com.au/level-3-302-13-15-lake-street-caroline-springs-vic-3023-15994395',
 'https://www.domain.com.au/3502-14-16-the-esplanade-st-kilda-vic-3182-16002767',
 'https://www.domain.com.au/9-lansdowne-street-blairgowrie-vic-3942-12127675',
 'https://www.domain.com.au/52-black-st-brighton-vic-3186-15410646',
 'https://www.domain.com.au/4203-35-spring-street-melbourne-vic-3000-15939303',
 'https://www.domain.com.au/335-the-esplanade-indented-head-vic-3223-12688424',
 'https://www.domain.com.au/50-south-wharf-drive-docklands-vic-3008-16048359',
 'https://www.domain.com.au/901-902-85-market-street-south-melbourne-vic-3205-14089455',
 'https://www.domain.com.au/9-keith-court-brighton-vic-3186-16058214',
 'https://www.domain.com.au/7-haverbrack-avenue-malvern-vic-3144-16041473',
 'https://www.domain.com.au/27-moffat-

In [4]:
# from htmldate import find_date
# for each url, scrape some basic metadata
for property_url in url_links[1:]:
    bs_object = BeautifulSoup(requests.get(property_url, headers=headers).text, "html.parser")

    # get the date this property was published
    # property_metadata[property_url]['date'] = find_date(property_url)

    # looks for the header class to get property name
    property_metadata[property_url]['name'] = bs_object \
        .find("h1", {"class": "css-164r41r"}) \
        .text

    # looks for the div containing a summary title for cost
    property_metadata[property_url]['cost_text'] = bs_object \
        .find("div", {"data-testid": "listing-details__summary-title"}) \
        .text

    # extract coordinates from the hyperlink provided
    # i'll let you figure out what this does :P
    property_metadata[property_url]['coordinates'] = [
        float(coord) for coord in re.findall(
            r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
            bs_object \
                .find(
                    "a",
                    {"target": "_blank", 'rel': "noopener noreferer"}
                ) \
                .attrs['href']
        )[0].split(',')
    ]

    #property_metadata[property_url]['rooms'] = [
    #    re.findall(r'\d\s[A-Za-z]+', feature.text)[0] for feature in bs_object \
    #        .find("div", {"data-testid": "property-features"}) \
    #        .findAll("span", {"data-testid": "property-features-text-container"})
    #]

    property_metadata[property_url]['desc'] = re \
        .sub(r'<br\/>', '\n', str(bs_object.find("p"))) \
        .strip('</p>')

# output to example json in data/raw/
with open('../data/raw/example.json', 'w') as f:
    dump(property_metadata, f)

ValueError: ("URL couldn't be processed: %s", 'https://www.domain.com.au/upstairs-2c-staley-street-brunswick-vic-3056-13168913')

### Convert json to csv style

In [None]:
import json
import csv
example_json = open('../data/raw/example.json')
data = json.load(example_json)
#for item in data:
#    print(item)
# print(data)
headers = ['name', 'cost_text', 'coordinates', 'rooms', 'desc']
output_csv = open('../data/raw/property.csv', 'w')
# create the csv writer object
csv_writer = csv.writer(output_csv)
# first write the headers
csv_writer.writerow(headers)

for link in data:
    csv_writer.writerow(data[link].values())

output_csv.close()
example_json.close()

{'https://www.domain.com.au/upstairs-2c-staley-street-brunswick-vic-3056-13168913': {'name': 'Upstairs 2C Staley Street Brunswick VIC 3056', 'cost_text': '$35,000 Annually', 'coordinates': [-37.7655919, 144.9633048], 'desc': '1st floor offices/studios\nEasy walking distance to Sydney Road and public transport\nIdeal offices, dance studio, fitness studio, art studio\nIncludes tearoom, toilets, 2 undercover car-spaces\nApprox 337sqm'}, 'https://www.domain.com.au/level-3-302-13-15-lake-street-caroline-springs-vic-3023-15994395': {'name': 'Level 3, 302/13-15 Lake Street Caroline Springs VIC 3023', 'cost_text': '4125000 pw', 'coordinates': [-37.7316459, 144.7446886], 'desc': '- Centrally located with tranquil views.\n- Lettable area from 110m2 approximately \n- partitioned throughout\n- Ample staff and client parking\n- Excellent main road exposure\n- Fully air-conditioned throughout (heating and cooling)\n- Major growth corridor location with enormous catchment\n- Rare opportunity to lease