In [17]:
"""
A very simple and basic web scraping script. Feel free to
use this as a source of inspiration, but make sure to attribute
it if you do so.

This is by no means production code.
"""
# built-in imports
import re
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

# constants
# Root of the site being scraped; the search-results URL is built from it and
# listing links are matched against it.
BASE_URL = "https://www.domain.com.au"
# Search-result page numbers to visit. range() excludes its end value, so
# range(1, 2) visits page 1 only.
N_PAGES = range(1, 2) # update this to your liking

# HTTP headers passed to every requests.get call in this notebook. The
# User-Agent is a desktop Chrome string — presumably so the site serves the
# same HTML a browser would get; confirm if requests start failing.
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

In [18]:
# begin code
# url_links collects the property listing URLs found on the search-result pages;
# property_metadata maps each property URL to a dict of scraped fields.
url_links = []
property_metadata = defaultdict(dict)

# Only hrefs that start with the site's base URL are of interest. BASE_URL is
# escaped so its dots match literally, the pattern is anchored at the start of
# the href, and it is compiled once rather than on every page.
listing_href = re.compile(rf"^{re.escape(BASE_URL)}/")

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=dateupdated-desc&page={page}" # a single page

    # A timeout stops the cell hanging forever on a stalled connection, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    bs_object = BeautifulSoup(response.text, "html.parser") # makes bs object

    # find the unordered list (ul) element which holds the results
    results_list = bs_object.find("ul", {"data-testid": "results"})
    if results_list is None:
        # page layout changed or the page has no results: nothing to collect here
        print(f"No results list found on page {page}; skipping")
        continue

    # find all anchor (a) tags whose href points at the base_url website
    for link in results_list.find_all("a", href=listing_href):
        # if its a property address, add it to the list
        # (.get avoids a KeyError on anchors that carry no class attribute)
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

print(url_links[0:5])

['https://www.domain.com.au/2-73-boyd-street-dandenong-north-vic-3175-16068538', 'https://www.domain.com.au/4-1-leila-road-ormond-vic-3204-16068536', 'https://www.domain.com.au/29-normanby-street-moonee-ponds-vic-3039-16068534', 'https://www.domain.com.au/95-darebin-boulevard-reservoir-vic-3073-16068533', 'https://www.domain.com.au/101-439-bay-street-brighton-vic-3186-16068532']


In [21]:
# for each url, scrape some basic metadata

def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag was not found."""
    return tag.text if tag is not None else None


# regex to find the cost in the summary title: the first run of digits, optionally
# continued by ',' or '.' separated digit groups (e.g. "380", "1,200", "450.50").
# this still assumes that the first numeric value is the cost per week.
# compiled once here since it does not change between properties.
cost_finder = re.compile(r'\d+(?:[.,]\d+)*')

for property_url in url_links[0:5]:
    # A timeout stops the cell hanging on a stalled connection, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(property_url, headers=headers, timeout=30)
    response.raise_for_status()
    bs_object = BeautifulSoup(response.text, "html.parser")
    metadata = property_metadata[property_url]

    # looks for the header class to get property name (None if the header is missing)
    metadata['Name'] = _text_or_none(bs_object.find("h1", {"class": "css-164r41r"}))

    # looks for the div containing a summary title for cost
    cost_text = _text_or_none(
        bs_object.find("div", {"data-testid": "listing-details__summary-title"})
    )

    # extracts the cost from the summary title and adds to dictionary.
    # if there is no cost written in the summary title, it is replaced by 0
    cost = cost_finder.search(cost_text) if cost_text is not None else None
    metadata['Cost'] = cost.group() if cost is not None else 0

    # extract coordinates from the hyperlink provided
    # finds latitude and longitude from integrated Google Map
    map_link = bs_object.find("a", {"target": "_blank", 'rel': "noopener noreferer"})
    destination = None
    if map_link is not None:
        destination = re.search(
            r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
            map_link.attrs.get('href', '')
        )
    coordinates = None
    if destination is not None:
        try:
            coordinates = [float(coord) for coord in destination.group(1).split(',')]
        except ValueError:
            # captured text was not a clean list of numbers; leave coordinates as None
            pass
    metadata['Coordinates'] = coordinates

    # extracts # of bedrooms, # of baths and # of parking spots
    features = bs_object.find("div", {"data-testid": "property-features"})
    feature_spans = [] if features is None else features.find_all(
        "span", {"data-testid": "property-features-text-container"}
    )
    for feature in feature_spans:
        # text is split on spaces: first token is the value, second is the key
        attr_desc = feature.text.split(' ')
        if len(attr_desc) >= 2:
            metadata[attr_desc[1]] = attr_desc[0]

    # extracts property type from the site
    type_container = bs_object.find("div", {"data-testid": "listing-summary-property-type"})
    type_tag = None
    if type_container is not None:
        type_tag = type_container.find("span", {"class": "css-in3yi3"})
    metadata['Property_Type'] = _text_or_none(type_tag)

    # extracts desciption from the site -> perhaps can use bag of words to find words that link to more expensive rental units
    # decode_contents() yields the inner HTML of the first <p>, so no wrapper tag has
    # to be trimmed off; str.strip('</p>') would remove any of the characters
    # '<', '/', 'p', '>' from both ends and eat leading/trailing "p" letters of the text.
    desc_tag = bs_object.find("p")
    metadata['Desc'] = (
        re.sub(r'<br\/>', '\n', desc_tag.decode_contents())
        if desc_tag is not None else None
    )

    
    

In [22]:
# show the scraped metadata (a dict of fields per property URL) as the cell output
property_metadata

defaultdict(dict,
            {'https://www.domain.com.au/2-73-boyd-street-dandenong-north-vic-3175-16068538': {'name': '2/73 Boyd Street Dandenong North VIC 3175',
              'cost': '380',
              'coordinates': [-37.9646501, 145.214852],
              'Beds': '2',
              'Bath': '1',
              'Parking': '1',
              'property_type': 'Apartment / Unit / Flat',
              'desc': 'This 2 bedroom unit comprises an open plan living, meals and kitchen area, single lock up garage with remote access and all modern appliances, gas ducted heating and air conditioning. With only 2 units on the block this property offers quiet living and is within walking distance to local shops, schools and public transport services as well as providing easy access to Monash Freeway.',
              'Name': '2/73 Boyd Street Dandenong North VIC 3175',
              'Cost': '380',
              'Coordinates': [-37.9646501, 145.214852],
              'Property_Type': 'Apartment / U

In [5]:
# serialise the scraped property metadata as JSON into data/raw/
output_path = '../data/raw/example.json'
with open(output_path, 'w') as json_file:
    dump(property_metadata, json_file)

