In [1]:
"""
A very simple and basic web scraping script. Feel free to
use this as a source of inspiration, but make sure to attribute
it if you do so.

This is by no means production code.
"""
# built-in imports
import re
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2)  # result pages to crawl; update this to your liking

# User-Agent header passed along with the page requests
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.141 Safari/537.36"
    )
}

**Selenium**

In [55]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# filesystem locations of the chromedriver executable and the Chrome binary
driver_location = "/usr/bin/chromedriver"
binary_location = "/usr/bin/google-chrome"

# build the Chrome options: point at the binary, then add each command-line
# flag in the same order as before
options = Options()
options.binary_location = binary_location
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    '--disable-dev-shm-usage',
    "--window-size=1920,1080",
):
    options.add_argument(chrome_flag)

# start the browser through a Service wrapping the chromedriver path
s = Service(driver_location)
driver = webdriver.Chrome(service=s, options=options)


In [82]:
# begin code
url_links = []
property_metadata = defaultdict(dict)

# Matches only hrefs that START with BASE_URL followed by a slash. BeautifulSoup
# applies compiled patterns with search(), so the "^" anchor is required, and
# re.escape keeps the dots in the domain literal instead of "any character".
listing_href = re.compile(rf"^{re.escape(BASE_URL)}/")

# generate list of urls to visit
for page in N_PAGES:
    # need to decide regions to analyse
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=dateupdated-desc&page={page}" # a single page

    # a timeout stops one stalled request from hanging the whole notebook
    response = requests.get(url, headers=headers, timeout=30)
    if not response.ok:
        print(f"Skipping page {page}: HTTP {response.status_code}")
        continue
    bs_object = BeautifulSoup(response.text, "html.parser") # makes bs object

    # find the unordered list (ul) element which holds the results; it can be
    # missing (e.g. an empty or blocked page), in which case there is nothing
    # to collect from this page
    results_list = bs_object.find("ul", {"data-testid": "results"})
    if results_list is None:
        print(f"Skipping page {page}: no results list found")
        continue

    # find all anchor (a) tags whose href points at the base_url website
    index_links = results_list.find_all("a", href=listing_href)

    for link in index_links:
        # if its a property address, add it to the list
        # (.get avoids a KeyError on anchors that carry no class attribute)
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

print(url_links[0:5])

['https://www.domain.com.au/7-timmins-court-mill-park-vic-3082-16076192', 'https://www.domain.com.au/36-duosa-road-altona-north-vic-3025-15410999', 'https://www.domain.com.au/5-1-st-georges-road-toorak-vic-3142-16076186', 'https://www.domain.com.au/2-602-highbury-road-glen-waverley-vic-3150-16076184', 'https://www.domain.com.au/1-22-maivary-lane-northcote-vic-3070-16076163']


In [83]:
test_url = ['https://www.domain.com.au/12-clendon-court-toorak-vic-3142-14238687']


def _text_or_none(tag):
    """Return ``tag.text``, or None when the BeautifulSoup lookup found no element."""
    return None if tag is None else tag.text


# Both patterns are compiled once, outside the loop, because they do not
# depend on the page being scraped.
# The cost pattern assumes that the first numeric value in the summary title is
# the cost per week. NOTE: the "." is unescaped, so any single character may
# sit between the two digit runs; the pattern is kept unchanged so that the
# extracted values stay the same.
cost_finder = re.compile(r'[0-9]+.?[0-9]+')
# latitude/longitude pair embedded in the map link (use regex101.com here if you need to)
coordinate_finder = re.compile(r'destination=([-\s,\d\.]+)')

# for each url, scrape some basic metadata
# Any element that cannot be found is stored as None (Cost falls back to 0)
# so that one unusual listing does not abort the whole run.
for property_url in url_links[0:5]:
    # a timeout stops one stalled request from hanging the whole notebook
    response = requests.get(property_url, headers=headers, timeout=30)
    if not response.ok:
        print(f"Skipping {property_url}: HTTP {response.status_code}")
        continue
    bs_object = BeautifulSoup(response.text, "html.parser")
    listing = property_metadata[property_url]

    # looks for the header class to get property name
    listing['Name'] = _text_or_none(bs_object.find("h1", {"class": "css-164r41r"}))

    # looks for the div containing a summary title for cost
    cost_text = _text_or_none(
        bs_object.find("div", {"data-testid": "listing-details__summary-title"})
    )

    # extracts the cost from the summary title and adds to dictionary.
    # if there is no cost written in the summary title, it is replaced by 0
    cost = cost_finder.search(cost_text) if cost_text is not None else None
    if cost is None:
        listing['Cost'] = 0
    else:
        listing['Cost'] = cost.group()

    # extract coordinates from the hyperlink provided
    # finds latitude and longitude from integrated Google Map
    map_link = bs_object.find("a", {"target": "_blank", 'rel': "noopener noreferer"})
    destination = None
    if map_link is not None:
        destination = coordinate_finder.search(map_link.attrs.get('href', ''))
    if destination is None:
        listing['Coordinates'] = None
    else:
        try:
            listing['Coordinates'] = [
                float(coord) for coord in destination.group(1).split(',')
            ]
        except ValueError:
            # the captured text was not a clean "lat,lng" pair
            listing['Coordinates'] = None

    # extracts # of bedrooms, # of baths and # of parking spots
    features = bs_object.find("div", {"data-testid": "property-features"})
    if features is None:
        rooms_info = []
    else:
        rooms_info = features.find_all("span", {"data-testid": "property-features-text-container"})
    for feature in rooms_info:
        # text is split on spaces into value and label, e.g. "4 Beds" -> ['4', 'Beds']
        attr_desc = str(feature.text).split(' ')
        if len(attr_desc) < 2:
            # no label to use as a key; skip instead of raising IndexError
            continue
        # NOTE: labels are stored exactly as they appear on the page, so
        # singular and plural forms (e.g. 'Bath' / 'Baths') end up as different keys.
        listing[attr_desc[1]] = attr_desc[0]

    # extracts property type from the site
    type_container = bs_object.find("div", {"data-testid": "listing-summary-property-type"})
    if type_container is None:
        listing['Property_Type'] = None
    else:
        listing['Property_Type'] = _text_or_none(
            type_container.find("span", {"class" : "css-in3yi3"})
        )

    # extracts description from the site (disabled: will significantly
    # increase run-time because it drives the Selenium browser)
    # driver.get(property_url)
    # read_more_button = driver.find_element(by=By.CSS_SELECTOR , value='[data-testid="listing-details__description-button"]')
    # read_more_button.click()
    # property_metadata[property_url]['Desc'] = driver.find_element(by=By.CSS_SELECTOR, value='[data-testid="listing-details__description"]').text

    # extract real estate agency
    listing['Agency'] = _text_or_none(
        bs_object.find("a", {"data-testid" : "listing-details__agent-details-agent-company-name"})
    )
     

In [84]:
# display the scraped metadata dictionary as the cell output
property_metadata

defaultdict(dict,
            {'https://www.domain.com.au/7-timmins-court-mill-park-vic-3082-16076192': {'Name': '7 Timmins Court Mill Park VIC 3082',
              'Cost': '600',
              'Coordinates': [-37.6584651, 145.0470335],
              'Beds': '4',
              'Baths': '2',
              'Parking': '4',
              'Property_Type': 'House',
              'Agency': 'Ray White Mill Park'},
             'https://www.domain.com.au/36-duosa-road-altona-north-vic-3025-15410999': {'Name': '36 Duosa Road Altona North VIC 3025',
              'Cost': '490',
              'Coordinates': [-37.8264754, 144.844517],
              'Beds': '3',
              'Bath': '1',
              'Parking': '2',
              'Property_Type': 'House',
              'Agency': 'Williams Real Estate'},
             'https://www.domain.com.au/5-1-st-georges-road-toorak-vic-3142-16076186': {'Name': '5/1 St Georges Road Toorak VIC 3142',
              'Cost': '280.00',
              'Coordinates': [

In [28]:
# write the collected metadata as JSON to data/raw/example.json
with open('../data/raw/example.json', mode='w') as json_file:
    dump(property_metadata, json_file)

