In [1]:
"""
A very simple and basic web scraping script. Feel free to
use this as a source of inspiration, but, make sure to attribute
it if you do so.

This is by no means production code.
"""
# built-in imports
import re
import pandas as pd
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2) # update this to your liking

headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}



<h3>Web Scraping</h3>

In [24]:
postcodes = pd.read_csv("../data/curated/unique_postcodes.csv", header=None).squeeze()
postcodes

0      3000
1      3001
2      3002
3      3003
4      3004
       ... 
711    3990
712    3991
713    3992
714    3995
715    3996
Name: 0, Length: 716, dtype: int64

**Progress tracker**

Imports & functions for progress tracking, can be removed for final submission

In [3]:
from IPython.display import clear_output 
import timeit
import numpy as np

In [4]:
def progress_tracker(start, stop, curr_progress):
    if (curr_progress*100) < 5:
        expected_time = "Calculating..."
    else:
        time_perc = timeit.default_timer()
        expected_time = np.round(((time_perc-start) / curr_progress)/60, 2)

    print("Current progress:", np.round(curr_progress*100, 2), "%")
    print("current run time:", np.round((stop-start)/60, 2), "minutes")
    print("Expected run time:", expected_time, "minutes")

**Selenium (optional running)**

In [55]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

driver_location = "/usr/bin/chromedriver"
binary_location = "/usr/bin/google-chrome"

options = Options()
options.binary_location = binary_location

options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--window-size=1920,1080")

s=Service(driver_location)

driver = webdriver.Chrome(service=s, options=options)


**URL Scraping**

In [25]:
# begin code
url_dict = defaultdict(dict)
property_metadata = defaultdict(dict)

start = timeit.default_timer() # for progress tracking

# index & enumerate are for progress tracking, can be removed for final submission
for index, postcode in enumerate(postcodes):
    clear_output(wait=True) # for progress tracking

    url_links = []
    
    # generate list of urls to visit
    for page in N_PAGES:
        # need to decide regions to analyse         
        url = BASE_URL + f"/rent/?postcode={postcode}&page={page}" # a single page
        bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser") # makes bs object

        # find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.
        index_links = bs_object.find("ul", {"data-testid": "results"}) 

        # if there are no links, break
        if (index_links == None):
            break 

        index_links = index_links.findAll("a",href=re.compile(f"{BASE_URL}/*")) # complies RE string into RE expression
            
        for link in index_links:
            # if its a property address, add it to the list
            if 'address' in link['class']:
                url_links.append(link['href'])

    url_dict[postcode] = url_links

    # for progress tracking
    curr_progress = index/len(postcodes)
    stop = timeit.default_timer()
    progress_tracker(start, stop, curr_progress)



Current progress: 99.86 %
current run time: 5.49 minutes
Expected run time: 5.5 minutes


**Feature Scraping**

In [27]:
start = timeit.default_timer() # for progress tracking

# for each url, scrape features
# index & enumerate are for progress tracking, can be removed for final submission
for index, postcode in enumerate(url_dict):
    clear_output(wait=True) # for progress tracking

    url_links = url_dict.get(postcode)

    for property_url in url_links:
        bs_object = BeautifulSoup(requests.get(property_url, headers=headers).text, "html.parser")

        # looks for the header class to get property name
        property_metadata[property_url]['Name'] = bs_object \
            .find("h1", {"class": "css-164r41r"}) \
            .text

        # regex to find the cost in the summary title 
        cost_finder = re.compile(r'[0-9]+.?[0-9]+') # this regex search assumes that the first numeric value is the cost per week 
        # looks for the div containing a summary title for cost
        cost_text = bs_object \
            .find("div", {"data-testid": "listing-details__summary-title"}) \
            .text

        # extracts the cost from the summary title and adds to dictionary. 
        # if there is no cost written in the summary title, it is replaced by 0 
        cost = cost_finder.search(cost_text)
        if cost == None: 
            property_metadata[property_url]['Cost'] = 0
        else:
            property_metadata[property_url]['Cost'] = cost.group()
            
        # extract coordinates from the hyperlink provided
        # finds latitude and longitude from integrated Google Map
        property_metadata[property_url]['Coordinates'] = [
            float(coord) for coord in re.findall(
                r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
                bs_object \
                    .find(
                        "a",
                        {"target": "_blank", 'rel': "noopener noreferer"}
                    ) \
                    .attrs['href']
            )[0].split(',')
        ]
        
        # extracts # of bedrooms, # of baths and # of parking spots 
        rooms_info = bs_object.find("div", {"data-testid": "property-features"}).findAll("span", {"data-testid": "property-features-text-container"})
        for i in range(0, len(rooms_info)):
            attr_desc = str(rooms_info[i].text).split(' ')

            property_metadata[property_url][attr_desc[1]] = attr_desc[0]

        # extracts property type from the site 
        property_metadata[property_url]['Property_Type'] = bs_object \
            .find("div", {"data-testid": "listing-summary-property-type"}).find("span", {"class" : "css-in3yi3"}).text

        # extracts desciption from the site 
        # will significantly increase run-time
        '''
        driver.get(property_url)
        read_more_button = driver.find_element(by=By.CSS_SELECTOR , value='[data-testid="listing-details__description-button"]')
        read_more_button.click()
        property_metadata[property_url]['Desc'] = driver.find_element(by=By.CSS_SELECTOR, value='[data-testid="listing-details__description"]').text
        '''

        # extract real estate agency
        property_metadata[property_url]['Agency'] = bs_object.find("a", {"data-testid" : "listing-details__agent-details-agent-company-name"}).text

        # add postcode
        property_metadata[property_url]['Postcode'] = postcode

    # for progress tracking
    curr_progress = index/len(url_dict)
    stop = timeit.default_timer()
    progress_tracker(start, stop, curr_progress)
    
        

KeyboardInterrupt: 

**Data output**

In [50]:
data = []
property_df = pd.DataFrame(columns=['Name', 'Cost', 'Coordinates', 'Bed', 'Bath', 'Parking', 'Agency', 'Postcode'])

for index, postcode in enumerate(postcodes):
    for url in url_dict.get(postcode):
        info = property_metadata.get(url)

        if info == None:
            break
        else:
            data.append(list(info.values()))
        


property_df = pd.DataFrame(data)
property_df.to_csv('../data/raw/property_data.csv')
       
        