In [132]:
# This notebook scrapes rental property listings and their internal attributes from domain.com.au
# Note: to reproduce this code, replace the placeholder string 'YOUR_PATH_TO_CHROMEDRIVER' with the path to chromedriver on your device

# Import Libraries 
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import cchardet as chardet

In [133]:
# Start a Chrome session through the chromedriver binary at the given path.
# The placeholder string must be replaced with the real location of chromedriver.
service = Service(executable_path="YOUR_PATH_TO_CHROMEDRIVER")
driver = webdriver.Chrome(service=service)

# Scraper configuration: site root and the result pages to visit (pages 1 to 50)
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51)

# Parallel accumulators for the scraped listings; entries at the same position
# in each list belong to the same listing.
property_links, property_type, addresses, suburbs = [], [], [], []

In [135]:
# Collect property listing data for Houses in VIC

for page in N_PAGES:

    # Load one page of search results and parse its HTML
    url = BASE_URL + f"/rent/?ptype=house&sort=dateupdated-desc&state=vic&page={page}"
    driver.get(url)
    content = driver.page_source
    soup = BeautifulSoup(content, features='html.parser')

    for element in soup.find_all('li', attrs={'class': 'css-1qp9106'}):

        # Look up every field first: link to the listing, suburb line
        # (suburb and postcode only) and the full address
        link = element.find('a', href=True)
        suburb = element.find('span', attrs={"data-testid": "address-line2", 'class': 'css-iqrvhs'})
        full_address = element.find('a', attrs={"class": "address is-two-lines css-1y2bib4"})

        # The four lists are merged by position later on, so a card missing any
        # field is skipped as a whole instead of being appended partially
        # (which would leave the lists with different lengths).
        if link is None or suburb is None or full_address is None:
            continue

        property_links.append(link['href'])
        property_type.append('House')
        suburbs.append(suburb.text)
        addresses.append(full_address.text)


In [136]:
# Collect property listing data for Apartments in VIC

for page in N_PAGES:

    # Load one page of search results and parse its HTML
    url = BASE_URL + f"/rent/?ptype=apartment&sort=dateupdated-desc&state=vic&page={page}"
    driver.get(url)
    content = driver.page_source
    soup = BeautifulSoup(content, features='html.parser')

    for element in soup.find_all('li', attrs={'class': 'css-1qp9106'}):

        # Look up every field first: link to the listing, suburb line
        # (suburb and postcode only) and the full address
        link = element.find('a', href=True)
        suburb = element.find('span', attrs={"data-testid": "address-line2", 'class': 'css-iqrvhs'})
        full_address = element.find('a', attrs={"class": "address is-two-lines css-1y2bib4"})

        # The four lists are merged by position later on, so a card missing any
        # field is skipped as a whole instead of being appended partially
        # (which would leave the lists with different lengths).
        if link is None or suburb is None or full_address is None:
            continue

        property_links.append(link['href'])
        property_type.append('Apartment')
        suburbs.append(suburb.text)
        addresses.append(full_address.text)

In [137]:
# Collect property listings data for Townhouses in VIC

for page in N_PAGES:

    # Load one page of search results and parse its HTML
    url = BASE_URL + f"/rent/?ptype=town-house&sort=dateupdated-desc&state=vic&page={page}"
    driver.get(url)
    content = driver.page_source
    soup = BeautifulSoup(content, features='html.parser')

    for element in soup.find_all('li', attrs={'class': 'css-1qp9106'}):

        # Look up every field first: link to the listing, suburb line
        # (suburb and postcode only) and the full address
        link = element.find('a', href=True)
        suburb = element.find('span', attrs={"data-testid": "address-line2", 'class': 'css-iqrvhs'})
        full_address = element.find('a', attrs={"class": "address is-two-lines css-1y2bib4"})

        # The four lists are merged by position later on, so a card missing any
        # field is skipped as a whole instead of being appended partially
        # (which would leave the lists with different lengths).
        if link is None or suburb is None or full_address is None:
            continue

        property_links.append(link['href'])
        # These rows come from the town-house search, so label them as such
        # (they were previously mislabelled 'Apartment').
        property_type.append('Townhouse')
        suburbs.append(suburb.text)
        addresses.append(full_address.text)

In [138]:
# Parse through each property link to extract internal attributes

# The listing-level lists are combined by position, so they must line up.
# Fail early, before any page is fetched, rather than build shifted rows.
if not (len(property_links) == len(suburbs) == len(addresses) == len(property_type)):
    raise ValueError(
        "Listing lists have different lengths: "
        f"{len(property_links)} links, {len(suburbs)} suburbs, "
        f"{len(addresses)} addresses, {len(property_type)} property types"
    )

# Merge all current data: one row per property, laid out as
# [link, suburb, address, property_type]
properties = [
    [listing_link, listing_suburb, listing_address, listing_type]
    for listing_link, listing_suburb, listing_address, listing_type
    in zip(property_links, suburbs, addresses, property_type)
]

# Scrape internal attributes. Every pass appends exactly three values to the
# row (price, attributes, closest_school) so all rows keep the same width.
for row in properties:

    # Load the listing's own page and parse its HTML
    driver.get(row[0])
    content = driver.page_source
    soup = BeautifulSoup(content, features='html.parser')

    # Retrieve rent per week: take the title of the first summary block and
    # fall back to '' when the block or its title is missing, so exactly one
    # price value is appended whatever the page contains.
    summary = soup.find('div', attrs={'data-testid': 'listing-details__summary'})
    price = None
    if summary is not None:
        price = summary.find('div', attrs={"data-testid": "listing-details__summary-title"})
    row.append(price.text if price is not None else '')

    # Retrieve number of bathrooms, bedrooms and garages
    attributes = [
        feature.text
        for feature in soup.find_all('span', attrs={'data-testid': 'property-features-text-container'})
    ]
    # NOTE(review): more than four entries are cut to the first three, while
    # four or fewer are kept whole -- confirm these thresholds are intended.
    row.append(attributes[:3] if len(attributes) > 4 else attributes)

    # Retrieve proximity to the closest school: the first distance found on the
    # page is used (presumably the page lists the nearest school first --
    # TODO confirm). '' is stored when no school distance is available.
    school_distances = []
    for school in soup.find_all('li', attrs={'data-testid': 'fe-co-school-catchment-school'}):
        distance = school.find('div', attrs={'data-testid': 'fe-co-school-catchment-schoolDistance'})
        if distance is not None:
            school_distances.append(distance.text)

    row.append(school_distances[0] if school_distances else '')

In [139]:
# Build the raw dataset from the scraped rows, preview it, and write it to disk
column_names = [
    'property_link',
    'suburb',
    'address',
    'property_type',
    'price',
    'attributes',
    'closest_school',
]
df = pd.DataFrame(properties, columns=column_names)

# Show only the first rows rather than the whole frame
display(df.head())

# to_csv is called with its defaults, so the row index is written as well
df.to_csv("../data/raw/Raw_Rental_Property_Data.csv")

Unnamed: 0,property_link,suburb,address,property_type,price,attributes,closest_school
0,https://www.domain.com.au/14-curringa-ct-churc...,CHURCHILL VIC 3842,"14 Curringa Ct, CHURCHILL VIC 3842",House,$320 per week,"[3 Beds, 1 Bath, − Parking]",1 km away
1,https://www.domain.com.au/31-suzanne-street-da...,DANDENONG VIC 3175,"31 Suzanne Street, DANDENONG VIC 3175",House,$465,"[3 Beds, 1 Bath, 1 Parking]",0.7 km away
2,https://www.domain.com.au/65-bruckner-drive-po...,POINT COOK VIC 3030,"65 Bruckner Drive, POINT COOK VIC 3030",House,$420 pw,"[4 Beds, 2 Baths, 2 Parking]",1.3 km away
3,https://www.domain.com.au/16-june-square-ringw...,RINGWOOD NORTH VIC 3134,"16 June Square, RINGWOOD NORTH VIC 3134",House,$620 per week,"[3 Beds, 2 Baths, 6 Parking]",0.5 km away
4,https://www.domain.com.au/1-76-mckean-street-b...,BOX HILL NORTH VIC 3129,"1/76 McKean Street, BOX HILL NORTH VIC 3129",House,$620 per week,"[4 Beds, 2 Baths, 1 Parking]",0.7 km away
