In [1]:
# useful external data links
# ABS household related:
# https://www.abs.gov.au/statistics/people/housing/housing-occupancy-and-costs/2019-20#data-download
# Oldlistings historical data:
# https://www.oldlistings.com.au/real-estate/VIC/Melbourne/3000/rent/780

In [2]:
import pandas as pd
import itertools 
from bs4 import BeautifulSoup
import requests
from requests import get
import time
from random import seed
from random import random
from random import randint
import re
from csv import writer

In [4]:
## DATA FORMATTING 
## initializing lists and variables

# Example of html
# <div class="property odd clearfix" data-lat="-37.8068451" data-lng="144.9608544"> 
# <section class="grid-100 grid-parent"> 
# <section class="grid-65 tablet-grid-65 clearfix"> 
# <h2 class="address">3808/27 THERRY STREET, MELBOURNE</h2> 
# <p class="property-meta bed"><span>Bed :</span> 3</p> 
# <p class="property-meta bath"><span>Bath :</span> 2</p> 
# <p class="property-meta car"><span>Car :</span> 1</p> 
# <p class="property-meta type"><span>Category :</span> Available Now</p> </section> 
# <section class="grid-35 tablet-grid-35 price"> <span>Last Advertised Price : August 2022</span> <
# h3>$1,050</h3> </section> 
# <section class="grid-100 historical-price"> 
# <span>Historical Prices: </span> 
# <ul> <li><span>August 2022</span>$1,050</li> 
# <li><span>February 2021</span>$790</li> 
# <li><span>March 2019</span>$1,100</li> 
# <li><span>April 2017</span>$1,050</li> 
# <li><span>March 2017</span>$1,050 per week</li> 
# <li><span>February 2016</span>$1,025 per week</li> 
# <li><span>November 2014</span>$1,000 per week</li> </ul> </section> </section> </div>

def _meta_value(listing, css_class):
    """Return the value following "<label> : " in a ``property-meta`` paragraph.

    Falls back to the string 'none' when the paragraph is missing or when its
    text does not have the expected "<label> : <value>" shape.
    """
    paragraphs = listing.find_all('p', {"class": css_class})
    if not paragraphs:
        return 'none'
    match = re.search(r'[A-Za-z][ ]:[ ](.*)', paragraphs[0].text)
    return match.group(1) if match else 'none'


def html_phaser(houses):
    """Flatten scraped listing containers into one row per advertised price.

    Parameters
    ----------
    houses : iterable
        Parsed HTML containers exposing ``find_all``. Every
        ``<div class="... odd|even ...">`` found inside a container is treated
        as one property listing.

    Returns
    -------
    pandas.DataFrame
        Columns: address, latitude, longitude, nbed, nbath, ncar,
        historical_prices, historical_dates. All values are strings exactly as
        they appear in the HTML; a missing field is the string 'none'. A
        listing contributes one row for its "last advertised" price plus one
        row per ``<li>`` of its historical-price section. The index is a plain
        0..n-1 range.

    Notes
    -----
    Parsing is best-effort: an entry whose price or date cannot be extracted
    is reported with ``print`` and skipped. Price and date are only recorded
    together, so the two columns can never drift out of alignment.
    """
    columns = ["address", "latitude", "longitude", "nbed", "nbath", "ncar",
               "historical_prices", "historical_dates"]
    # Collect plain dicts and build the frame once at the end; concatenating a
    # one-row frame per record is quadratic and leaves every row with index 0.
    records = []

    for sections in houses:
        for listing in sections.find_all('div', class_=['odd', 'even']):
            rent_prices = []
            rent_dates = []

            # Most recent advertised price and the month it was advertised.
            current_html = str(listing.find_all('section', {"class": "grid-35 tablet-grid-35 price"}))
            current_price = re.search(r'\<h3\>(.*)\<\/h3\>', current_html)
            current_date = re.search(r'\:[ ](.*)\<\/span\>', current_html)
            if current_price and current_date:
                rent_prices.append(current_price.group(1))
                rent_dates.append(current_date.group(1))
            else:
                print("Current rent value exception")

            # Historical prices: only the first matching section is used, and a
            # listing without one simply has no history rows.
            history_sections = listing.find_all('section', {"class": "grid-100 historical-price"})
            history_items = history_sections[0].find_all('li') if history_sections else []
            for item in history_items:
                item_html = str(item)
                rent = re.search(r'\<\/span\>(.*)\<\/li\>', item_html)
                date = re.search(r'\<span\>(.*)\<\/span\>', item_html)
                if rent and date:
                    rent_prices.append(rent.group(1))
                    rent_dates.append(date.group(1))
                else:
                    print("Historical values exception")

            # Read the coordinates from the listing's own attributes so a
            # listing without them never inherits the previous listing's values.
            latitude = listing.get('data-lat') or 'none'
            longitude = listing.get('data-lng') or 'none'
            if 'none' in (latitude, longitude):
                print("Location values exception")

            nbed = _meta_value(listing, "property-meta bed")
            nbath = _meta_value(listing, "property-meta bath")
            ncar = _meta_value(listing, "property-meta car")

            address_tags = listing.find_all('h2', {"class": "address"})
            address = address_tags[0].text if address_tags else 'none'

            for price, date in zip(rent_prices, rent_dates):
                records.append({
                    "address": address, "latitude": latitude, "longitude": longitude,
                    "nbed": nbed, "nbath": nbath, "ncar": ncar,
                    "historical_prices": price, "historical_dates": date,
                })

    return pd.DataFrame(records, columns=columns)

    # print(data.head(10))
    # data.to_csv("../../data/curated/historical_rent_data.csv")
# concat all the different dataframes created, culminating in dfa (completed dataframe)
# result = pd.concat([df_price, df_agency], axis=1, sort=False)
# result2 = pd.concat([result, df_postcode], axis=1, sort=False)
# result3 = pd.concat([result2, df_bedrooms], axis=1, sort=False)
# dfa = pd.concat([result3, df_surface], axis=1, sort=False)

In [5]:
# dlink = 'https://www.oldlistings.com.au/real-estate/VIC/Melbourne/3000/rent/'
def retrive_hist(dlink, filename = "../../data/curated/historical_rent_data_all.csv", max_pages = 10, timeout = 30):
    """Scrape paginated rental listings and append the parsed rows to a CSV file.

    Parameters
    ----------
    dlink : str
        Base listing URL, e.g.
        'https://www.oldlistings.com.au/real-estate/VIC/Melbourne/3000/rent/'.
        Page 1 is fetched from ``dlink`` itself; page ``k`` (k >= 2) is fetched
        from ``dlink + str(k)``.
    filename : str
        CSV file that rows are appended to.
    max_pages : int
        Maximum number of pages to request (default 10).
    timeout : float
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the scrape indefinitely.

    Returns
    -------
    None
        Results are written to ``filename``; each requested URL is printed as
        a progress indicator.

    Notes
    -----
    Each CSV line holds the field values of one parsed record. A header row
    with the column names is written first when the file is empty.
    Scraping stops at the first page after page 1 that contains no
    ``div.content-col`` element. No delay is inserted between requests.
    """
    for page in range(1, max_pages + 1):
        # The first page has no page-number suffix.
        url = dlink if page == 1 else dlink + str(page)
        print(url)

        response = get(url, timeout=timeout)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        house_data = html_soup.find_all('div', class_="content-col")

        if not house_data:
            if page > 1:
                # An empty page past the first one marks the end of the results.
                print('empty')
                break
            # An empty first page does not stop the scrape; move on to page 2.
            continue

        data = html_phaser(list(house_data))
        if data.empty:
            continue

        # Append this page's records, one CSV line per record.
        with open(filename, 'a', newline='') as g:
            thewriter = writer(g)
            # In append mode the stream starts at the end of the file, so a
            # position of 0 means the file is empty and still needs its header.
            if g.tell() == 0:
                thewriter.writerow(list(data.columns))
            # Write the field values only; iterating with iterrows() would hand
            # the writer (index, Series) pairs instead of the record's fields.
            thewriter.writerows(data.itertuples(index=False, name=None))

In [6]:
# Site-map table on oldlistings.com.au for state=VIC (query: sort=asc, order=Postcode).
tlink = (
    "https://www.oldlistings.com.au/site-map"
    "?state=VIC&sort=asc&order=Postcode"
)
