In [135]:
#Import relevant Python libraries
import requests
from bs4 import BeautifulSoup
import re

from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

import time
from random import random
from datetime import datetime
import os

import pandas as pd
import numpy as np

from pprint import pprint

from IPython.display import clear_output

import pickle

In [136]:
#Function to generate a randomised user agent for each connection 
def agent_randomiser():
    """Build request headers carrying a randomly chosen browser user agent.

    A fresh ``UserAgent`` rotator is created on every call, restricted to
    Chrome on Windows or Linux and constructed with ``limit=100``; one user
    agent string is then drawn from it at random.

    Returns:
        dict: ``{"User-Agent": <user agent string>}``, suitable for the
        ``headers`` argument of ``requests.get``.
    """
    software_names = [SoftwareName.CHROME.value]
    operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]

    user_agent_rotator = UserAgent(
        software_names=software_names,
        operating_systems=operating_systems,
        limit=100,
    )

    # Only one random entry is needed; the full list of agents is never used.
    return {"User-Agent": user_agent_rotator.get_random_user_agent()}

In [137]:
# Suburb selection. Only one suburb per run is supported at the moment, so run
# the notebook once per suburb if several are needed.
# The three lists are positional: suburbs[n], state[n] and postcode[n] must
# describe the same place.
suburbs = ['docklands']
state = ['vic']
postcode = ['3008']

# Root sold-listings URL(s) to scrape: a plain string when exactly one suburb is
# listed, otherwise a list holding one URL per suburb.
root_url = [
    'https://www.domain.com.au/sold-listings/%s-%s-%s/?excludepricewithheld=1&ssubs=0' %(suburbs[n],state[n],postcode[n])
    for n in range(len(suburbs))
]
if len(root_url) == 1:
    root_url = root_url[0]

In [138]:
# Fetch the first results page for the suburb and record, in `search`, how many
# sold listings it reports and how many result pages that implies. Later cells
# read everything they need from `search`.
search = {}

# Listings per results page, used to turn the listing count into a page count.
RESULTS_PER_PAGE = 20

res = requests.get(root_url, headers=agent_randomiser())
print("Response from website:", res.status_code)

total_properties = None
total_pages = None

if res.status_code == 200:
    soup_the_page = BeautifulSoup(res.content, features='lxml')
    summary_heading = soup_the_page.find('h1', {'data-testid': 'summary'})
    summary_words = summary_heading.text.split() if summary_heading is not None else []

    if summary_words:
        try:
            # The first word of the heading is parsed as the listing count.
            # Commas are stripped in case large counts are written like "1,479".
            property_count = int(summary_words[0].replace(",", ""))
        except ValueError:
            property_count = None

        if property_count is not None:
            total_properties = summary_words[0]
            # Ceiling division: a partly filled last page still has to be visited.
            total_pages = -(-property_count // RESULTS_PER_PAGE)

if total_pages is not None:
    print("Pages to Search:", total_pages)

    # Everything the later cells need for this suburb.
    search[suburbs[0]] = {'Suburb': suburbs[0],
                          'Total properties': total_properties,
                          'Pages to Search': total_pages,
                          'Root Url': root_url,
                          'Website Response': res.status_code}

    print("Appended to the Search dictionary")
else:
    # Reached on a non-200 response, or when the summary heading is missing or
    # does not start with a number.
    print("Something went wrong.")

    search[suburbs[0]] = {'Suburb': 'UNABLE TO EXTRACT',
                          'Total properties': 'UNABLE TO EXTRACT',
                          'Pages to Search': 'UNABLE TO EXTRACT',
                          'Root Url': root_url,
                          'Website Response': res.status_code}

print('\n', search)

Response from website: 200
Pages to Search: 74
Appended to the Search dictionary

 {'docklands': {'Suburb': 'docklands', 'Total properties': '1479', 'Pages to Search': 74, 'Root Url': 'https://www.domain.com.au/sold-listings/docklands-vic-3008/?excludepricewithheld=1&ssubs=0', 'Website Response': 200}}


In [139]:
# Build, for every suburb in `search`, the list of results-page URLs to visit and
# store it under 'URL Search Page List'.

# Domain only lets 50 pages of sold results be viewed.
MAX_SOLD_PAGES = 50
# Number of pages fetched when a suburb has more than MAX_SOLD_PAGES pages.
# Kept small (5) to speed up test runs; set to MAX_SOLD_PAGES for a full extraction.
PAGES_WHEN_OVER_LIMIT = 5

for k, value in search.items():
    pages_available = value.get('Pages to Search')
    first_page_url = value.get('Root Url')

    # The page count is a non-integer placeholder when the summary request failed.
    # Check this first: comparing the placeholder with a number raises TypeError.
    if not isinstance(pages_available, int):
        print('Was unable to extract a root url. Check website response')
        # An empty list, so anything iterating the page list simply does nothing.
        value.update({'URL Search Page List': []})
        continue

    if pages_available > MAX_SOLD_PAGES:
        pages_to_fetch = PAGES_WHEN_OVER_LIMIT
    else:
        pages_to_fetch = pages_available

    # Page 1 is the root URL itself; later pages append '&page=<n>' to it.
    web_url_iterations = [first_page_url]
    web_url_iterations.extend(
        first_page_url + '&page=' + str(page_number)
        for page_number in range(2, pages_to_fetch + 1)
    )

    # Always stored, including for single-page suburbs.
    value.update({'URL Search Page List': web_url_iterations})

In [140]:
# Visit every results page of each suburb and collect the URL of each listing
# found on it into 'Specific property URLs'.
for k, value in search.items():
    search_url = value.get('URL Search Page List')
    if search_url is None:
        # No page list recorded for this suburb: fall back to its root results page.
        search_url = [value.get('Root Url')]
    elif isinstance(search_url, str):
        # A string here is a message rather than a list of URLs; iterating it
        # would issue one request per character.
        search_url = []

    unique_property_url = []

    for url in search_url:
        clear_output(wait=True)
        print('------------')
        print('Attempting to extract unique property URLS from:', url)

        # Random pause between requests.
        time.sleep(random()*4)

        content = requests.get(url, headers=agent_randomiser())
        print("Response from website:",content.status_code)

        if content.status_code == 200:
            property_pane = BeautifulSoup(content.content,features='lxml')
            for link in property_pane.find_all('link',{'itemprop':'url'}):
                # Read the attribute directly instead of slicing the tag's string
                # form, which depends on the order the attributes are written in.
                href = link.get('href')
                if href:
                    unique_property_url.append(href)
        else:
            print('An issue with the website occured. Please check website status code')

    # Stored even when no page could be read, so the key always holds a list.
    value.update({'Specific property URLs': unique_property_url})


------------
Attempting to extract unique property URLS from: https://www.domain.com.au/sold-listings/docklands-vic-3008/?excludepricewithheld=1&ssubs=0&page=5
Response from website: 200


In [141]:
# Turn every listing URL into the URL of its printable brochure page.
# `unique_property_brochure` accumulates the links of all suburbs, while each
# suburb's own 'Brochure Links' entry holds only that suburb's links.
unique_property_brochure = []

for k, value in search.items():
    # Treat a missing listing list as empty instead of failing on None.
    list_values = value.get('Specific property URLs') or []

    # The listing id is the text after the last '-' of the listing URL.
    suburb_brochures = [
        'https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=%s&mode=sold' %(pid.split("-")[-1])
        for pid in list_values
    ]

    unique_property_brochure.extend(suburb_brochures)
    value.update({'Brochure Links': suburb_brochures})

In [142]:
# Scrape each property's brochure page into one dictionary per property.
# Records are appended to `all_properties`; brochure pages that do not return
# HTTP 200 are reported and produce no record.
all_properties = []


def _parse_feature_counts(tokens):
    """Return the bedroom/bathroom/carspace counts found in a list of words.

    Each count is the integer word immediately before its label, e.g.
    ``['2', 'bedrooms', '1', 'bathroom']`` gives
    ``{'bedrooms': 2, 'bathrooms': 1}``. Labels that have no preceding word,
    or whose preceding word is not an integer, are left out.
    """
    label_to_key = {
        'bedroom': 'bedrooms', 'bedrooms': 'bedrooms',
        'bathroom': 'bathrooms', 'bathrooms': 'bathrooms',
        'carspace': 'carspaces', 'carspaces': 'carspaces',
    }
    counts = {}
    for position, token in enumerate(tokens):
        key = label_to_key.get(token)
        if key is None or position == 0:
            continue
        try:
            counts[key] = int(tokens[position - 1])
        except ValueError:
            continue
    return counts


def _parse_description(page):
    """Return the property description from a brochure page.

    The candidate for each ``h3.agentheading`` element is its fifth following
    sibling; the first candidate longer than 30 words is returned with its
    paragraph tags and line-break/tab characters removed. Returns
    'No description extracted' when no candidate qualifies.
    """
    for heading in page.find_all('h3',{'class':'agentheading'}):
        node = heading
        for _ in range(5):
            if node is None:
                break
            node = node.next_sibling
        if node is None:
            continue
        markup = str(node)
        if len(markup.split()) > 30:
            # Clean the candidate that passed the length check.
            return markup.replace("<p>","").replace("\r","").replace("\n","").replace("\t","").replace("</p>","")
    return 'No description extracted'


for k,value in search.items():
    print('\nGetting information for',str(value.get('Suburb')).upper())
    print('---------------')
    property_urls_suburb = value.get('Brochure Links') or []

    for p,brochure_url in enumerate(property_urls_suburb):
        # Random pause between requests.
        time.sleep(random()*5)
        clear_output(wait=True)

        print('Connecting to ', brochure_url)
        print(p+1,'of',len(property_urls_suburb))
        web_response3 = requests.get(brochure_url, headers=agent_randomiser())
        print("Response from website:",web_response3.status_code)

        if web_response3.status_code != 200:
            # TODO: insert a record of 'unknown' values for this URL instead of skipping it.
            print("!!!!!!!!!!!!!!!!!!!!!")
            print("Something went wrong!")
            print("!!!!!!!!!!!!!!!!!!!!!")
            continue

        print("Successfully extracted information")
        property_extract = BeautifulSoup(web_response3.content,features='lxml')
        property_summary_pane = property_extract.find_all('div',{'style':'margin: 0 10px 10px'})

        property_info = {}

        #Search date for appending (explicit %H:%M:%S, as %T is not a portable format code)
        property_info['search_date'] = datetime.now().strftime("%d %b %y %H:%M:%S")

        #Brochure url the record was read from
        property_info['url'] = brochure_url

        #Property address: text of the first h2, up to the first carriage return
        address_heading = property_extract.find('h2')
        if address_heading is not None:
            property_info['address'] = address_heading.text.split("\r")[0]
        else:
            property_info['address'] = 'unknown'

        property_info['state'] = state[0]

        #Property suburb
        property_info['suburb'] = k

        #Property sale value: the number after '$' in the first h1
        try:
            property_info['sold_price'] = int(property_extract.find('h1').text.split("$")[1].replace(",",""))
        except (AttributeError, IndexError, ValueError):
            # No h1, no '$' in it, or a non-numeric amount.
            property_info['sold_price'] = 'unknown'

        #Beds, baths and carspaces: default once, then overwrite only with parsed values
        property_info['bedrooms'] = 'unknown'
        property_info['bathrooms'] = 'unknown'
        property_info['carspaces'] = 'unknown'
        if property_summary_pane:
            for strong_tag in property_summary_pane[0].find_all('strong'):
                trailing_text = strong_tag.next_sibling
                # Only plain text following the tag can be split into words.
                if isinstance(trailing_text, str):
                    property_info.update(_parse_feature_counts(trailing_text.split()))

        #Land area: default once so a parsed value is not reset by later tags
        property_info['land area'] = 'unknown'
        for st in property_extract.find_all('strong'):
            if st.text != "Land area:":
                continue
            try:
                property_info['land area'] = float(st.next_sibling.split()[0])
            except (AttributeError, IndexError, TypeError, ValueError):
                # Nothing usable follows the label; keep the current value.
                pass

        #Description for the property from the brochure page
        property_info['description'] = _parse_description(property_extract)

        all_properties.append(property_info)
        print('****')

#Sale date and property type come from the original listing page rather than the
#brochure page. This pass visits each record's listing page and adds them.
print('-------------------------------------')
print('NOW COLLLECTING SALE DATE INFORMATION')
print('-------------------------------------')


def _parse_sale_date(page):
    """Return ``(day, month, year)`` read from a listing page's sold tag.

    The last three words of the tag text are taken as day, month and year,
    with the year converted to ``int``. Returns three 'unknown' strings when
    the tag is missing, has fewer than three words, or the year is not an
    integer.
    """
    unknown = ('unknown', 'unknown', 'unknown')
    sold_tag = page.find('span',{'class':'listing-details__listing-tag is-sold listing-details__summary-tag'})
    if sold_tag is None:
        return unknown
    words = sold_tag.text.split()[-3:]
    if len(words) < 3:
        return unknown
    try:
        year = int(words[2])
    except ValueError:
        return unknown
    return words[0], words[1], year


# Map each listing id (the text after the last '-' of a listing URL) to its URL.
# Records are matched to listings by id, not by position, because a property
# whose brochure request failed has no record and positions would then drift.
listing_url_by_id = {}
for suburb_details in search.values():
    listing_urls = suburb_details.get('Specific property URLs')
    if isinstance(listing_urls, list):
        for listing_url in listing_urls:
            listing_url_by_id.setdefault(listing_url.split("-")[-1], listing_url)

for pid,property_id in enumerate(all_properties):

    # Random pause between requests.
    time.sleep(random()*5)

    clear_output(wait=True)
    print(pid+1,'of',len(all_properties))

    # Defaults first, so every record ends up with the same keys.
    property_id['day sold'] = 'unknown'
    property_id['month sold'] = 'unknown'
    property_id['year sold'] = 'unknown'
    property_id['type'] = 'unknown'

    # The record's 'url' is its brochure URL, which carries the id as 'adid=<id>'.
    ad_id_match = re.search(r'adid=([^&]+)', str(property_id.get('url', '')))
    listing_url = listing_url_by_id.get(ad_id_match.group(1)) if ad_id_match else None
    if listing_url is None:
        print("No listing page recorded for", property_id.get('url'))
        print('****')
        continue

    web_response4 = requests.get(listing_url, headers=agent_randomiser())
    print("Contacting", listing_url)
    print("Response from website:",web_response4.status_code)

    if web_response4.status_code == 200:
        property_extract_2 = BeautifulSoup(web_response4.content,features='lxml')

        day_sold, month_sold, year_sold = _parse_sale_date(property_extract_2)
        property_id['day sold'] = day_sold
        property_id['month sold'] = month_sold
        property_id['year sold'] = year_sold

        type_tag = property_extract_2.find('span',{'class':'listing-details__property-type-features-text'})
        if type_tag is not None:
            property_id['type'] = type_tag.text

        # Second chance at the price when it could not be read earlier.
        if property_id.get('sold_price', 'unknown') == 'unknown':
            try:
                property_id['sold_price'] = int(property_extract_2.find('div',{'class':'listing-details__summary-title','data-testid':'listing-details__summary-title'}).text.split("$")[1].replace(",",""))
            except (AttributeError, IndexError, ValueError):
                # Title missing, no '$' in it, or a non-numeric amount.
                property_id['sold_price'] = 'unknown'

    print('****')


100 of 100
Contacting https://www.domain.com.au/1015-673-la-trobe-street-docklands-vic-3008-2011168617
Response from website: 200
****


In [143]:
#Save search as a file as excel for use later if required. 
# Write all scraped records to a timestamped Excel workbook under ./previous_searches.
path = "./previous_searches"
# Creates the folder when missing and is a no-op when it already exists.
os.makedirs(path, exist_ok=True)

df_previous = pd.DataFrame(all_properties)

date = datetime.now()
# Day, month, year, then the time as HHMMSS without colons. Spelled out with
# %H%M%S because %T is not a portable strftime code.
filename_sold = path + "/" + date.strftime("%d%b%y_%H%M%S") + "_" + str(suburbs[0]) + "_" + str(state[0]) + "_" + str(postcode[0]) + "_sold.xlsx"
# Excel sheet names are limited to 31 characters.
sheet_name = (str(suburbs[0]) + str(postcode[0]) + " SOLD")[:31]

df_previous.to_excel(filename_sold,sheet_name=sheet_name, index=False)