In [1]:
import requests
from bs4 import BeautifulSoup
import re

from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

import time
from random import random
from datetime import datetime
import os

import pandas as pd
import numpy as np

from pprint import pprint


In [2]:
#Function to generate a randomised user agent for each connection 
def agent_randomiser():
    software_names = [SoftwareName.CHROME.value]
    operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]   

    user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=100)

    # Get list of user agents.
    user_agents = user_agent_rotator.get_user_agents()

    # Get Random User Agent String.
    user_agent = user_agent_rotator.get_random_user_agent()
    
    user_agent2 = {"User-Agent": user_agent}
    
    return user_agent2

In [3]:
#Specify suburbs; note if multiple suburbs, then need to 
suburbs = ['blackburn']
state = ['vic']
postcode = ['3130']

# create a list of root urls of suburbs to scrape
if len(suburbs) == 1:   
    root_url =  'https://www.domain.com.au/sold-listings/%s-%s-%s/?excludepricewithheld=1&ssubs=0' %(suburbs[0],state[0],postcode[0])
else:
    root_url = []
    for i,suburb in enumerate(suburbs):
        root_url.append('https://www.domain.com.au/sold-listings/%s-%s-%s/?excludepricewithheld=1&ssubs=0' %(suburb,state[i],postcode[i]))

In [4]:
# headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}

search={}

res = requests.get(root_url, headers=agent_randomiser())

if res.status_code == 200:
    print("Response from website:",res.status_code)

    soup_the_page = BeautifulSoup(res.content,features='lxml')
    properties_total_return = soup_the_page.find_all('h1',{'data-testid': 'summary'})
    total_properties = properties_total_return[0].text.split()[0]

    #This rounds up the number of URLs to srape from the base url
    total_pages = -(-(int(total_properties))//20)
    print("Pages to Search:",total_pages)

    #This is where the extracted information is appended to the search dictionary for use in later processing. 
    search.update({suburbs[0]: {'Suburb': suburbs[0],
                                'Total properties':total_properties,
                                'Pages to Search':total_pages,
                                'Root Url':root_url,
                                'Website Response': res.status_code}})

    print("Appended to the Search dictionary")
else:
    print("Response from website:",res.status_code)
    
    search.update({suburbs[0]: {'Suburb': 'UNABLE TO EXTRACT',
                                'Total properties':'UNABLE TO EXTRACT',
                                'Pages to Search':'UNABLE TO EXTRACT',
                                'Root Url':root_url,
                                'Website Response': res.status_code}})

print('\n',search)

Response from website: 200
Pages to Search: 80
Appended to the Search dictionary

 {'blackburn': {'Suburb': 'blackburn', 'Total properties': '1587', 'Pages to Search': 80, 'Root Url': 'https://www.domain.com.au/sold-listings/blackburn-vic-3130/?excludepricewithheld=1&ssubs=0', 'Website Response': 200}}


In [5]:
#6. This creates the list of specific job page urls to iterate through
#This is **NOT** the individual job ads but rather the top-level search pages for each search criteria that will need to be
#searched through to extract specific job URLs that will contain the required information.
#Output is appended to the search dictionary under the key URL Search Page List for each search term

for k,value in search.items():
#     print(k,value)
    web_url_iterations = []
    #This get the root URL and drops the final character which is a 0 for the page
    url = value.get('Root Url') + str('&page=') #[:-1]
    
    if value.get('Pages to Search') > 50:
        web_url_iterations.append(value.get('Root Url'))
        iterations = 2                             #<--THIS HAS BEEN SET TO 2 FOR TESTING. UPDATE TO 50 ONCE CONFIRMED WORKING
        for i in range(1,iterations):
            web_url_iterations.append(str(url + str((i+1))))
            value.update({'URL Search Page List': web_url_iterations})
            
    elif value.get('Pages to Search') == 1:
        web_url_iterations.append(value.get('Root Url'))
    
    elif value.get('Pages to Search') == 'UNABLE TO EXTRACT':
        print('Was unable to extract a root url. Check website response')
        value.update({'URL Search Page List': 'Was unable to extract a root url. Check website response'})
    
    else:
        web_url_iterations.append(value.get('Root Url'))
        iterations = value.get('Pages to Search')
        for i in range(1,iterations):
            web_url_iterations.append(str(url + str((i+1))))
            value.update({'URL Search Page List': web_url_iterations})


In [6]:
for k,value in search.items():
    search_url = value.get('URL Search Page List')
    unique_property_url = []
    
    for url in search_url:
        print('------------')
        print(url)
        
        time.sleep(random()*4)
        
        content = requests.get(url, headers=agent_randomiser())
        print("Response from website:",content.status_code)
        
        if content.status_code == 200:
            property_pane = BeautifulSoup(content.content,features='lxml')
            unique_links = property_pane.find_all('link',{'itemprop':'url'})
            for a in unique_links:
                unique_property_url.append(str(a).split('"')[1])
            value.update({'Specific property URLs': unique_property_url})
        else:
            print('An issue with the website occured. Please check website status code')


------------
https://www.domain.com.au/sold-listings/blackburn-vic-3130/?excludepricewithheld=1&ssubs=0
Response from website: 200
------------
https://www.domain.com.au/sold-listings/blackburn-vic-3130/?excludepricewithheld=1&ssubs=0&page=2
Response from website: 200


In [7]:
unique_property_brochure = []

for k,value in search.items():
    list_values = value.get('Specific property URLs')
    
    for pid in list_values:
        unique_id = pid.split("-")[-1]
        brochure_url =  'https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=%s&mode=sold' %(unique_id)
        unique_property_brochure.append(brochure_url)
#         print(brochure_url)
        
    value.update({'Brochure Links':unique_property_brochure})

In [8]:
all_properties=[]

counter = 0

for k,value in search.items():
    print('\nGetting information for',str(value.get('Suburb')).upper())
    print('---------------')
    property_urls_suburb = value.get('Brochure Links')
    
    property_original_urls = value.get('Specific property URLs')
#     print('List of property links for', str(value.get('Suburb')).upper())
#     pprint(property_urls_suburb)
    
    for i in property_urls_suburb:
        
#         os.system('cls')
#         print(i)
        time.sleep(random()*5)
        property_info = {}
        
        print('Connecting to ', i)
        web_response3 = requests.get(i, headers=agent_randomiser())
        print("Response from website:",web_response3.status_code)
        if web_response3.status_code == 200:
            property_extract = BeautifulSoup(web_response3.content,features='lxml')                    
            property_summary_pane = property_extract.find_all('div',{'style':'margin: 0 10px 10px'})
            
            #Search date for appending
            search_date_time = datetime.now()
            property_info['search_date'] = search_date_time.strftime("%d %b %y %T")
            
            #Original url
            property_info['url'] = i
            
            #Property address
            address = property_extract.find('h2').text.split("\r")[0]
            property_info['address'] = address
            
            #Property suburb
            property_info['suburb'] = k
            
            #Property sale value
            try:
                sold_price = property_extract.find('h1').text.split("$")[1].replace(",","")
                property_info['sold_price'] = sold_price
            except:
                property_info['sold_price'] = 'unknown'
            
            #Extract beds, baths, carspaces, and Land Area
            appending_info = []            
            
            strong_list = []

            for strong_tag in property_summary_pane[0].find_all('strong'):
            #     print(strong_tag.next_sibling.split())
                strong_list.append(strong_tag.next_sibling.split())

            for x in strong_list:
                if len(x) == 1:
                    pass
                elif len(x)>1:
                    property_info['bedrooms'] = 'unknown'
                    property_info['bathrooms'] = 'unknown'
                    property_info['carspaces'] = 'unknown'
                    for i,o in enumerate(x):
                        if o == 'bedrooms' or o == 'bedroom':
                            property_info['bedrooms'] = x[i-1]
                        if o == 'bathrooms' or o == 'bathroom':
                            property_info['bathrooms'] = x[i-1]
                        if o == 'carspace' or o == 'carspaces':
                            property_info['carspaces'] = x[i-1]  
    
            
            for st in property_extract.find_all('strong'):
                property_info['land area'] = 'unknown'
                if st.text == "Land area:":
                    property_info['land area'] = float(st.next_sibling.split()[0])
#                     print(float(st.next_sibling.split()[0]))
            
            all_properties.append(property_info)
            print('****')
        else:
            print("!!!!!!!!!!!!!!!!!!!!!")
            print("Something went wrong!")
            print("!!!!!!!!!!!!!!!!!!!!!")

            
            



Getting information for BLACKBURN
---------------
Connecting to  https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016114099&mode=sold
Response from website: 200
****
Connecting to  https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016129899&mode=sold
Response from website: 200
****
Connecting to  https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016127111&mode=sold
Response from website: 200
****
Connecting to  https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016118973&mode=sold
Response from website: 200
****
Connecting to  https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016115727&mode=sold
Response from website: 200
****
Connecting to  https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016097661&mode=sold
Response from website: 200
****
Connecting to  https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016093219&mode=sold
Response from website: 200
****
Connecting to  https://www.domain.com.au/Public/Prope

In [109]:
all_properties[0].value('url')

AttributeError: 'dict' object has no attribute 'value'

In [103]:
for pid,property_id in enumerate(all_properties):
    print(property_id)
    
    
#     time.sleep(random()*5)
#     property_info_2 = {}
    
#     web_response4 = requests.get(search[suburbs[0]]['Specific property URLs'][pid], headers=agent_randomiser())
#     print("Contacting", property_id)
#     print("Response from website:",web_response4.status_code)
    
#     if web_response4.status_code == 200:
#         property_extract_2 = BeautifulSoup(web_response4.content,features='lxml')                    
#         property_summary_pane_2 = property_extract_2.find_all('span',{'class':'listing-details__listing-tag is-sold listing-details__summary-tag'})
        
#         day_sold = property_summary_pane_2[0].text.split()[-3:][0]
#         month_sold = property_summary_pane_2[0].text.split()[-3:][1]
#         year_sold = property_summary_pane_2[0].text.split()[-3:][2]
        
#         property_id['day sold'] = day_sold
#         property_id['month sold'] = month_sold
#         property_id['year sold'] = year_sold
    
#     else:
#         property_id['day sold'] = 'unknown'
#         property_id['month sold'] = 'unknown'
#         property_id['year sold'] = 'unknown'
        
#     print('****')
    
#     original_property_link = se
#     print(pid)

{'search_date': '18 Apr 20 16:26:54', 'url': 'https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016114099&mode=sold', 'address': '22 Francis Street', 'suburb': 'blackburn', 'sold_price': '1199000', 'bedrooms': '4', 'bathrooms': '1', 'carspaces': '2', 'land area': 638.0, 'day sold': '9th', 'month sold': 'April', 'year sold': '2020'}
{'search_date': '18 Apr 20 16:26:57', 'url': 'https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016129899&mode=sold', 'address': '21 Patricia Road', 'suburb': 'blackburn', 'sold_price': '1255000', 'bedrooms': '2', 'bathrooms': '1', 'carspaces': '2', 'land area': 651.0, 'day sold': '4th', 'month sold': 'April', 'year sold': '2020'}
{'search_date': '18 Apr 20 16:27:01', 'url': 'https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016127111&mode=sold', 'address': '4/21 Laburnum Street', 'suburb': 'blackburn', 'sold_price': '1190000', 'bedrooms': '3', 'bathrooms': '2', 'carspaces': '2', 'land area': 274.0, 'day sold': '1st', 'month 

In [97]:
property_id

{'search_date': '18 Apr 20 16:27:15',
 'url': 'https://www.domain.com.au/Public/PropertyBrochure.aspx?adid=2016078202&mode=sold',
 'address': '28 Tyrrell Avenue',
 'suburb': 'blackburn',
 'sold_price': '1690000',
 'bedrooms': '3',
 'bathrooms': '2',
 'carspaces': '1',
 'land area': 1190.0}

In [94]:
property_summary_pane_2[0].text.split()[-3:][0]


'1st'

In [None]:
        for index,property_id in enumerate(property_original_urls):
            time.sleep(random()*5)
            property_info_original = {}
        
            print('Connecting to ', property_id)
        web_response3 = requests.get(i, headers=agent_randomiser())
        print("Response from website:",web_response3.status_code)
        if web_response3.status_code == 200:
            property_extract = BeautifulSoup(web_response3.content,features='lxml')                    
            property_summary_pane = property_extract.find_all('div',{'style':'margin: 0 10px 10px'})            
        
#         os.system('cls')
#         print(i)
        time.sleep(random()*5)
        property_info = {}

In [35]:
dt_list = []

description_test = property_extract.find_all('h3',{'class':'agentheading'})
print(description_test[2].next_sibling)
for d in description_test:
    print(d.next_sibling)


					Brand new on own title
					





					Brand new on own title
					


In [10]:
df = pd.DataFrame(all_properties)
df.to_csv('df.csv')
df

Unnamed: 0,search_date,url,address,suburb,sold_price,bedrooms,bathrooms,carspaces,land area
0,18 Apr 20 16:26:54,https://www.domain.com.au/Public/PropertyBroch...,22 Francis Street,blackburn,1199000,4,1,2,638
1,18 Apr 20 16:26:57,https://www.domain.com.au/Public/PropertyBroch...,21 Patricia Road,blackburn,1255000,2,1,2,651
2,18 Apr 20 16:27:01,https://www.domain.com.au/Public/PropertyBroch...,4/21 Laburnum Street,blackburn,1190000,3,2,2,274
3,18 Apr 20 16:27:02,https://www.domain.com.au/Public/PropertyBroch...,1 Harold Street,blackburn,1180000,5,2,2,591
4,18 Apr 20 16:27:07,https://www.domain.com.au/Public/PropertyBroch...,15 Pakenham Street,blackburn,1400000,3,1,2,692
5,18 Apr 20 16:27:09,https://www.domain.com.au/Public/PropertyBroch...,87 Canterbury Road,blackburn,1075000,4,2,2,611
6,18 Apr 20 16:27:12,https://www.domain.com.au/Public/PropertyBroch...,1/24-26 Service Road,blackburn,1108000,3,2,2,305
7,18 Apr 20 16:27:13,https://www.domain.com.au/Public/PropertyBroch...,39 Pope Road,blackburn,1338000,3,2,1,600
8,18 Apr 20 16:27:15,https://www.domain.com.au/Public/PropertyBroch...,28 Tyrrell Avenue,blackburn,1690000,3,2,1,1190
9,18 Apr 20 16:27:20,https://www.domain.com.au/Public/PropertyBroch...,314/21 Queen Street,blackburn,530000,2,2,2,unknown
