In [25]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd
import re
import time
import sys

In [26]:
def get_location(detail):
    location = detail.find('span',{'data-testid':"address-line2"}).get_text()
    return location

In [27]:
def get_price(detail):
    price = detail.p.get_text()
    return price
    

In [28]:
def get_features(detail):
    feature_list = []
    features = detail.find_all("span", {"class": "css-lvv8is"})
    bed = features[0].get_text()
    bath = features[1].get_text()
    parking = features[2].get_text()
    feature_list.extend([bed,bath,parking])
    return feature_list

In [29]:
def job_done(page):
    print(f'Job done, {page} pages returned')
    sys.exit()
    

In [30]:
def get_next_page (page,next_page,original_url):
    if page==1 and len(next_page)==0: #check if there is only one page of listings 
        job_done(page)
    if page>1 and len(next_page)==1: #check if current page is last page of listings 
        job_done(page)
    elif page==1:
        next_page = next_page[0].get('href')
    else:
        next_page = next_page[1].get('href')
        
    last_slash_index = next_page.rfind("/")
    next_page = original_url + next_page[last_slash_index + 1:]
    
    return next_page #return url of next page in listings
    

In [31]:
#dataframe to hold property data
df = pd.DataFrame(columns=['Price', 'Property_Type', 'Bed',
                'Bath','Parking','Location','Link'])
counter = 0 #counter to track number of properties parsed
page = 1 #counter to track number of pages
row_index=0 #indexer for dataframe
url = ('https://www.domain.com.au/rent/sydney-nsw-2000/') #url of first page
original_url = url

while True:
    req = Request(url, headers = {'User-Agent':'Safari/537.36'})
    try:
        webpage = urlopen(req).read()
    except:
        sys.exit('page not returned')
    soup = BeautifulSoup(webpage, 'html.parser')
    details = soup.find_all("div", {"class": "css-qrqvvg"})
    if len(details) == 0:
        job_done(page)
    for detail in details:
        counter+=1
        try:
            price = get_price(detail)
            prop_type = detail.find("span", {"class": "css-693528"}).get_text()
            bed = get_features(detail)[0]
            bath = get_features(detail)[1]
            parking = get_features(detail)[2]
            location = get_location(detail)
            link = detail.a.get('href')
            
            #create dictionary of property features
            info_dict = {'Price':price, 'Property_Type':prop_type, 
                'Bed':bed, 'Bath':bath,'Parking':parking,
                'Location':location,'Link':link}
            info_df = pd.DataFrame(info_dict, index=[row_index])
            df = pd.concat([df, info_df])
            row_index+=1
            
        #print details of any properties not entered
        except:
            print('property number {} not entered'.format(counter))
            print(url)
            print(link)
            
    print(len(df)) #output length of dataframe after each page is parsed
    
    next_page = soup.find_all("a", {"class": "css-xixru3"}) #gets list of urls to neighbouring page(s)
    url = get_next_page(page,next_page,original_url)    
    page+=1
    time.sleep(5)

21
41
61
81
101
121
141
161
181
201
221
property number 229 not entered
https://www.domain.com.au/rent/sydney-nsw-2000/?page=12
https://www.domain.com.au/91-liverpool-street-sydney-nsw-2000-16432912
240
property number 253 not entered
https://www.domain.com.au/rent/sydney-nsw-2000/?page=13
https://www.domain.com.au/level-69-6902-117-bathurst-street-sydney-nsw-2000-16867364
259
279
299
319
339
359
379
399
419
439
459
479
499
519
539
559
579
599
619
639
659
679
699
719
property number 732 not entered
https://www.domain.com.au/rent/sydney-nsw-2000/?page=37
https://www.domain.com.au/503-1-wattle-crescent-pyrmont-nsw-2009-16930408
738
758
778
798
818
838
858
property number 876 not entered
https://www.domain.com.au/rent/sydney-nsw-2000/?page=44
https://www.domain.com.au/601-88-hay-street-haymarket-nsw-2000-16725238
872
Job done, 44 pages returned


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [32]:
df.to_csv('properties_raw_df.csv', index=False) 