In [65]:
import re
from json import dump
import requests

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking

# begin code
url_links = []
property_metadata = defaultdict(dict)

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=price-desc&page={page}"
    bs_object = BeautifulSoup(requests.get(
    url, headers=headers).text, "html.parser")

    # find the unordered list (ul) elements which are the results, then
    # find all href (a) tags that are from the base_url website.
    index_links = bs_object \
        .find(
            "ul",
            {"data-testid": "results"}
        ) \
        .findAll(
            "a",
            href=re.compile(f"{BASE_URL}/*") # the `*` denotes wildcard any
        )

    for link in index_links:
        # if its a property address, add it to the list
        if 'address' in link['class']:
            url_links.append(link['href'])


# for each url, scrape some basic metadata
for property_url in url_links[1:]:
    # bs_object = BeautifulSoup(urlopen(property_url), "lxml")
    bs_object = BeautifulSoup(requests.get(property_url, headers=headers).text, "html.parser")

    # looks for the header class to get property name
    property_metadata[property_url]['name'] = bs_object \
        .find("h1", {"class": "css-164r41r"}) \
        .text

    # looks for the div containing a summary title for cost
    property_metadata[property_url]['cost_text'] = bs_object \
        .find("div", {"data-testid": "listing-details__summary-title"}) \
        .text

    # extract coordinates from the hyperlink provided
    # i'll let you figure out what this does :P
    property_metadata[property_url]['coordinates'] = [
        float(coord) for coord in re.findall(
            r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
            bs_object \
                .find(
                    "a",
                    {"target": "_blank", 'rel': "noopener noreferer"}
                ) \
                .attrs['href']
        )[0].split(',')
    ]

    property_metadata[property_url]['rooms'] = [
        re.findall(r'\d\s[A-Za-z]+', feature.text) for feature in bs_object \
            .find("div", {"data-testid": "property-features"}) \
            .findAll("span", {"data-testid": "property-features-text-container"})
    ]
    
    information = bs_object.find("div", {"data-testid": "strip-content-list"})
    if information :
        property_metadata[property_url]['external_information'] = information.text
    else: 
        property_metadata[property_url]['external_information'] = None

    property_metadata[property_url]['desc'] = re \
        .sub(r'<br\/>', '\n', str(bs_object.find("p"))) \
        .strip('</p>')

# output to example json in data/raw/
with open('../data/raw/example.json', 'w') as f:
    dump(property_metadata, f)

In [79]:
import json
import pandas as pd
  
# Opening JSON file
f = open('../data/raw/example.json')
  
# returns JSON object as 
# a dictionary
data = json.load(f)
df2 = pd.DataFrame.from_dict(data, orient="index")

In [80]:
df2

Unnamed: 0,name,cost_text,coordinates,rooms,external_information,desc
https://www.domain.com.au/upstairs-2c-staley-street-brunswick-vic-3056-13168913,Upstairs 2C Staley Street Brunswick VIC 3056,"$35,000 Annually","[-37.7655919, 144.9633048]","[[0 Beds], [0 Baths], [2 Parking]]",Date Available: Available NowInternal area 337m²,1st floor offices/studios\nEasy walking distan...
https://www.domain.com.au/level-3-302-13-15-lake-street-caroline-springs-vic-3023-15994395,"Level 3, 302/13-15 Lake Street Caroline Spring...",4125000 pw,"[-37.7316459, 144.7446886]",[],Date Available: Available Now,- Centrally located with tranquil views.\n- Le...
https://www.domain.com.au/3502-14-16-the-esplanade-st-kilda-vic-3182-16002767,3502/14-16 The Esplanade St Kilda VIC 3182,"$6,000","[-37.8650177, 144.9746821]","[[3 Beds], [3 Baths], [3 Parking]]",Date Available: Available NowBond $36000,Inspired by the interplay of timeless design a...
https://www.domain.com.au/9-lansdowne-street-blairgowrie-vic-3942-12127675,9 Lansdowne Street Blairgowrie VIC 3942,"$5,000 per week","[-38.372703, 144.7856897]","[[3 Beds], [2 Baths], [3 Parking], []]","Available from Friday, 23rd December 2022Bond ...",Phone enquiry code for this property : 2751
https://www.domain.com.au/52-black-st-brighton-vic-3186-15410646,52 Black St Brighton VIC 3186,From $5500 per week,"[-37.9159452, 144.9989003]","[[3 Beds], [4 Baths], [3 Parking], []]","Available from Tuesday, 1st November 2022Bond ...","class=""css-dxogle"">* Unverified feature<svg a..."
...,...,...,...,...,...,...
https://www.domain.com.au/1-11-bay-street-parkdale-vic-3195-16050233,1/11 Bay Street Parkdale VIC 3195,$795,"[-38.00308, 145.0810804]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $3454,"class=""css-dxogle"">* Unverified feature<svg a..."
https://www.domain.com.au/2-sutton-grove-richmond-vic-3121-16056837,2 Sutton Grove Richmond VIC 3121,$795 per week,"[-37.8225969, 144.990798]","[[2 Beds], [2 Baths], [1 Parking]]","Available from Friday, 2nd September 2022Bond ...",Angela Clark
https://www.domain.com.au/2-261-williams-road-south-yarra-vic-3141-16043406,2/261 Williams Road South Yarra VIC 3141,$795 per week,"[-37.8420314, 145.004217]","[[2 Beds], [1 Bath], [1 Parking]]",Date Available: Available NowBond $3454,"Like nothing you've ever encountered, this tho..."
https://www.domain.com.au/1005e-18-hoff-bvd-southbank-vic-3006-16005292,1005E/18 Hoff Bvd Southbank VIC 3006,$795,"[-37.825817, 144.9640414]","[[3 Beds], [2 Baths], [1 Parking]]",Date Available: Available NowBond $3454,This rare find high floor premium apartment wi...


In [81]:
l = []
for i in range(len(df2)):
    if(len(df2['rooms'][i]) != 0):
        l.append(True)
    else:
        l.append(False)
df3 = df2[l]

In [82]:
df3 = df3.reset_index()
df3.rename(columns={'index':'link'}, inplace = True)

In [84]:
import re
bedroom_list = []
bathroom_list = []
parking_list = []
for i in range(len(df3)):
    bedroom_list.append(re.search('\d+', df3['rooms'][i][0][0]).group())
    bathroom_list.append(re.search('\d+', df3['rooms'][i][1][0]).group())
    if(len(df3['rooms'][i][2])!= 0):
        parking_list.append(re.search('\d+', df3['rooms'][i][2][0]).group())
    else:
        parking_list.append('0')
    # df3['bathroom'][i] = re.search('\d+', df3['rooms'][i][1][0]).group()
    # df3['parking'][i] = re.search('\d+', df3['rooms'][i][2][0]).group()

In [85]:
df3['bedroom'] = bedroom_list
df3['bathroom'] = bathroom_list
df3['parking'] = parking_list

In [86]:
df3

Unnamed: 0,link,name,cost_text,coordinates,rooms,external_information,desc,bedroom,bathroom,parking
0,https://www.domain.com.au/upstairs-2c-staley-s...,Upstairs 2C Staley Street Brunswick VIC 3056,"$35,000 Annually","[-37.7655919, 144.9633048]","[[0 Beds], [0 Baths], [2 Parking]]",Date Available: Available NowInternal area 337m²,1st floor offices/studios\nEasy walking distan...,0,0,2
1,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,"$6,000","[-37.8650177, 144.9746821]","[[3 Beds], [3 Baths], [3 Parking]]",Date Available: Available NowBond $36000,Inspired by the interplay of timeless design a...,3,3,3
2,https://www.domain.com.au/9-lansdowne-street-b...,9 Lansdowne Street Blairgowrie VIC 3942,"$5,000 per week","[-38.372703, 144.7856897]","[[3 Beds], [2 Baths], [3 Parking], []]","Available from Friday, 23rd December 2022Bond ...",Phone enquiry code for this property : 2751,3,2,3
3,https://www.domain.com.au/52-black-st-brighton...,52 Black St Brighton VIC 3186,From $5500 per week,"[-37.9159452, 144.9989003]","[[3 Beds], [4 Baths], [3 Parking], []]","Available from Tuesday, 1st November 2022Bond ...","class=""css-dxogle"">* Unverified feature<svg a...",3,4,3
4,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,$4500 Per Week,"[-37.8141725, 144.9740049]","[[3 Beds], [2 Baths], [4 Parking]]",Date Available: Available NowBond $27000,Possibly the most prestigious CBD address in t...,3,2,4
...,...,...,...,...,...,...,...,...,...,...
989,https://www.domain.com.au/1-bluff-rd-merricks-...,1 Bluff Rd Merricks Beach VIC 3926,$795,"[-38.3997481, 145.1048468]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $3454,"class=""css-dxogle"">* Unverified feature<svg a...",3,2,2
990,https://www.domain.com.au/1-11-bay-street-park...,1/11 Bay Street Parkdale VIC 3195,$795,"[-38.00308, 145.0810804]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $3454,"class=""css-dxogle"">* Unverified feature<svg a...",3,2,2
991,https://www.domain.com.au/2-sutton-grove-richm...,2 Sutton Grove Richmond VIC 3121,$795 per week,"[-37.8225969, 144.990798]","[[2 Beds], [2 Baths], [1 Parking]]","Available from Friday, 2nd September 2022Bond ...",Angela Clark,2,2,1
992,https://www.domain.com.au/2-261-williams-road-...,2/261 Williams Road South Yarra VIC 3141,$795 per week,"[-37.8420314, 145.004217]","[[2 Beds], [1 Bath], [1 Parking]]",Date Available: Available NowBond $3454,"Like nothing you've ever encountered, this tho...",2,1,1


In [87]:
def amount(text):
    amount = re.findall(r'\d+,?.\d+.?\d+',text)
    if len(amount) == 0:
        return None
    elif len(amount) == 1:       
        rent = float(amount[0].replace(',',''))
    else:
        sum = 0
        for num in amount:
            num = float(num.replace(',',''))
            sum += num
        rent = sum/len(amount)
    if('Annually' in text):
        weekly_rent = rent/52
    else:
        weekly_rent = rent
    return weekly_rent
        
        


In [88]:
df3['weekly_rent'] = df3['cost_text'].apply(amount)

In [89]:
def postcode(text):
    postcode = re.findall(r'VIC \d+',text)
    if(len(postcode) == 0):
        return None
    else:
        return postcode[0].split(' ')[1]
   

In [90]:
df3['postcode'] = df3['name'].apply(postcode)

In [102]:
def bond(text):
    if(text is None):
        return None
    if('Bond' in text):
        amount = re.findall(r'Bond \$\d+',text)
        if (len(amount) == 1):
            amount = float(amount[0].split(' ')[1][1:])
            return amount
   
    else:
        return None


In [103]:
df3['bond'] = df3['external_information'].apply(bond)
df3

Unnamed: 0,link,name,cost_text,coordinates,rooms,external_information,desc,bedroom,bathroom,parking,weekly_rent,postcode,bond
0,https://www.domain.com.au/upstairs-2c-staley-s...,Upstairs 2C Staley Street Brunswick VIC 3056,"$35,000 Annually","[-37.7655919, 144.9633048]","[[0 Beds], [0 Baths], [2 Parking]]",Date Available: Available NowInternal area 337m²,1st floor offices/studios\nEasy walking distan...,0,0,2,673.076923,3056,
1,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,"$6,000","[-37.8650177, 144.9746821]","[[3 Beds], [3 Baths], [3 Parking]]",Date Available: Available NowBond $36000,Inspired by the interplay of timeless design a...,3,3,3,6000.000000,3182,36000.0
2,https://www.domain.com.au/9-lansdowne-street-b...,9 Lansdowne Street Blairgowrie VIC 3942,"$5,000 per week","[-38.372703, 144.7856897]","[[3 Beds], [2 Baths], [3 Parking], []]","Available from Friday, 23rd December 2022Bond ...",Phone enquiry code for this property : 2751,3,2,3,5000.000000,3942,2500.0
3,https://www.domain.com.au/52-black-st-brighton...,52 Black St Brighton VIC 3186,From $5500 per week,"[-37.9159452, 144.9989003]","[[3 Beds], [4 Baths], [3 Parking], []]","Available from Tuesday, 1st November 2022Bond ...","class=""css-dxogle"">* Unverified feature<svg a...",3,4,3,5500.000000,3186,20000.0
4,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,$4500 Per Week,"[-37.8141725, 144.9740049]","[[3 Beds], [2 Baths], [4 Parking]]",Date Available: Available NowBond $27000,Possibly the most prestigious CBD address in t...,3,2,4,4500.000000,3000,27000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,https://www.domain.com.au/1-bluff-rd-merricks-...,1 Bluff Rd Merricks Beach VIC 3926,$795,"[-38.3997481, 145.1048468]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $3454,"class=""css-dxogle"">* Unverified feature<svg a...",3,2,2,,3926,3454.0
990,https://www.domain.com.au/1-11-bay-street-park...,1/11 Bay Street Parkdale VIC 3195,$795,"[-38.00308, 145.0810804]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $3454,"class=""css-dxogle"">* Unverified feature<svg a...",3,2,2,,3195,3454.0
991,https://www.domain.com.au/2-sutton-grove-richm...,2 Sutton Grove Richmond VIC 3121,$795 per week,"[-37.8225969, 144.990798]","[[2 Beds], [2 Baths], [1 Parking]]","Available from Friday, 2nd September 2022Bond ...",Angela Clark,2,2,1,,3121,3454.0
992,https://www.domain.com.au/2-261-williams-road-...,2/261 Williams Road South Yarra VIC 3141,$795 per week,"[-37.8420314, 145.004217]","[[2 Beds], [1 Bath], [1 Parking]]",Date Available: Available NowBond $3454,"Like nothing you've ever encountered, this tho...",2,1,1,,3141,3454.0


In [107]:
df3.to_csv("cleaned.csv")