In [375]:
import re
from json import dump
import requests

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking

# begin code
url_links = []
property_metadata = defaultdict(dict)

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=price-desc&page={page}"
    bs_object = BeautifulSoup(requests.get(
    url, headers=headers).text, "html.parser")

    # find the unordered list (ul) elements which are the results, then
    # find all href (a) tags that are from the base_url website.
    index_links = bs_object \
        .find(
            "ul",
            {"data-testid": "results"}
        ) \
        .findAll(
            "a",
            href=re.compile(f"{BASE_URL}/*") # the `*` denotes wildcard any
        )

    for link in index_links:
        # if its a property address, add it to the list
        if 'address' in link['class']:
            url_links.append(link['href'])


# for each url, scrape some basic metadata
for property_url in url_links[1:]:
    # bs_object = BeautifulSoup(urlopen(property_url), "lxml")
    bs_object = BeautifulSoup(requests.get(property_url, headers=headers).text, "html.parser")

    # looks for the header class to get property name
    property_metadata[property_url]['name'] = bs_object \
        .find("h1", {"class": "css-164r41r"}) \
        .text

    # looks for the div containing a summary title for cost
    property_metadata[property_url]['cost_text'] = bs_object \
        .find("div", {"data-testid": "listing-details__summary-title"}) \
        .text

    # extract coordinates from the hyperlink provided
    # i'll let you figure out what this does :P
    property_metadata[property_url]['coordinates'] = [
        float(coord) for coord in re.findall(
            r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
            bs_object \
                .find(
                    "a",
                    {"target": "_blank", 'rel': "noopener noreferer"}
                ) \
                .attrs['href']
        )[0].split(',')
    ]

    property_metadata[property_url]['rooms'] = [
        re.findall(r'\d\s[A-Za-z]+', feature.text) for feature in bs_object \
            .find("div", {"data-testid": "property-features"}) \
            .findAll("span", {"data-testid": "property-features-text-container"})
    ]
    
    information = bs_object.find("div", {"data-testid": "strip-content-list"})
    if information :
        property_metadata[property_url]['external_information'] = information.text
    else: 
        property_metadata[property_url]['external_information'] = None

    property_metadata[property_url]['desc'] = re \
        .sub(r'<br\/>', '\n', str(bs_object.find("p"))) \
        .strip('</p>')

# output to example json in data/raw/
with open('../data/raw/example.json', 'w') as f:
    dump(property_metadata, f)

KeyboardInterrupt: 

In [252]:
import json
import pandas as pd
  
# Opening JSON file
f = open('../data/raw/example.json')
  
# returns JSON object as 
# a dictionary
data = json.load(f)
df2 = pd.DataFrame.from_dict(data, orient="index")

In [253]:
l = []
for i in range(len(df2)):
    if(len(df2['rooms'][i]) != 0):
        l.append(True)
    else:
        l.append(False)
df3 = df2[l]

In [254]:
df3 = df3.reset_index()
df3.rename(columns={'index':'link'}, inplace = True)

In [255]:
df3

Unnamed: 0,link,name,cost_text,coordinates,rooms,external_information,desc
0,https://www.domain.com.au/upstairs-2c-staley-s...,Upstairs 2C Staley Street Brunswick VIC 3056,"$35,000 Annually","[-37.7655919, 144.9633048]","[[0 Beds], [0 Baths], [2 Parking]]",Date Available: Available NowInternal area 337m²,1st floor offices/studios\nEasy walking distan...
1,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,"$6,000","[-37.8650177, 144.9746821]","[[3 Beds], [3 Baths], [3 Parking]]",Date Available: Available NowBond $36000,Inspired by the interplay of timeless design a...
2,https://www.domain.com.au/9-lansdowne-street-b...,9 Lansdowne Street Blairgowrie VIC 3942,"$5,000 per week","[-38.372703, 144.7856897]","[[3 Beds], [2 Baths], [3 Parking], []]","Available from Friday, 23rd December 2022Bond ...",Phone enquiry code for this property : 2751
3,https://www.domain.com.au/52-black-st-brighton...,52 Black St Brighton VIC 3186,From $5500 per week,"[-37.9159452, 144.9989003]","[[3 Beds], [4 Baths], [3 Parking], []]","Available from Tuesday, 1st November 2022Bond ...","class=""css-dxogle"">* Unverified feature<svg a..."
4,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,$4500 Per Week,"[-37.8141725, 144.9740049]","[[3 Beds], [2 Baths], [4 Parking]]",Date Available: Available NowBond $27000,Possibly the most prestigious CBD address in t...
...,...,...,...,...,...,...,...
93,https://www.domain.com.au/417-250-st-kilda-roa...,417/250 St Kilda Road Southbank VIC 3006,$1800 per week,"[-37.8255907, 144.9705826]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $7821,A stunningly 3 bedroom apartment located on th...
94,https://www.domain.com.au/1-125-high-st-prahra...,1/125 High St Prahran VIC 3181,"$1,799","[-37.851373, 144.990978]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $7817,Property ID. 95717\nSend an online enquiry the...
95,https://www.domain.com.au/102-carpenter-street...,102 Carpenter Street Brighton VIC 3186,"Under Application-$1,750.00 pw","[-37.9107405, 145.0003207]","[[4 Beds], [3 Baths], [4 Parking]]",Date Available: Available NowBond $7604,"class=""css-dxogle"">* Unverified feature<svg a..."
96,https://www.domain.com.au/1307-14-queens-road-...,1307/14 Queens Road Melbourne VIC 3004,$1750 per week,"[-37.8389211, 144.9749946]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $7584,"class=""css-dxogle"">* Unverified feature<svg a..."


In [256]:
import re
bedroom_list = []
bathroom_list = []
parking_list = []
for i in range(len(df3)):
    bedroom_list.append(re.search('\d+', df3['rooms'][i][0][0]).group())
    bathroom_list.append(re.search('\d+', df3['rooms'][i][1][0]).group())
    if(len(df3['rooms'][i][2])!= 0):
        parking_list.append(re.search('\d+', df3['rooms'][i][2][0]).group())
    else:
        parking_list.append('0')
    # df3['bathroom'][i] = re.search('\d+', df3['rooms'][i][1][0]).group()
    # df3['parking'][i] = re.search('\d+', df3['rooms'][i][2][0]).group()

In [257]:
df3['bedroom'] = bedroom_list
df3['bathroom'] = bathroom_list
df3['parking'] = parking_list

In [258]:
df3

Unnamed: 0,link,name,cost_text,coordinates,rooms,external_information,desc,bedroom,bathroom,parking
0,https://www.domain.com.au/upstairs-2c-staley-s...,Upstairs 2C Staley Street Brunswick VIC 3056,"$35,000 Annually","[-37.7655919, 144.9633048]","[[0 Beds], [0 Baths], [2 Parking]]",Date Available: Available NowInternal area 337m²,1st floor offices/studios\nEasy walking distan...,0,0,2
1,https://www.domain.com.au/3502-14-16-the-espla...,3502/14-16 The Esplanade St Kilda VIC 3182,"$6,000","[-37.8650177, 144.9746821]","[[3 Beds], [3 Baths], [3 Parking]]",Date Available: Available NowBond $36000,Inspired by the interplay of timeless design a...,3,3,3
2,https://www.domain.com.au/9-lansdowne-street-b...,9 Lansdowne Street Blairgowrie VIC 3942,"$5,000 per week","[-38.372703, 144.7856897]","[[3 Beds], [2 Baths], [3 Parking], []]","Available from Friday, 23rd December 2022Bond ...",Phone enquiry code for this property : 2751,3,2,3
3,https://www.domain.com.au/52-black-st-brighton...,52 Black St Brighton VIC 3186,From $5500 per week,"[-37.9159452, 144.9989003]","[[3 Beds], [4 Baths], [3 Parking], []]","Available from Tuesday, 1st November 2022Bond ...","class=""css-dxogle"">* Unverified feature<svg a...",3,4,3
4,https://www.domain.com.au/4203-35-spring-stree...,4203/35 Spring Street Melbourne VIC 3000,$4500 Per Week,"[-37.8141725, 144.9740049]","[[3 Beds], [2 Baths], [4 Parking]]",Date Available: Available NowBond $27000,Possibly the most prestigious CBD address in t...,3,2,4
...,...,...,...,...,...,...,...,...,...,...
93,https://www.domain.com.au/417-250-st-kilda-roa...,417/250 St Kilda Road Southbank VIC 3006,$1800 per week,"[-37.8255907, 144.9705826]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $7821,A stunningly 3 bedroom apartment located on th...,3,2,2
94,https://www.domain.com.au/1-125-high-st-prahra...,1/125 High St Prahran VIC 3181,"$1,799","[-37.851373, 144.990978]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $7817,Property ID. 95717\nSend an online enquiry the...,3,2,2
95,https://www.domain.com.au/102-carpenter-street...,102 Carpenter Street Brighton VIC 3186,"Under Application-$1,750.00 pw","[-37.9107405, 145.0003207]","[[4 Beds], [3 Baths], [4 Parking]]",Date Available: Available NowBond $7604,"class=""css-dxogle"">* Unverified feature<svg a...",4,3,4
96,https://www.domain.com.au/1307-14-queens-road-...,1307/14 Queens Road Melbourne VIC 3004,$1750 per week,"[-37.8389211, 144.9749946]","[[3 Beds], [2 Baths], [2 Parking]]",Date Available: Available NowBond $7584,"class=""css-dxogle"">* Unverified feature<svg a...",3,2,2
