In [7]:
"""
A very simple and basic web scraping script. Feel free to
use this as a source of inspiration, but, make sure to attribute
it if you do so.

This is by no means production code.
"""
# built-in imports
import re
import requests
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking

#https://www.domain.com.au/rent/melbourne-region-vic/?sort=price-desc&page={1}


headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# begin code
url_links = []
property_metadata = defaultdict(dict)

# generate list of urls to visit
for page in N_PAGES:

    #the full url 
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=price-desc&page={page}"

    #parse into a BeautifulSoup object
    bs_object = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")

    # find the unordered list (ul) elements which are the results, then
    # find all href (a) tags that are from the base_url website.
    index_links = bs_object \
        .find(
            "ul",
            {"data-testid": "results"}
        ) \
        .findAll(
            "a",
            href=re.compile(f"{BASE_URL}/*") # the `*` denotes wildcard any
        )

    for link in index_links:
        # if its a property address, add it to the list
        if 'address' in link['class']:
            url_links.append(link['href'])


# for each url, scrape some basic metadata
for property_url in url_links[1:]:

    bs_object = BeautifulSoup(requests.get(property_url, headers=headers).text, "html.parser")

    # looks for the header class to get property name
    property_metadata[property_url]['name'] = bs_object \
        .find("h1", {"class": "css-164r41r"}) \
        .text

    # looks for the div containing a summary title for cost
    property_metadata[property_url]['cost_text'] = bs_object \
        .find("div", {"data-testid": "listing-details__summary-title"}) \
        .text

    # extract coordinates from the hyperlink provided
    # i'll let you figure out what this does :P
    property_metadata[property_url]['coordinates'] = [
        float(coord) for coord in re.findall(
            r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
            bs_object \
                .find(
                    "a",
                    {"target": "_blank", 'rel': "noopener noreferer"}
                ) \
                .attrs['href']
        )[0].split(',')
    ]

    
    #Keith's note: DEBUGGING INDEX ERROR: try to ignore the IndexError, see if any other error pops up
    rooms_list = []
    for feature in bs_object.find("div", {"data-testid": "property-features"}).findAll("span", {"data-testid": "property-features-text-container"}):
        try:
            rooms_list.append(re.findall(r'\d\s[A-Za-z]+', feature.text)[0])
        except IndexError:
            pass

    property_metadata[property_url]['rooms'] = rooms_list
    
    #Keith's note: below is the code that was actually given but replaced by above
    # property_metadata[property_url]['rooms'] = [
    #     re.findall(r'\d\s[A-Za-z]+', feature.text)[0] for feature in bs_object \
    #         .find("div", {"data-testid": "property-features"}) \
    #         .findAll("span", {"data-testid": "property-features-text-container"})
    # ]

    property_metadata[property_url]['desc'] = re \
        .sub(r'<br\/>', '\n', str(bs_object.find("p"))) \
        .strip('</p>')

# output to example json in data/raw/
with open('../data/raw/example.json', 'w') as f:
    dump(property_metadata, f)




In [28]:
import pandas as pd
example_data_pd = pd.read_json('../data/raw/example.json').T

In [29]:
example_data_pd

Unnamed: 0,name,cost_text,coordinates,rooms,desc
https://www.domain.com.au/667-glenhuntly-road-caulfield-vic-3162-11598047,667 Glenhuntly Road Caulfield VIC 3162,"$38,000 p.a. Incl. Outgoings + GST","[-37.8860233, 145.0173065]",[],Can you hear it? Opportunity knocking!!
https://www.domain.com.au/upstairs-2c-staley-street-brunswick-vic-3056-13168913,Upstairs 2C Staley Street Brunswick VIC 3056,"$35,000 Annually","[-37.7655919, 144.9633048]","[0 Beds, 0 Baths, 2 Parking]",1st floor offices/studios\nEasy walking distan...
https://www.domain.com.au/level-3-302-13-15-lake-street-caroline-springs-vic-3023-15994395,"Level 3, 302/13-15 Lake Street Caroline Spring...",4125000 pw,"[-37.7316459, 144.7446886]",[],- Centrally located with tranquil views.\n- Le...
https://www.domain.com.au/3502-14-16-the-esplanade-st-kilda-vic-3182-16002767,3502/14-16 The Esplanade St Kilda VIC 3182,"$6,000","[-37.8650177, 144.9746821]","[3 Beds, 3 Baths, 3 Parking]",Inspired by the interplay of timeless design a...
https://www.domain.com.au/9-lansdowne-street-blairgowrie-vic-3942-12127675,9 Lansdowne Street Blairgowrie VIC 3942,"$5,000 per week","[-38.372703, 144.7856897]","[3 Beds, 2 Baths, 3 Parking]",Phone enquiry code for this property : 2751
...,...,...,...,...,...
https://www.domain.com.au/12-45-marine-parade-st-kilda-vic-3182-16059741,12/45 Marine Parade St Kilda VIC 3182,$790.00,"[-37.874449, 144.977025]","[2 Beds, 2 Baths, 2 Parking]",Louisa Corke
https://www.domain.com.au/6-5-woorigoleen-rd-toorak-vic-3142-16066950,6/5 Woorigoleen Rd Toorak VIC 3142,$790.00,"[-37.8429886, 145.0169363]","[3 Beds, 2 Baths, 2 Parking]","class=""css-dxogle"">* Unverified feature<svg a..."
https://www.domain.com.au/1a-1b-101-103-main-road-west-st-albans-vic-3021-10621148,1A & 1B/101-103 Main Road West St Albans VIC 3021,From $786 - $1572 per week,"[-37.756201, 144.811242]","[1 Bed, 1 Bath, 9 Parking]",Total floor area 878m2.\nNett office area 780m...
https://www.domain.com.au/3-132-nicholson-street-brunswick-east-vic-3057-16064390,3/132 Nicholson Street Brunswick East VIC 3057,$785.00,"[-37.7670198, 144.980567]","[3 Beds, 2 Baths, 2 Parking]",An opportunity exists to lease what is arguabl...
