In [7]:
"""
A very simple and basic web scraping script. Feel free to
use this as a source of inspiration, but make sure to attribute
it if you do so.

This is by no means production code.
"""
# built-in imports
import re
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

# configuration shared by the scraping cells
BASE_URL = "https://www.domain.com.au"

# result-page numbers to crawl; the stop value is exclusive, so
# range(1, 2) visits page 1 only -- widen it to crawl more pages
N_PAGES = range(1, 2)

# request headers carrying a browser-like User-Agent string
headers = {
    "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36",
}

In [8]:
# begin code
# Collect the listing URLs found on each search-result page.
url_links = []
property_metadata = defaultdict(dict)

# Only keep anchors whose href starts with BASE_URL. The URL is escaped so
# its dots match literally instead of acting as regex wildcards, and the
# pattern is compiled once rather than on every page.
listing_href = re.compile("^" + re.escape(BASE_URL) + "/")

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/melbourne-region-vic/?sort=dateupdated-desc&page={page}" # a single page

    # A timeout (seconds) keeps a stalled connection from hanging the cell;
    # network failures and HTTP error statuses skip the page with a message.
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"skipping page {page}: {error}")
        continue

    bs_object = BeautifulSoup(response.text, "html.parser") # makes bs object

    # find the unordered list (ul) element which holds the results;
    # find() returns None when the element is absent, so check before chaining
    results_list = bs_object.find("ul", {"data-testid": "results"})
    if results_list is None:
        print(f"skipping page {page}: no results list found")
        continue

    # find all anchor (a) tags whose href is on the BASE_URL website
    index_links = results_list.find_all("a", href=listing_href)

    for link in index_links:
        # if it's a property address, add it to the list; .get() is used
        # because an anchor without a class attribute would raise KeyError
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

print(url_links[0:5])

['https://www.domain.com.au/109-1-turner-street-abbotsford-vic-3067-16068516', 'https://www.domain.com.au/4-133-glenroy-road-glenroy-vic-3046-16068512', 'https://www.domain.com.au/52-brunswick-crescent-craigieburn-vic-3064-16068510', 'https://www.domain.com.au/7-187-gillies-street-fairfield-vic-3078-16068508', 'https://www.domain.com.au/2-1417-pascoe-vale-road-meadow-heights-vic-3048-16068500']


In [9]:
# for each url, scrape some basic metadata
#
# Fields written per listing: 'name', 'cost', 'coordinates', one key per
# feature label (e.g. the label that follows the count in "1 Bed"), and 'desc'.
# An element that is missing from the page yields None (0 for 'cost')
# instead of aborting the whole loop with an AttributeError/IndexError.

# Patterns are compiled once, outside the loop.
# Cost: two or more digits with at most one '.' or ',' between them. The
# separator is an explicit character class rather than a wildcard dot, so a
# price followed by another number ("450 4 weeks") is not glued into "450 4".
cost_finder = re.compile(r'[0-9]+[.,]?[0-9]+')
# Coordinates: the value of the destination= parameter in the map hyperlink
coordinate_finder = re.compile(r'destination=([-\s,\d\.]+)') # use regex101.com here if you need to

for property_url in url_links[0:5]:
    # A timeout (seconds) keeps a stalled connection from hanging the cell;
    # listings that fail to download are skipped with a message.
    try:
        response = requests.get(property_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"skipping {property_url}: {error}")
        continue

    bs_object = BeautifulSoup(response.text, "html.parser")
    metadata = property_metadata[property_url]

    # looks for the header class to get property name
    name_tag = bs_object.find("h1", {"class": "css-164r41r"})
    metadata['name'] = name_tag.text if name_tag is not None else None

    # looks for the div containing a summary title for cost, then extracts
    # the cost from it (may need preprocessing); 0 means no cost was found
    cost_tag = bs_object.find("div", {"data-testid": "listing-details__summary-title"})
    cost = cost_finder.search(cost_tag.text) if cost_tag is not None else None
    metadata['cost'] = cost.group() if cost is not None else 0

    # extract coordinates from the hyperlink provided
    # finds latitude and longitude from integrated Google Map
    coordinates = None
    map_link = bs_object.find("a", {"target": "_blank", 'rel': "noopener noreferer"})
    if map_link is not None:
        destination = coordinate_finder.search(map_link.attrs.get('href', ''))
        if destination is not None:
            try:
                coordinates = [float(coord) for coord in destination.group(1).split(',')]
            except ValueError:
                # captured text was not a clean comma-separated number list
                coordinates = None
    metadata['coordinates'] = coordinates

    # extracts # of bedrooms, # of baths and # of parking spots
    features = bs_object.find("div", {"data-testid": "property-features"})
    rooms_info = [] if features is None else features.find_all(
        "span", {"data-testid": "property-features-text-container"}
    )
    for room in rooms_info:
        # text is split on single spaces: first token is the value, second the label
        attr_desc = str(room.text).split(' ')
        if len(attr_desc) < 2:
            continue  # no label to use as a key
        metadata[attr_desc[1]] = attr_desc[0]

    # extracts description from the site -> perhaps can use bag of words to find words that link to more expensive rental units
    desc_tag = bs_object.find("p")
    if desc_tag is None:
        metadata['desc'] = None
    else:
        desc = re.sub(r'<br\/>', '\n', str(desc_tag))
        # Remove only the enclosing <p ...> and </p> tags. str.strip('</p>')
        # treats its argument as a set of characters and would also eat
        # leading/trailing 'p', '/', '<' and '>' from the description itself.
        desc = re.sub(r'^<p[^>]*>', '', desc)
        desc = re.sub(r'</p>$', '', desc)
        metadata['desc'] = desc

    

In [10]:
# show everything scraped so far; entries are keyed by listing URL
property_metadata

defaultdict(dict,
            {'https://www.domain.com.au/109-1-turner-street-abbotsford-vic-3067-16068516': {'name': '109/1 Turner Street Abbotsford VIC 3067',
              'cost': '420.00',
              'coordinates': [-37.7992725, 144.995338],
              'Bed': '1',
              'Bath': '1',
              'Parking': '−',
              'desc': 'All Abbotsford has to offer \nThis building enjoys the wonderful natural light and great location with parklands, shops/cafes and public transport in close proximity. The spacious apartment offers a large bedroom with built in robes, separate bathroom with double shower, European laundry, open plan living and kitchen area with electric cooking appliances. Additional features also include private'},
             'https://www.domain.com.au/4-133-glenroy-road-glenroy-vic-3046-16068512': {'name': '4/133 Glenroy Road Glenroy VIC 3046',
              'cost': '400',
              'coordinates': [-37.706251, 144.924806],
              'Beds': '2

In [5]:
# persist the scraped metadata as JSON under data/raw/
output_path = '../data/raw/example.json'
with open(output_path, mode='w') as json_file:
    dump(property_metadata, json_file)

