## Scrape rental listing data from the Domain website (domain.com.au)

In [1]:
# built-in imports
import re
from json import dump

from collections import defaultdict

# user packages
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

## Scrape data sorted by highest price

In [2]:
# Collect the URLs of individual rental listings from the paginated search
# results (sorted by highest price first) into `url_links`.

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking
REQUEST_TIMEOUT = 30 # seconds; without a timeout a stalled request blocks forever

# Browser-like User-Agent sent with every request. Defined once, outside the
# loop, so it is not rebuilt per page and still exists when N_PAGES is empty
# (the detail-scraping cell reuses `headers`).
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# begin code
url_links = []
property_metadata = defaultdict(dict)

# Anchors are kept when their href contains BASE_URL. The URL is escaped so
# that its dots match literally instead of acting as regex "any character".
base_url_pattern = re.compile(re.escape(BASE_URL))

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/vic/?sort=price-desc&page={page}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    bs_object = BeautifulSoup(response.text, "html.parser")

    # find the unordered list (ul) element which holds the results
    results = bs_object.find("ul", {"data-testid": "results"})
    if results is None:
        # No results list on this page (`find` returned None); skip the page
        # instead of failing with an AttributeError and losing prior pages.
        print(f"No results list found on page {page}; skipping")
        continue

    # find all anchor (a) tags whose href points at the base_url website
    for link in results.find_all("a", href=base_url_pattern):
        # if its a property address, add it to the list
        # (`get` tolerates anchors that carry no class attribute at all)
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

In [3]:
# for each url, scrape some basic metadata and store it in
# `property_metadata`, then write everything to a JSON file.
# NOTE(review): the slice below skips the first collected URL -- confirm that
# this is intentional.
for property_url in url_links[1:]:
    # Build the record locally and store it only once every required field
    # was extracted, so a page with an unexpected layout neither aborts the
    # whole run (losing everything scraped so far) nor leaves a half-filled
    # entry in the output.
    try:
        response = requests.get(property_url, headers=headers, timeout=30)
        bs_object = BeautifulSoup(response.text, "html.parser")

        record = {}

        # looks for the header class to get property name
        record['name'] = bs_object \
            .find("h1", {"class": "css-164r41r"}) \
            .text

        # looks for the div containing a summary title for cost
        record['cost_text'] = bs_object \
            .find("div", {"data-testid": "listing-details__summary-title"}) \
            .text

        # looks for the div containing the number of bed, bathroom and parking area
        record['features'] = bs_object \
            .find("div", {"data-testid": "property-features-wrapper"}) \
            .text

        # looks for the div containing the type of property
        record['type'] = bs_object \
            .find("div", {"data-testid": "listing-summary-property-type"}) \
            .text

        # extract coordinates from the hyperlink provided
        directions_href = bs_object \
            .find(
                "a",
                {"target": "_blank", 'rel': "noopener noreferer"}
            ) \
            .attrs['href']
        record['coordinates'] = [
            float(coord) for coord in re.findall(
                r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
                directions_href
            )[0].split(',')
        ]
    except (requests.RequestException, AttributeError, KeyError, IndexError, ValueError) as err:
        # RequestException: the page could not be fetched (including timeout)
        # AttributeError: an expected element was missing (`find` gave None)
        # KeyError / IndexError / ValueError: the link had no parsable coordinates
        print(f"Skipping {property_url}: {err!r}")
        continue

    # (disabled) per-room feature extraction, kept for reference
    #record['rooms'] = [
    #    re.findall(r'\d\s[A-Za-z]+', feature.text)[0] for feature in bs_object \
    #        .find("div", {"data-testid": "property-features"}) \
    #        .findAll("span", {"data-testid": "property-features-text-container"})
    #]

    # Description: inner HTML of the first <p>, with <br/> turned into newlines.
    # The wrapping tag is removed with an anchored regex; `str.strip('</p >')`
    # would strip a *set of characters* and eat leading/trailing letters such
    # as the final "p" of the text itself.
    desc_tag = bs_object.find("p")
    if desc_tag is None:
        # no paragraph on the page: store an empty description rather than
        # the literal string "None"
        record['desc'] = ''
    else:
        desc_html = re.sub(r'^<p\b[^>]*>|</p>$', '', str(desc_tag))
        record['desc'] = re.sub(r'<br\/>', '\n', desc_html).strip(' ')

    property_metadata[property_url] = record

# output to example json in data/raw/
with open('../data/raw/highest_price.json', 'w') as f:
    dump(property_metadata, f)

## Scrape data sorted by Feature

In [4]:
# Collect the URLs of individual rental listings from the paginated search
# results (using the site's `default-desc` sort order) into `url_links`.

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking
REQUEST_TIMEOUT = 30 # seconds; without a timeout a stalled request blocks forever

# Browser-like User-Agent sent with every request. Defined once, outside the
# loop, so it is not rebuilt per page and still exists when N_PAGES is empty
# (the detail-scraping cell reuses `headers`).
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# begin code
url_links = []
property_metadata = defaultdict(dict)

# Anchors are kept when their href contains BASE_URL. The URL is escaped so
# that its dots match literally instead of acting as regex "any character".
base_url_pattern = re.compile(re.escape(BASE_URL))

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/vic/?sort=default-desc&page={page}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    bs_object = BeautifulSoup(response.text, "html.parser")

    # find the unordered list (ul) element which holds the results
    results = bs_object.find("ul", {"data-testid": "results"})
    if results is None:
        # No results list on this page (`find` returned None); skip the page
        # instead of failing with an AttributeError and losing prior pages.
        print(f"No results list found on page {page}; skipping")
        continue

    # find all anchor (a) tags whose href points at the base_url website
    for link in results.find_all("a", href=base_url_pattern):
        # if its a property address, add it to the list
        # (`get` tolerates anchors that carry no class attribute at all)
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

In [5]:
# for each url, scrape some basic metadata and store it in
# `property_metadata`, then write everything to a JSON file.
# NOTE(review): the slice below skips the first collected URL -- confirm that
# this is intentional.
for property_url in url_links[1:]:
    # Build the record locally and store it only once every required field
    # was extracted, so a page with an unexpected layout neither aborts the
    # whole run (losing everything scraped so far) nor leaves a half-filled
    # entry in the output.
    try:
        response = requests.get(property_url, headers=headers, timeout=30)
        bs_object = BeautifulSoup(response.text, "html.parser")

        record = {}

        # looks for the header class to get property name
        record['name'] = bs_object \
            .find("h1", {"class": "css-164r41r"}) \
            .text

        # looks for the div containing a summary title for cost
        record['cost_text'] = bs_object \
            .find("div", {"data-testid": "listing-details__summary-title"}) \
            .text

        # looks for the div containing the number of bed, bathroom and parking area
        record['features'] = bs_object \
            .find("div", {"data-testid": "property-features-wrapper"}) \
            .text

        # looks for the div containing the type of property
        record['type'] = bs_object \
            .find("div", {"data-testid": "listing-summary-property-type"}) \
            .text

        # extract coordinates from the hyperlink provided
        directions_href = bs_object \
            .find(
                "a",
                {"target": "_blank", 'rel': "noopener noreferer"}
            ) \
            .attrs['href']
        record['coordinates'] = [
            float(coord) for coord in re.findall(
                r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
                directions_href
            )[0].split(',')
        ]
    except (requests.RequestException, AttributeError, KeyError, IndexError, ValueError) as err:
        # RequestException: the page could not be fetched (including timeout)
        # AttributeError: an expected element was missing (`find` gave None)
        # KeyError / IndexError / ValueError: the link had no parsable coordinates
        print(f"Skipping {property_url}: {err!r}")
        continue

    # (disabled) per-room feature extraction, kept for reference
    #record['rooms'] = [
    #    re.findall(r'\d\s[A-Za-z]+', feature.text)[0] for feature in bs_object \
    #        .find("div", {"data-testid": "property-features"}) \
    #        .findAll("span", {"data-testid": "property-features-text-container"})
    #]

    # Description: inner HTML of the first <p>, with <br/> turned into newlines.
    # The wrapping tag is removed with an anchored regex; `str.strip('</p >')`
    # would strip a *set of characters* and eat leading/trailing letters such
    # as the final "p" of the text itself.
    desc_tag = bs_object.find("p")
    if desc_tag is None:
        # no paragraph on the page: store an empty description rather than
        # the literal string "None"
        record['desc'] = ''
    else:
        desc_html = re.sub(r'^<p\b[^>]*>|</p>$', '', str(desc_tag))
        record['desc'] = re.sub(r'<br\/>', '\n', desc_html).strip(' ')

    property_metadata[property_url] = record

# output to example json in data/raw/
with open('../data/raw/feature.json', 'w') as f:
    dump(property_metadata, f)

## Scrape data sorted by Lowest price

In [6]:
# Collect the URLs of individual rental listings from the paginated search
# results (sorted by lowest price first) into `url_links`.

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking
REQUEST_TIMEOUT = 30 # seconds; without a timeout a stalled request blocks forever

# Browser-like User-Agent sent with every request. Defined once, outside the
# loop, so it is not rebuilt per page and still exists when N_PAGES is empty
# (the detail-scraping cell reuses `headers`).
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# begin code
url_links = []
property_metadata = defaultdict(dict)

# Anchors are kept when their href contains BASE_URL. The URL is escaped so
# that its dots match literally instead of acting as regex "any character".
base_url_pattern = re.compile(re.escape(BASE_URL))

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/vic/?sort=price-asc&page={page}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    bs_object = BeautifulSoup(response.text, "html.parser")

    # find the unordered list (ul) element which holds the results
    results = bs_object.find("ul", {"data-testid": "results"})
    if results is None:
        # No results list on this page (`find` returned None); skip the page
        # instead of failing with an AttributeError and losing prior pages.
        print(f"No results list found on page {page}; skipping")
        continue

    # find all anchor (a) tags whose href points at the base_url website
    for link in results.find_all("a", href=base_url_pattern):
        # if its a property address, add it to the list
        # (`get` tolerates anchors that carry no class attribute at all)
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

In [7]:
# for each url, scrape some basic metadata and store it in
# `property_metadata`, then write everything to a JSON file.
# NOTE(review): the slice below skips the first collected URL -- confirm that
# this is intentional.
for property_url in url_links[1:]:
    # Build the record locally and store it only once every required field
    # was extracted, so a page with an unexpected layout neither aborts the
    # whole run (losing everything scraped so far) nor leaves a half-filled
    # entry in the output.
    try:
        response = requests.get(property_url, headers=headers, timeout=30)
        bs_object = BeautifulSoup(response.text, "html.parser")

        record = {}

        # looks for the header class to get property name
        record['name'] = bs_object \
            .find("h1", {"class": "css-164r41r"}) \
            .text

        # looks for the div containing a summary title for cost
        record['cost_text'] = bs_object \
            .find("div", {"data-testid": "listing-details__summary-title"}) \
            .text

        # looks for the div containing the number of bed, bathroom and parking area
        record['features'] = bs_object \
            .find("div", {"data-testid": "property-features-wrapper"}) \
            .text

        # looks for the div containing the type of property
        record['type'] = bs_object \
            .find("div", {"data-testid": "listing-summary-property-type"}) \
            .text

        # extract coordinates from the hyperlink provided
        directions_href = bs_object \
            .find(
                "a",
                {"target": "_blank", 'rel': "noopener noreferer"}
            ) \
            .attrs['href']
        record['coordinates'] = [
            float(coord) for coord in re.findall(
                r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
                directions_href
            )[0].split(',')
        ]
    except (requests.RequestException, AttributeError, KeyError, IndexError, ValueError) as err:
        # RequestException: the page could not be fetched (including timeout)
        # AttributeError: an expected element was missing (`find` gave None)
        # KeyError / IndexError / ValueError: the link had no parsable coordinates
        print(f"Skipping {property_url}: {err!r}")
        continue

    # (disabled) per-room feature extraction, kept for reference
    #record['rooms'] = [
    #    re.findall(r'\d\s[A-Za-z]+', feature.text)[0] for feature in bs_object \
    #        .find("div", {"data-testid": "property-features"}) \
    #        .findAll("span", {"data-testid": "property-features-text-container"})
    #]

    # Description: inner HTML of the first <p>, with <br/> turned into newlines.
    # The wrapping tag is removed with an anchored regex; `str.strip('</p >')`
    # would strip a *set of characters* and eat leading/trailing letters such
    # as the final "p" of the text itself.
    desc_tag = bs_object.find("p")
    if desc_tag is None:
        # no paragraph on the page: store an empty description rather than
        # the literal string "None"
        record['desc'] = ''
    else:
        desc_html = re.sub(r'^<p\b[^>]*>|</p>$', '', str(desc_tag))
        record['desc'] = re.sub(r'<br\/>', '\n', desc_html).strip(' ')

    property_metadata[property_url] = record

# output to example json in data/raw/
with open('../data/raw/lowest_prices.json', 'w') as f:
    dump(property_metadata, f)

## Scrape data sorted by Newest

In [8]:
# Collect the URLs of individual rental listings from the paginated search
# results (sorted by most recently updated first) into `url_links`.

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking
REQUEST_TIMEOUT = 30 # seconds; without a timeout a stalled request blocks forever

# Browser-like User-Agent sent with every request. Defined once, outside the
# loop, so it is not rebuilt per page and still exists when N_PAGES is empty
# (the detail-scraping cell reuses `headers`).
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# begin code
url_links = []
property_metadata = defaultdict(dict)

# Anchors are kept when their href contains BASE_URL. The URL is escaped so
# that its dots match literally instead of acting as regex "any character".
base_url_pattern = re.compile(re.escape(BASE_URL))

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/vic/?sort=dateupdated-desc&page={page}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    bs_object = BeautifulSoup(response.text, "html.parser")

    # find the unordered list (ul) element which holds the results
    results = bs_object.find("ul", {"data-testid": "results"})
    if results is None:
        # No results list on this page (`find` returned None); skip the page
        # instead of failing with an AttributeError and losing prior pages.
        print(f"No results list found on page {page}; skipping")
        continue

    # find all anchor (a) tags whose href points at the base_url website
    for link in results.find_all("a", href=base_url_pattern):
        # if its a property address, add it to the list
        # (`get` tolerates anchors that carry no class attribute at all)
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

In [9]:
# for each url, scrape some basic metadata and store it in
# `property_metadata`, then write everything to a JSON file.
# NOTE(review): the slice below skips the first collected URL -- confirm that
# this is intentional.
for property_url in url_links[1:]:
    # Build the record locally and store it only once every required field
    # was extracted, so a page with an unexpected layout neither aborts the
    # whole run (losing everything scraped so far) nor leaves a half-filled
    # entry in the output.
    try:
        response = requests.get(property_url, headers=headers, timeout=30)
        bs_object = BeautifulSoup(response.text, "html.parser")

        record = {}

        # looks for the header class to get property name
        record['name'] = bs_object \
            .find("h1", {"class": "css-164r41r"}) \
            .text

        # looks for the div containing a summary title for cost
        record['cost_text'] = bs_object \
            .find("div", {"data-testid": "listing-details__summary-title"}) \
            .text

        # looks for the div containing the number of bed, bathroom and parking area
        record['features'] = bs_object \
            .find("div", {"data-testid": "property-features-wrapper"}) \
            .text

        # looks for the div containing the type of property
        record['type'] = bs_object \
            .find("div", {"data-testid": "listing-summary-property-type"}) \
            .text

        # extract coordinates from the hyperlink provided
        directions_href = bs_object \
            .find(
                "a",
                {"target": "_blank", 'rel': "noopener noreferer"}
            ) \
            .attrs['href']
        record['coordinates'] = [
            float(coord) for coord in re.findall(
                r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
                directions_href
            )[0].split(',')
        ]
    except (requests.RequestException, AttributeError, KeyError, IndexError, ValueError) as err:
        # RequestException: the page could not be fetched (including timeout)
        # AttributeError: an expected element was missing (`find` gave None)
        # KeyError / IndexError / ValueError: the link had no parsable coordinates
        print(f"Skipping {property_url}: {err!r}")
        continue

    # (disabled) per-room feature extraction, kept for reference
    #record['rooms'] = [
    #    re.findall(r'\d\s[A-Za-z]+', feature.text)[0] for feature in bs_object \
    #        .find("div", {"data-testid": "property-features"}) \
    #        .findAll("span", {"data-testid": "property-features-text-container"})
    #]

    # Description: inner HTML of the first <p>, with <br/> turned into newlines.
    # The wrapping tag is removed with an anchored regex; `str.strip('</p >')`
    # would strip a *set of characters* and eat leading/trailing letters such
    # as the final "p" of the text itself.
    desc_tag = bs_object.find("p")
    if desc_tag is None:
        # no paragraph on the page: store an empty description rather than
        # the literal string "None"
        record['desc'] = ''
    else:
        desc_html = re.sub(r'^<p\b[^>]*>|</p>$', '', str(desc_tag))
        record['desc'] = re.sub(r'<br\/>', '\n', desc_html).strip(' ')

    property_metadata[property_url] = record

# output to example json in data/raw/
with open('../data/raw/newest.json', 'w') as f:
    dump(property_metadata, f)

## Scrape data sorted by Suburb

In [10]:
# Collect the URLs of individual rental listings from the paginated search
# results (sorted by suburb, ascending) into `url_links`.

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 51) # update this to your liking
REQUEST_TIMEOUT = 30 # seconds; without a timeout a stalled request blocks forever

# Browser-like User-Agent sent with every request. Defined once, outside the
# loop, so it is not rebuilt per page and still exists when N_PAGES is empty
# (the detail-scraping cell reuses `headers`).
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}

# begin code
url_links = []
property_metadata = defaultdict(dict)

# Anchors are kept when their href contains BASE_URL. The URL is escaped so
# that its dots match literally instead of acting as regex "any character".
base_url_pattern = re.compile(re.escape(BASE_URL))

# generate list of urls to visit
for page in N_PAGES:
    url = BASE_URL + f"/rent/vic/?sort=suburb-asc&page={page}"
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    bs_object = BeautifulSoup(response.text, "html.parser")

    # find the unordered list (ul) element which holds the results
    results = bs_object.find("ul", {"data-testid": "results"})
    if results is None:
        # No results list on this page (`find` returned None); skip the page
        # instead of failing with an AttributeError and losing prior pages.
        print(f"No results list found on page {page}; skipping")
        continue

    # find all anchor (a) tags whose href points at the base_url website
    for link in results.find_all("a", href=base_url_pattern):
        # if its a property address, add it to the list
        # (`get` tolerates anchors that carry no class attribute at all)
        if 'address' in link.get('class', []):
            url_links.append(link['href'])

In [11]:
# for each url, scrape some basic metadata and store it in
# `property_metadata`, then write everything to a JSON file.
# NOTE(review): the slice below skips the first collected URL -- confirm that
# this is intentional.
for property_url in url_links[1:]:
    # Build the record locally and store it only once every required field
    # was extracted, so a page with an unexpected layout neither aborts the
    # whole run (losing everything scraped so far) nor leaves a half-filled
    # entry in the output.
    try:
        response = requests.get(property_url, headers=headers, timeout=30)
        bs_object = BeautifulSoup(response.text, "html.parser")

        record = {}

        # looks for the header class to get property name
        record['name'] = bs_object \
            .find("h1", {"class": "css-164r41r"}) \
            .text

        # looks for the div containing a summary title for cost
        record['cost_text'] = bs_object \
            .find("div", {"data-testid": "listing-details__summary-title"}) \
            .text

        # looks for the div containing the number of bed, bathroom and parking area
        record['features'] = bs_object \
            .find("div", {"data-testid": "property-features-wrapper"}) \
            .text

        # looks for the div containing the type of property
        record['type'] = bs_object \
            .find("div", {"data-testid": "listing-summary-property-type"}) \
            .text

        # extract coordinates from the hyperlink provided
        directions_href = bs_object \
            .find(
                "a",
                {"target": "_blank", 'rel': "noopener noreferer"}
            ) \
            .attrs['href']
        record['coordinates'] = [
            float(coord) for coord in re.findall(
                r'destination=([-\s,\d\.]+)', # use regex101.com here if you need to
                directions_href
            )[0].split(',')
        ]
    except (requests.RequestException, AttributeError, KeyError, IndexError, ValueError) as err:
        # RequestException: the page could not be fetched (including timeout)
        # AttributeError: an expected element was missing (`find` gave None)
        # KeyError / IndexError / ValueError: the link had no parsable coordinates
        print(f"Skipping {property_url}: {err!r}")
        continue

    # (disabled) per-room feature extraction, kept for reference
    #record['rooms'] = [
    #    re.findall(r'\d\s[A-Za-z]+', feature.text)[0] for feature in bs_object \
    #        .find("div", {"data-testid": "property-features"}) \
    #        .findAll("span", {"data-testid": "property-features-text-container"})
    #]

    # Description: inner HTML of the first <p>, with <br/> turned into newlines.
    # The wrapping tag is removed with an anchored regex; `str.strip('</p >')`
    # would strip a *set of characters* and eat leading/trailing letters such
    # as the final "p" of the text itself.
    desc_tag = bs_object.find("p")
    if desc_tag is None:
        # no paragraph on the page: store an empty description rather than
        # the literal string "None"
        record['desc'] = ''
    else:
        desc_html = re.sub(r'^<p\b[^>]*>|</p>$', '', str(desc_tag))
        record['desc'] = re.sub(r'<br\/>', '\n', desc_html).strip(' ')

    property_metadata[property_url] = record

# output to example json in data/raw/
with open('../data/raw/suburb.json', 'w') as f:
    dump(property_metadata, f)