In [37]:
## METHOD 1: convert dictionary to spark dataframe and append to initialized sdf
# built-in imports
import re
import requests
import csv
from json import dump, load
from tqdm import tqdm
from collections import defaultdict
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd  
import pyarrow
import string
import os
import cchardet
import lxml

#### create a spark data frame 

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2)  # Update this to your liking

#Scrape suburb from the address
def extract_suburb(address: str) -> str:
    """Extract the suburb name from the property address."""
    match = re.search(r'(?<=, )\w+', address)
    if match:
        return match.group(0)
    return "Unknown"


def start_scrape() -> None:
    """ Function that scrapes https://www.domain.com.au and outputs the data into a json file

    Parameters:
    None

    Returns:
    None
    """
    schema = StructType([
    StructField("url", StringType(), True),
    StructField("name", StringType(), True),
    StructField("cost_text", StringType(), True),
    StructField("rooms", StringType(), True),
    StructField("parking", StringType(), True),
    StructField("street", StringType(), True),
    StructField("suburb", StringType(), True),
    StructField("postcode", StringType(), True),
    StructField("propertyType", StringType(), True),
    StructField("school", StringType(), True),
    StructField("features", StringType(), True),
    ])

    # begin code
    url_links = []
    property_metadata = defaultdict(dict)
    sdf = spark.createDataFrame([],schema)
    

    # generate list of urls to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

        # find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.
        index_links = bs_object.find("ul", {"data-testid": "results"}).findAll(
            "a", href=re.compile(f"{BASE_URL}/*")  # the `*` denotes wildcard any
        )

        for link in index_links:
            # if it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # for each url, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
            total_count += 1

            # looks for the header class to get property name
            property_metadata[property_url]['name'] = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

            # looks for the div containing a summary title for cost
            property_metadata[property_url]['cost_text'] = bs_object.find(
                "div", {"data-testid": "listing-details__summary-title"}
            ).text.strip()

            # get rooms and parking
            rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                "span", {"data-testid": "property-features-text-container"}
            )

            # rooms
            property_metadata[property_url]['rooms'] = ", ".join(
                [re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Bed' in feature.text or 'Bath' in feature.text]
            )

            # parking
            property_metadata[property_url]['parking'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Parking' in feature.text]
            )
            """
            # listingID:
            property_metadata[property_url]['listingid'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'listingId' in feature.text]
            )
            """

            # street:
            property_metadata[property_url]['street'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'street' in feature.text]
            )

            # suburb:
            property_metadata[property_url]['suburb'] = extract_suburb[property_url]['name']
            
            # postcode:
            property_metadata[property_url]['postcode'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'postcode' in feature.text]
            )

            # property type:
            property_metadata[property_url]['propertyType'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'apartment' in feature.text 
                 or 'unit' in feature.text or 'house' in feature.text or 'flat' in feature.text]
            )

            # schools:
            property_metadata[property_url]['school'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'school' in feature.text]
            )

            # features:
            property_metadata[property_url]['features'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'feature' in feature.text]
            )

           
            """
            # Write each row to the CSV
            writer.writerow([
                property_url,
                property_metadata[property_url]['name'],
                property_metadata[property_url]['cost_text'],
                property_metadata[property_url]['rooms'],
                property_metadata[property_url]['parking'],
                property_metadata[property_url]['desc'],
                property_metadata[property_url]['listingid'],
                property_metadata[property_url]['street'],
                property_metadata[property_url]['suburb'],
                property_metadata[property_url]['postcode'],
                property_metadata[property_url]['propertyType'],
                property_metadata[property_url]['school'],
                property_metadata[property_url]['features'],
                property_metadata[property_url]['loan'],
                property_metadata[property_url]['listingsummary'],
                property_metadata[property_url]['suburbInsights']
            ])
            """
            success_count += 1
            temp_sdf = spark.createDataFrame(property_metadata)
            sdf.union(temp_sdf)

        except AttributeError:
            print(f"Issue with {property_url}")

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

        # output to example json in data/raw/
    with open('../data/raw/example.json', 'w') as f:
        dump(property_metadata, f)

def convert_to_parquet(filepath: str, output_path: str) -> None:
    """ Function converts a json file into a parquet file

    Parameters:
    filepath (str): the filepath that locates our json data

    output_path (str): the filepath that we will place our new parquet file into

    Returns:
    None
    """
    with open(filepath) as f:
        data = load(f)

    new_data = change_json_format(data)

    # conversion from json -> dataframe -> parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    delete_json_file(filepath)

# function that changes the formatting of the json file
def change_json_format(data: dict) -> dict:
    """ Function grabs the renames the json keys to the words after the last backslash in the url and adds the url as an item

    Parameters:
    data (dict): json dictionary we are changing

    Returns:
    dict: our new json dictionary
    
    """
    new_data = {}
    for i in data.keys():
        new_name = i.rsplit('/', 1)[-1]
        new_data[new_name] = data[i]
        new_data[new_name]["href"] = i
    return new_data

def delete_json_file(filepath: str) -> None:
    """ Function deletes the json file we are converting from

    Parameters:
    filepath (string): filepath to the json file we are deleting

    Returns:
    None
    """
    try:
        os.remove(filepath)
        print(f"File '{filepath}' deleted successfully")
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")

In [38]:
start_scrape()

Visiting https://www.domain.com.au/rent/?excludedeposittaken=1&state=vic&page=1


  0%|          | 0/21 [00:00<?, ?it/s]


TypeError: 'function' object is not subscriptable

In [1]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

from pyarrow import json
import pyarrow.parquet as pq

your 131072x1 screen size is bogus. expect trouble
24/09/05 18:35:03 WARN Utils: Your hostname, DESKTOP-Q5SP5SI resolves to a loopback address: 127.0.1.1; using 172.20.36.110 instead (on interface eth0)
24/09/05 18:35:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/05 18:35:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [40]:
#Method 2 : create a new spark df with the values that we have just read in. append it to the global sdf
# sdf was correctly initialised. Had problems with appending to the sdf
# built-in imports
import re
import requests
import csv
from json import dump, load
from tqdm import tqdm
from collections import defaultdict
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd  
import pyarrow
import string
import os
import cchardet
import lxml

#### create a spark data frame 

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2)  # Update this to your liking

from pyspark.sql.types import StructType, StructField, StringType
def start_scrape():
    """ Function that scrapes https://www.domain.com.au and outputs the data into a json file

    Parameters:
    None

    Returns:
    None
    """

    schema = StructType([
    StructField("url", StringType(), True),
    StructField("name", StringType(), True),
    StructField("cost_text", StringType(), True),
    StructField("rooms", StringType(), True),
    StructField("parking", StringType(), True),
    StructField("street", StringType(), True),
    StructField("suburb", StringType(), True),
    StructField("postcode", StringType(), True),
    StructField("propertyType", StringType(), True),
    StructField("school", StringType(), True),
    StructField("features", StringType(), True),
    ])

    # begin code
    url_links = []
    property_metadata = spark.createDataFrame([],schema)

    # generate list of urls to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

        # find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.
        index_links = bs_object.find("ul", {"data-testid": "results"}).findAll(
            "a", href=re.compile(f"{BASE_URL}/*")  # the `*` denotes wildcard any
        )

        for link in index_links:
            # if it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # for each url, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
            total_count += 1

            # looks for the header class to get property name
            name = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()
            print(name)

            # looks for the div containing a summary title for cost
            cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text.strip()

            # # get rooms and parking
            rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                "span", {"data-testid": "property-features-text-container"}
            )

            # rooms
            rooms= ", ".join([re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Bed' in feature.text or 'Bath' in feature.text])

            # parking
            parking = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Parking' in feature.text]
            )
            """
            # listingID:
            property_metadata[property_url]['listingid'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'listingId' in feature.text]
            )
            """

            # street:
            street = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'street' in feature.text]
            )

            # suburb:
            suburb = extract_suburb[property_url]['name']
            
            # postcode:
            postcode = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'postcode' in feature.text]
            )

            # property type:
            propertyType = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'apartment' in feature.text 
                 or 'unit' in feature.text or 'house' in feature.text or 'flat' in feature.text]
            )

            # schools:
            school = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'school' in feature.text]
            )

            # features:
            features = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'feature' in feature.text]
            )

            row = [(property_url, name, cost_text, rooms,parking,street, suburb, postcode,propertyType,school, features)]
            temp_df = spark.createDataFrame(row, schema)
            temp_df.limit(1)

            success_count += 1

            property_metadata = property_metadata.union(new_row)
        except AttributeError:
            print(f"Issue with {property_url}")

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")
    property_metadata.show()


In [28]:
start_scrape()

Visiting https://www.domain.com.au/rent/?excludedeposittaken=1&state=vic&page=1


0% successful:   5%|▍         | 1/21 [00:01<00:22,  1.14s/it]

27/125 Ormond Road, Elwood VIC 3184
Issue with https://www.domain.com.au/27-125-ormond-road-elwood-vic-3184-17177089?topspot=1


0% successful:  10%|▉         | 2/21 [00:01<00:18,  1.04it/s]

1/27 Crisp Street, Hampton VIC 3188
Issue with https://www.domain.com.au/1-27-crisp-street-hampton-vic-3188-17195662


0% successful:  14%|█▍        | 3/21 [00:03<00:20,  1.12s/it]

45 Byron Street, Elwood VIC 3184
Issue with https://www.domain.com.au/45-byron-street-elwood-vic-3184-17195551


0% successful:  19%|█▉        | 4/21 [00:04<00:20,  1.18s/it]

8/50 Disraeli Street, Kew VIC 3101
Issue with https://www.domain.com.au/8-50-disraeli-street-kew-vic-3101-17195544


0% successful:  24%|██▍       | 5/21 [00:05<00:17,  1.06s/it]

26/418 St Kilda Road, Melbourne VIC 3000
Issue with https://www.domain.com.au/26-418-st-kilda-road-melbourne-vic-3000-17195504


0% successful:  29%|██▊       | 6/21 [00:06<00:15,  1.01s/it]

209/2 Queen Street, Blackburn VIC 3130
Issue with https://www.domain.com.au/209-2-queen-street-blackburn-vic-3130-17195423


0% successful:  33%|███▎      | 7/21 [00:07<00:13,  1.03it/s]

504/62 Carlisle Street, St Kilda VIC 3182
Issue with https://www.domain.com.au/504-62-carlisle-street-st-kilda-vic-3182-15715168


0% successful:  38%|███▊      | 8/21 [00:08<00:12,  1.03it/s]

1/31 Victoria Street, Box Hill VIC 3128
Issue with https://www.domain.com.au/1-31-victoria-street-box-hill-vic-3128-17195262


0% successful:  43%|████▎     | 9/21 [00:09<00:12,  1.00s/it]

1/19-21 Dalgety Street, St Kilda VIC 3182
Issue with https://www.domain.com.au/1-19-21-dalgety-street-st-kilda-vic-3182-17195208


0% successful:  48%|████▊     | 10/21 [00:10<00:10,  1.02it/s]

46 Evans Street, Port Melbourne VIC 3207
Issue with https://www.domain.com.au/46-evans-street-port-melbourne-vic-3207-17195200


0% successful:  52%|█████▏    | 11/21 [00:11<00:09,  1.03it/s]

1417/35-47 Coventry Street, Southbank VIC 3006
Issue with https://www.domain.com.au/1417-35-47-coventry-street-southbank-vic-3006-17195173


0% successful:  57%|█████▋    | 12/21 [00:12<00:08,  1.04it/s]

17/168 Power Street, Hawthorn VIC 3122
Issue with https://www.domain.com.au/17-168-power-street-hawthorn-vic-3122-17195151


0% successful:  62%|██████▏   | 13/21 [00:13<00:08,  1.11s/it]

2/10 Park Ave, Richmond VIC 3121
Issue with https://www.domain.com.au/2-10-park-ave-richmond-vic-3121-17195137


0% successful:  67%|██████▋   | 14/21 [00:14<00:08,  1.21s/it]

3002B/11 Rose Lane,, Melbourne VIC 3000
Issue with https://www.domain.com.au/3002b-11-rose-lane-melbourne-vic-3000-17195136


0% successful:  71%|███████▏  | 15/21 [00:15<00:06,  1.13s/it]

1/17 Dunoon Street, Murrumbeena VIC 3163
Issue with https://www.domain.com.au/1-17-dunoon-street-murrumbeena-vic-3163-17195125


0% successful:  76%|███████▌  | 16/21 [00:17<00:07,  1.40s/it]

106/389 Neerim Rd, Carnegie VIC 3163
Issue with https://www.domain.com.au/106-389-neerim-rd-carnegie-vic-3163-17195102


0% successful:  81%|████████  | 17/21 [00:18<00:05,  1.26s/it]

92 Clarendon Street, Keysborough VIC 3173
Issue with https://www.domain.com.au/92-clarendon-street-keysborough-vic-3173-17195094


0% successful:  86%|████████▌ | 18/21 [00:20<00:04,  1.35s/it]

2/557 Lower Dandenong Road, Dingley Village VIC 3172
Issue with https://www.domain.com.au/2-557-lower-dandenong-road-dingley-village-vic-3172-17195058


0% successful:  90%|█████████ | 19/21 [00:21<00:02,  1.26s/it]

3712/80 ABeckett Street, Melbourne VIC 3000
Issue with https://www.domain.com.au/3712-80-abeckett-street-melbourne-vic-3000-17195040


0% successful:  95%|█████████▌| 20/21 [00:25<00:02,  2.12s/it]

6/488 Barkers Road, Hawthorn VIC 3122
Issue with https://www.domain.com.au/6-488-barkers-road-hawthorn-vic-3122-17194996


0% successful: 100%|██████████| 21/21 [00:26<00:00,  1.29s/it]

3 Shady Mews, Clayton VIC 3168
Issue with https://www.domain.com.au/3-shady-mews-clayton-vic-3168-17194941



[Stage 5:>                                                          (0 + 7) / 7]

+---+----+---------+-----+-------+------+------+--------+------------+------+--------+
|url|name|cost_text|rooms|parking|street|suburb|postcode|propertyType|school|features|
+---+----+---------+-----+-------+------+------+--------+------------+------+--------+
+---+----+---------+-----+-------+------+------+--------+------------+------+--------+



                                                                                