In [6]:
## METHOD 1: convert dictionary to spark dataframe and append to initialized sdf
# built-in imports
import re
from json import dump, load
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd  
import os
# Import Spark modules
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Initialize Spark session. getOrCreate() hands back the already-running
# session when one exists (see the WARN line in this cell's output).
spark = SparkSession.builder \
    .appName("Domain Scraper") \
    .getOrCreate()

# constants
# Root of the site being scraped; results-page URLs are built from it.
BASE_URL = "https://www.domain.com.au"
# Results pages to visit; range(1, 2) yields only page 1.
N_PAGES = range(1, 2)  # Update this to your liking

#Scrape suburb from the address
def extract_suburb(address: str) -> str:
    """Extract the suburb name from a property address.

    Addresses scraped from the listing header look like
    ``"60 Domain Street, South Yarra VIC 3141"``. The suburb is the text
    between a comma and the state abbreviation, and may span several words.

    Parameters:
    address (str): full address text of a listing

    Returns:
    str: the suburb, or "Unknown" when no suburb can be located
    """
    if not address:
        return "Unknown"

    # Preferred form: ", <suburb words> <STATE>". The lazy quantifier stops at
    # the first state token, so multi-word suburbs are kept whole.
    match = re.search(
        r",\s*([A-Za-z][A-Za-z'\- ]*?)\s+(?:VIC|NSW|QLD|SA|WA|TAS|NT|ACT)\b",
        address,
    )
    if match:
        return match.group(1).strip()

    # Fallback for addresses without a state token: first word after ", ".
    match = re.search(r'(?<=, )\w+', address)
    if match:
        return match.group(0)
    return "Unknown"


def start_scrape():
    """Scrape rental listings from https://www.domain.com.au.

    Visits every results page in ``N_PAGES``, follows each link whose class
    list contains ``address``, and extracts a handful of text fields per
    listing. The collected metadata is written to ``../data/raw/example.json``
    (keyed by listing URL) and is also assembled into a Spark DataFrame.

    Only listings that were parsed completely are kept; a listing that fails
    part-way is reported and skipped rather than stored half-filled.

    Parameters:
    None

    Returns:
    pyspark.sql.DataFrame: one string-typed row per successfully scraped
    listing. Earlier revisions returned None; callers that ignore the return
    value are unaffected.
    """
    # Single source of truth for the per-listing fields, in column order.
    record_fields = (
        "name", "cost_text", "rooms", "parking", "street", "suburb",
        "postcode", "propertyType", "school", "features",
    )
    schema = StructType(
        [StructField(column, StringType(), True) for column in ("url",) + record_fields]
    )
    headers = {'User-Agent': "PostmanRuntime/7.6.0"}

    def matching_features(spans, pattern, keywords):
        """Join the first `pattern` match of every span whose text contains a keyword."""
        return ", ".join(
            re.findall(pattern, span.text)[0]
            for span in spans
            if any(keyword in span.text for keyword in keywords)
        )

    url_links = []
    property_metadata = {}
    rows = []

    # generate list of urls to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers=headers)), "lxml")

        # find() returns None when the results list is absent; skip the page
        # instead of failing on the attribute access below.
        results = bs_object.find("ul", {"data-testid": "results"})
        if results is None:
            print(f"No results list found on {url}")
            continue

        # In a regex `/*` means "zero or more slashes" (it is not a glob), so
        # this keeps any href that contains BASE_URL.
        index_links = results.find_all("a", href=re.compile(f"{BASE_URL}/*"))

        for link in index_links:
            # if it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # for each url, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        # Counted up-front so the percentage below can never divide by zero.
        total_count += 1
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers=headers)), "lxml")

            record = {
                # header element holding the property name / address
                'name': bs_object.find("h1", {"class": "css-164r41r"}).text.strip(),
                # summary title holding the advertised cost
                'cost_text': bs_object.find(
                    "div", {"data-testid": "listing-details__summary-title"}
                ).text.strip(),
            }

            feature_spans = bs_object.find("div", {"data-testid": "property-features"}).find_all(
                "span", {"data-testid": "property-features-text-container"}
            )

            record['rooms'] = matching_features(feature_spans, r'\d+\s[A-Za-z]+', ('Bed', 'Bath'))
            record['parking'] = matching_features(feature_spans, r'\S+\s[A-Za-z]+', ('Parking',))
            # NOTE(review): the fields below are looked up in the same
            # property-features spans as beds/baths/parking; presumably those
            # spans never mention them, leaving these columns empty -- confirm
            # against the page markup and point them at the right elements.
            record['street'] = matching_features(feature_spans, r'\S+\s[A-Za-z]+', ('street',))
            record['suburb'] = extract_suburb(record['name'])
            record['postcode'] = matching_features(feature_spans, r'\S+\s[A-Za-z]+', ('postcode',))
            record['propertyType'] = matching_features(
                feature_spans, r'\S+\s[A-Za-z]+', ('apartment', 'unit', 'house', 'flat')
            )
            record['school'] = matching_features(feature_spans, r'\S+\s[A-Za-z]+', ('school',))
            record['features'] = matching_features(feature_spans, r'\S+\s[A-Za-z]+', ('feature',))
        except (AttributeError, IndexError):
            # AttributeError: an expected element was missing (find() -> None).
            # IndexError: a span mentioned a keyword but did not match the regex.
            print(f"Issue with {property_url}")
        else:
            property_metadata[property_url] = record
            rows.append((property_url, *(record[field] for field in record_fields)))
            success_count += 1

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

    # output to example json in data/raw/
    with open('../data/raw/example.json', 'w') as f:
        dump(property_metadata, f)

    # Build the DataFrame once from all rows: DataFrame.union() returns a new
    # frame rather than appending in place, and createDataFrame() needs a list
    # of rows (not the URL-keyed dict) together with the schema.
    return spark.createDataFrame(rows, schema)

def convert_to_parquet(filepath: str, output_path: str) -> None:
    """Turn a scraped JSON file into a parquet file, then remove the JSON.

    Parameters:
    filepath (str): location of the JSON file to read (deleted afterwards)

    output_path (str): location the parquet file is written to

    Returns:
    None
    """
    with open(filepath) as json_file:
        raw_records = load(json_file)

    # json -> re-keyed dict -> dataframe -> parquet
    frame = pd.DataFrame(change_json_format(raw_records))
    frame.to_parquet(output_path, engine='pyarrow')

    # The JSON is only an intermediate artefact once the parquet exists.
    delete_json_file(filepath)

# function that changes the formatting of the json file
def change_json_format(data: dict) -> dict:
    """Re-key scraped listings by the last path segment of their URL.

    Every key of ``data`` is a listing URL. The result uses the text after the
    final forward slash of that URL as the key and stores the full URL under
    ``"href"`` next to the listing's existing fields.

    Parameters:
    data (dict): mapping of listing URL -> dict of scraped fields

    Returns:
    dict: new mapping of URL slug -> copy of the fields plus ``"href"``;
    ``data`` and its nested dicts are left unmodified
    """
    new_data = {}
    for url, fields in data.items():
        # Strip first so a trailing "/" does not collapse the key to "".
        new_name = url.rstrip('/').rsplit('/', 1)[-1]
        # Shallow copy: adding "href" must not mutate the caller's dicts.
        new_data[new_name] = {**fields, "href": url}
    return new_data

def delete_json_file(filepath: str) -> None:
    """Remove the JSON file at ``filepath`` and report the outcome on stdout.

    Failures are printed rather than raised, so the caller keeps running.

    Parameters:
    filepath (string): filepath to the json file we are deleting

    Returns:
    None
    """
    try:
        os.remove(filepath)
    except FileNotFoundError:
        outcome = f"File '{filepath}' not found"
    except PermissionError:
        outcome = f"Permission denied: '{filepath}'"
    except Exception as err:
        outcome = f"An error occurred: {err}"
    else:
        outcome = f"File '{filepath}' deleted successfully"
    print(outcome)

24/09/08 22:58:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
# NOTE: when a session is already running, getOrCreate() returns it instead of
# building a new one; the WARN in this cell's output says that only runtime
# SQL configurations take effect in that case.
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

from pyarrow import json
import pyarrow.parquet as pq

24/09/08 22:58:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [8]:
#Method 2 : create a new spark df with the values that we have just read in. append it to the global sdf
# sdf was correctly initialised. Had problems with appending to the sdf
# built-in imports
import re
import requests
import csv
from json import dump, load
from tqdm import tqdm
from collections import defaultdict
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd  
import pyarrow
import string
import os
import cchardet
import lxml

# constants (re-declared in this cell with the same values as the earlier scraper cell)
# Root of the site being scraped; results-page URLs are built from it.
BASE_URL = "https://www.domain.com.au"
# Results pages to visit; range(1, 2) yields only page 1.
N_PAGES = range(1, 2)  # Update this to your liking

from pyspark.sql.types import StructType, StructField, StringType
def start_scrape():
    """Scrape rental listings from https://www.domain.com.au into a Spark DataFrame and a JSON file.

    Visits every results page in ``N_PAGES``, follows each link whose class
    list contains ``address``, and records url, name, cost text and
    bed/bath counts for every listing that parses completely. The rows are
    shown as a Spark DataFrame and dumped to ``../data/raw/work`` as a JSON
    list of objects.

    Parameters:
    None

    Returns:
    None
    """
    # Only these four columns are populated in this variant of the scraper.
    columns = ("url", "name", "cost_text", "rooms")
    schema = StructType([StructField(column, StringType(), True) for column in columns])
    headers = {'User-Agent': "PostmanRuntime/7.6.0"}

    url_links = []
    rows = []

    # generate list of urls to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers=headers)), "lxml")

        # find() returns None when the results list is absent; skip the page
        # instead of failing on the attribute access below.
        results = bs_object.find("ul", {"data-testid": "results"})
        if results is None:
            print(f"No results list found on {url}")
            continue

        # In a regex `/*` means "zero or more slashes" (it is not a glob), so
        # this keeps any href that contains BASE_URL.
        index_links = results.find_all("a", href=re.compile(f"{BASE_URL}/*"))

        for link in index_links:
            # if it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # for each url, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        # Counted up-front so the percentage below can never divide by zero.
        total_count += 1
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers=headers)), "lxml")

            # header element holding the property name / address
            name = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

            # summary title holding the advertised cost
            cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text.strip()

            # Kept under its own name so the spans are not overwritten by the
            # joined string below.
            feature_spans = bs_object.find("div", {"data-testid": "property-features"}).find_all(
                "span", {"data-testid": "property-features-text-container"}
            )
            rooms = ", ".join(
                re.findall(r'\d+\s[A-Za-z]+', feature.text)[0]
                for feature in feature_spans
                if 'Bed' in feature.text or 'Bath' in feature.text
            )
        except (AttributeError, IndexError):
            # AttributeError: an expected element was missing (find() -> None).
            # IndexError: a span mentioned Bed/Bath but did not match the regex.
            print(f"Issue with {property_url}")
        else:
            rows.append((property_url, name, cost_text, rooms))
            success_count += 1

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

    # Build the DataFrame once: DataFrame.union() only accepts another
    # DataFrame (not a list of tuples) and returns a new frame each time.
    property_metadata = spark.createDataFrame(rows, schema)
    property_metadata.show()

    # A Spark DataFrame is not JSON-serialisable, so dump plain dicts instead.
    with open('../data/raw/work', 'w') as f:
        dump([dict(zip(columns, row)) for row in rows], f)


In [9]:
def convert_to_parquet(filepath: str, output_path: str) -> None:
    """Turn the JSON file at `filepath` into a parquet file at `output_path`, then delete the JSON."""
    with open(filepath) as json_file:
        raw_records = load(json_file)

    # JSON -> re-keyed dict -> DataFrame -> Parquet
    frame = pd.DataFrame(change_json_format(raw_records))
    frame.to_parquet(output_path, engine='pyarrow')

    # The JSON is only an intermediate artefact once the parquet exists.
    delete_json_file(filepath)

def change_json_format(data: dict) -> dict:
    """Re-key listings by the last path segment of their URL and add the URL as "href".

    Returns a new dictionary; `data` and its nested dicts are left unmodified.
    """
    new_data = {}
    for url, fields in data.items():
        # Strip first so a trailing "/" does not collapse the key to "".
        new_name = url.rstrip('/').rsplit('/', 1)[-1]
        # Shallow copy: adding "href" must not mutate the caller's dicts.
        new_data[new_name] = {**fields, "href": url}
    return new_data

def delete_json_file(filepath: str) -> None:
    """Remove the JSON file and print the outcome; failures are reported, not raised."""
    try:
        os.remove(filepath)
    except FileNotFoundError:
        outcome = f"File '{filepath}' not found"
    except PermissionError:
        outcome = f"Permission denied: '{filepath}'"
    except Exception as err:
        outcome = f"An error occurred: {err}"
    else:
        outcome = f"File '{filepath}' deleted successfully"
    print(outcome)

In [10]:
# built-in imports
import re
import requests
import csv
from json import dump
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Initialize Spark session. getOrCreate() hands back the already-running
# session when one exists (see the WARN line in this cell's output).
spark = SparkSession.builder.appName("PropertyScraper").getOrCreate()

# constants
# Root of the site being scraped; results-page URLs are built from it.
BASE_URL = "https://www.domain.com.au"
# Results pages to visit; range(1, 2) yields only page 1.
N_PAGES = range(1, 2)  # Update this to your liking

def start_scrape():
    """Scrape rental listings from https://www.domain.com.au and write them out as JSON.

    Visits every results page in ``N_PAGES``, follows each link whose class
    list contains ``address``, and records url, name, cost text and
    bed/bath counts for every listing that parses completely. The rows are
    written with Spark to ``../data/raw/work.json`` (overwriting any
    previous output).

    Returns:
    None
    """
    columns = ("url", "name", "cost_text", "rooms")
    schema = StructType([StructField(column, StringType(), True) for column in columns])
    headers = {'User-Agent': "PostmanRuntime/7.6.0"}

    url_links = []
    rows = []

    # Generate list of URLs to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers=headers)), "lxml")

        # find() returns None when the results list is absent; skip the page
        # instead of failing on the attribute access below.
        results = bs_object.find("ul", {"data-testid": "results"})
        if results is None:
            print(f"No results list found on {url}")
            continue

        # In a regex `/*` means "zero or more slashes" (it is not a glob), so
        # this keeps any href that contains BASE_URL.
        index_links = results.find_all("a", href=re.compile(f"{BASE_URL}/*"))

        for link in index_links:
            # If it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # For each URL, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        # Counted up-front so the percentage below can never divide by zero.
        total_count += 1
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers=headers)), "lxml")

            # Get property name
            name = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

            # Get cost text
            cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text.strip()

            # Get rooms (spans kept under their own name, not overwritten)
            feature_spans = bs_object.find("div", {"data-testid": "property-features"}).find_all(
                "span", {"data-testid": "property-features-text-container"}
            )
            rooms = ", ".join(
                re.findall(r'\d+\s[A-Za-z]+', feature.text)[0]
                for feature in feature_spans
                if 'Bed' in feature.text or 'Bath' in feature.text
            )
        except (AttributeError, IndexError):
            # AttributeError: an expected element was missing (find() -> None).
            # IndexError: a span mentioned Bed/Bath but did not match the regex.
            print(f"Issue with {property_url}")
        else:
            rows.append((property_url, name, cost_text, rooms))
            success_count += 1

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

    # Build the DataFrame once from all rows instead of creating and unioning
    # a one-row DataFrame per listing, which grows the query plan with every
    # listing scraped.
    property_metadata = spark.createDataFrame(rows, schema)

    # Output to JSON file
    property_metadata.write.json('../data/raw/work.json', mode='overwrite')

# Start scraping. This requests pages from the live site, so it needs network access.
start_scrape()


24/09/08 22:58:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Visiting https://www.domain.com.au/rent/?excludedeposittaken=1&state=vic&page=1


100% successful: 100%|██████████| 21/21 [00:15<00:00,  1.33it/s]
                                                                                

In [11]:
import pandas as pd
import os

# Despite the ".json" suffix this path is treated as a directory: it is listed
# below and every "*.json" file inside it is read as JSON-lines.
folder_path = '../data/raw/work.json'

# Sort the listing so the rows are combined in a reproducible order
# (os.listdir makes no ordering guarantee).
json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the directory holds no JSON files.
if not json_files:
    raise ValueError(f"No JSON files found in '{folder_path}'")

# Initialize an empty list to store DataFrames
dataframes = []

# Read each JSON file into a DataFrame
for file in json_files:
    file_path = os.path.join(folder_path, file)
    # lines=True: one JSON object per line
    df = pd.read_json(file_path, lines=True)
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(combined_df.head())


                                                 url  \
0  https://www.domain.com.au/911-915-collins-stre...   
1  https://www.domain.com.au/8-perth-avenue-albio...   
2  https://www.domain.com.au/6-137-coppin-st-rich...   
3  https://www.domain.com.au/60-domain-street-sou...   
4  https://www.domain.com.au/9-aldgate-street-fra...   

                                         name      cost_text            rooms  
0  911/915 Collins Street, Docklands VIC 3008           $850  2 Beds, 2 Baths  
1             8 Perth Avenue, Albion VIC 3020           $460   3 Beds, 1 Bath  
2          6/137 Coppin St, Richmond VIC 3121    $425 weekly    1 Bed, 1 Bath  
3      60 Domain Street, South Yarra VIC 3141      $2,800.00  4 Beds, 4 Baths  
4      9 Aldgate Street, Fraser Rise VIC 3336  $540 PER WEEK  4 Beds, 2 Baths  
