In [24]:
## METHOD 1: convert dictionary to spark dataframe and append to initialized sdf
# built-in imports
import re
from json import dump, load
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd  
import os
# Import Spark modules
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Domain Scraper") \
    .getOrCreate()

#### create a spark data frame

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2)  # Update this to your liking

#Scrape suburb from the address
def extract_suburb(address: str) -> str:
    """Extract the suburb name from the property address."""
    match = re.search(r'(?<=, )\w+', address)
    if match:
        return match.group(0)
    return "Unknown"


def start_scrape() -> None:
    """ Function that scrapes https://www.domain.com.au and outputs the data into a json file

    Parameters:
    None

    Returns:
    None
    """
    schema = StructType([
    StructField("url", StringType(), True),
    StructField("name", StringType(), True),
    StructField("cost_text", StringType(), True),
    StructField("rooms", StringType(), True),
    StructField("desc", StringType(), True),
    StructField("parking", StringType(), True),
    StructField("street", StringType(), True),
    StructField("suburb", StringType(), True),
    StructField("postcode", StringType(), True),
    StructField("propertyType", StringType(), True),
    StructField("school", StringType(), True),
    StructField("features", StringType(), True),
    ])

    # begin code
    url_links = []
    property_metadata = defaultdict(dict)
    sdf = spark.createDataFrame([],schema)
    

    # generate list of urls to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

        # find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.
        index_links = bs_object.find("ul", {"data-testid": "results"}).findAll(
            "a", href=re.compile(f"{BASE_URL}/*")  # the `*` denotes wildcard any
        )

        for link in index_links:
            # if it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # for each url, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
            total_count += 1

            property_page = urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"}))
            property_soup = BeautifulSoup(property_page, "lxml")

            # looks for the header class to get property name
            property_metadata[property_url]['name'] = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

            # looks for the div containing a summary title for cost
            property_metadata[property_url]['cost_text'] = bs_object.find(
                "div", {"data-testid": "listing-details__summary-title"}
            ).text.strip()


            # get rooms and parking
            rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                "span", {"data-testid": "property-features-text-container"}
            )

            # rooms
            property_metadata[property_url]['rooms'] = ", ".join(
                [re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Bed' in feature.text or 'Bath' in feature.text]
            )

            # parking
            property_metadata[property_url]['parking'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Parking' in feature.text]
            )

            # desc
            property_metadata[property_url]['desc'] = ", ".join(
                [re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'desc' in feature.text]
            )
            
            # listingID:
            property_metadata[property_url]['listingid'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'listingId' in feature.text]
            )

            # street:
            property_metadata[property_url]['street'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'street' in feature.text]
            )

            # suburb:
            property_metadata[property_url]['suburb'] = extract_suburb(property_metadata[property_url]['name'])

            
            # postcode:
            property_metadata[property_url]['postcode'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'postcode' in feature.text]
            )

            # property type:
            property_metadata[property_url]['propertyType'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'apartment' in feature.text 
                 or 'unit' in feature.text or 'house' in feature.text or 'flat' in feature.text]
            )

            # schools:
            property_metadata[property_url]['school'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'school' in feature.text]
            )

            # features:
            property_metadata[property_url]['features'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'feature' in feature.text]
            )

            # loanfinder:
            property_metadata[property_url]['loan'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'loan' in feature.text]
            )

            # listingSummary:
            property_metadata[property_url]['listingsummary'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'summary' in feature.text]
            )

            # suburb insights:
            property_metadata[property_url]['suburbInsights'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'suburbInsights' in feature.text]
            )

            # property description
            property_metadata[property_url]['desc'] = bs_object.find("p").text.strip() if bs_object.find("p") else "N/A"


            # Scrape property description
            property_metadata[property_url]['desc'] = re.sub(r'<br\/>', '\n', str(property_soup.find("p"))).strip('</p>')
           
            """
            # Write each row to the CSV
            writer.writerow([
                property_url,
                property_metadata[property_url]['name'],
                property_metadata[property_url]['cost_text'],
                property_metadata[property_url]['rooms'],
                property_metadata[property_url]['parking'],
                property_metadata[property_url]['desc'],
                property_metadata[property_url]['listingid'],
                property_metadata[property_url]['street'],
                property_metadata[property_url]['suburb'],
                property_metadata[property_url]['postcode'],
                property_metadata[property_url]['propertyType'],
                property_metadata[property_url]['school'],
                property_metadata[property_url]['features'],
                property_metadata[property_url]['loan'],
                property_metadata[property_url]['listingsummary'],
                property_metadata[property_url]['suburbInsights']
            ])
            """
            success_count += 1
            temp_sdf = spark.createDataFrame(property_metadata)
            sdf.union(temp_sdf)

        except AttributeError:
            print(f"Issue with {property_url}")

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

        # output to example json in data/raw/
    with open('../data/raw/example.json', 'w') as f:
        dump(property_metadata, f)

def convert_to_parquet(filepath: str, output_path: str) -> None:
    """ Function converts a json file into a parquet file

    Parameters:
    filepath (str): the filepath that locates our json data

    output_path (str): the filepath that we will place our new parquet file into

    Returns:
    None
    """
    with open(filepath) as f:
        data = load(f)

    new_data = change_json_format(data)

    # conversion from json -> dataframe -> parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    delete_json_file(filepath)

# function that changes the formatting of the json file
def change_json_format(data: dict) -> dict:
    """ Function grabs the renames the json keys to the words after the last backslash in the url and adds the url as an item

    Parameters:
    data (dict): json dictionary we are changing

    Returns:
    dict: our new json dictionary
    
    """
    new_data = {}
    for i in data.keys():
        new_name = i.rsplit('/', 1)[-1]
        new_data[new_name] = data[i]
        new_data[new_name]["href"] = i
    return new_data

def delete_json_file(filepath: str) -> None:
    """ Function deletes the json file we are converting from

    Parameters:
    filepath (string): filepath to the json file we are deleting

    Returns:
    None
    """
    try:
        os.remove(filepath)
        print(f"File '{filepath}' deleted successfully")
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")

24/09/11 11:24:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [1]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

from pyarrow import json
import pyarrow.parquet as pq

your 131072x1 screen size is bogus. expect trouble
24/09/12 10:55:12 WARN Utils: Your hostname, DESKTOP-RBVA59Q resolves to a loopback address: 127.0.1.1; using 172.26.88.196 instead (on interface eth0)
24/09/12 10:55:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/12 10:55:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [1]:
def convert_to_parquet(filepath: str, output_path: str) -> None:
    """ Function converts a JSON file into a parquet file """
    with open(filepath) as f:
        data = load(f)

    new_data = change_json_format(data)

    # Conversion from JSON -> DataFrame -> Parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    delete_json_file(filepath)

def change_json_format(data: dict) -> dict:
    """ Function renames JSON keys and adds the URL as an item """
    new_data = {}
    for i in data.keys():
        new_name = i.rsplit('/', 1)[-1]
        new_data[new_name] = data[i]
        new_data[new_name]["href"] = i
    return new_data

def delete_json_file(filepath: str) -> None:
    """ Function deletes the JSON file """
    try:
        os.remove(filepath)
        print(f"File '{filepath}' deleted successfully")
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")
    
def get_chunks(suburbs_df) -> dict:
    """function that splits up postcodes into chunks of 50 so that if we are kicked halfway during scraping we don't lose too much progress
    """
    i = 3050
    j = 3000
    chunk_dict = {}
    while i < 4000:
        temp = suburbs_df[suburbs_df['postcode'] >= j]
        chunk_dict['chunk_{}'.format(i)] = temp[temp['postcode'] < i]
        i += 50
        j += 50
    return chunk_dict

1. Run cell above
2. Run cell below 
3. Run cell below the cell below
4. Run property_metadata.write

In [2]:
# Working METHOD
import re
from json import dump
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import pandas as pd

# Initialize Spark session
spark = SparkSession.builder.master('local[*]') \
    .config("spark.driver.memory", "15g") \
    .config("spark.executor.memory", "16g") \
    .appName("PropertyScraper") \
    .getOrCreate()

# Constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 50)  # Max number of pages you want to scrape

# Load suburbs CSV
suburbs_df = pd.read_csv('postcodes.csv')  # Ensure this CSV contains 'suburb' and 'postcode' columns
chunk_dict = get_chunks(suburbs_df)

def start_scrape(chunk, file_suffix):
    """Function that scrapes https://www.domain.com.au and outputs the data into a JSON file
    
    parameters:
    chunk: chunk of 50 postcodes we will scrape
    file_suffix: what we want to title the end of our files when we write to json
    """

    # Define schema for the Spark DataFrame
    schema = StructType([
        StructField("url", StringType(), True),
        StructField("postcode", StringType(), True),
        StructField("suburb", StringType(), True),
        StructField("name", StringType(), True),
        StructField("cost_text", StringType(), True),
        StructField("beds", StringType(), True),  # Separate field for beds
        StructField("baths", StringType(), True),  # Separate field for baths
        StructField("parking", StringType(), True),  # Parking field
        StructField("property_type", StringType(), True),  # Property type field
    ])


    # Initialize an empty DataFrame with the schema
    property_metadata = spark.createDataFrame([], schema)

    # Loop through each suburb and its postcode
    for index, row in chunk.iterrows():
        suburb = row['locality'].lower().replace(' ', '-')  # Convert to lowercase and hyphenate
        postcode = row['postcode']

        print(f"Scraping data for {suburb} ({postcode})")

        url_links = []
        page_found = False  # This flag will help us track whether any results are found

        # Generate list of URLs to visit
        for page in N_PAGES:
            url = BASE_URL + f"/rent/{suburb}-vic-{postcode}/?ssubs=0&sort=suburb-asc&page={page}"
            try:
                bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

                # Check if the page has results or shows "No results found"
                no_results = bs_object.find(text=re.compile("No results found", re.I))
                if no_results:
                    print(f"No results found for {suburb} on page {page}. Stopping further scraping for this suburb.")
                    break  # Exit the pagination loop if no results are found

                # Find property links
                index_links = bs_object.find("ul", {"data-testid": "results"})
                if not index_links:
                    print(f"No more results for {suburb} on page {page}.")
                    break  # Exit pagination if no results list is found (end of pages)

                index_links = index_links.findAll("a", href=re.compile(f"{BASE_URL}/*"))
                page_found = True  # At least one result was found on this page

                for link in index_links:
                    # If it's a property address, add it to the list
                    if 'address' in link.get('class', []):
                        url_links.append(link['href'])

            except Exception as e:
                print(f"Error fetching {url}: {e}")
                break  # Stop if there's an issue with fetching the page

        if not page_found:
            print(f"No results for {suburb}. Moving to the next suburb.")
            continue  # Skip to the next suburb if no pages were found for this one

        # For each URL, scrape some basic metadata
        pbar = tqdm(url_links)
        success_count, total_count = 0, 0

        for property_url in pbar:
            try:
                bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
                total_count += 1

                # Get property name
                name = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

                # Get cost text
                cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text.strip()

                # Get rooms (beds and baths)
                rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                    "span", {"data-testid": "property-features-text-container"}
                )

                # Initialize variables
                beds, baths, parking = None, None, '0'  # Default value for parking is '0 Car'

                for feature in rooms:
                    text = feature.text
                    if 'Bed' in text:
                        beds_match = re.findall(r'\d+', text)
                        if beds_match:
                            beds = beds_match[0]  # Extract the number of beds
                    elif 'Bath' in text:
                        baths_match = re.findall(r'\d+', text)
                        if baths_match:
                            baths = baths_match[0]  # Extract the number of baths
                    elif 'Car' in text or 'Parking' in text:
                        parking_match = re.findall(r'\d+', text)
                        if parking_match:
                            parking = parking_match[0]  # Extract the number of parking spaces

                property_type_container = bs_object.find("div", {"data-testid": "listing-summary-property-type"})
                property_type = property_type_container.get_text(strip=True)

                # Create a row and append it to the DataFrame
                row = [(property_url, postcode, suburb, name, cost_text, beds, baths, parking, property_type)]
                row_df = spark.createDataFrame(row, schema)
                property_metadata = property_metadata.union(row_df)
                success_count += 1

            except AttributeError:
                print(f"Error scraping {property_url}: missing data")

            pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

        # Show the DataFrame to ensure data is being appended
        #property_metadata.show()

    # Output to parquet file
    #try:
     #   property_metadata.write.mode("overwrite").json('../data/raw/work_{}.json'.format(file_suffix))
      #  print(f"Data successfully written")
    #except Exception as e:
     #   print(f"An error occured: {e}")

    #added this print statement so that the cell output can be scrollable - it's getting annoying to click the scroll bar >:(
    print("chunk finished")
    return property_metadata



your 131072x1 screen size is bogus. expect trouble
24/09/13 10:56:06 WARN Utils: Your hostname, DESKTOP-RBVA59Q resolves to a loopback address: 127.0.1.1; using 172.26.88.196 instead (on interface eth0)
24/09/13 10:56:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/13 10:56:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# start scraping by chunks of 50
#for i in chunk_dict:
 #   start_scrape(chunk_dict[i], i.split("_")[1])
property_metadata = start_scrape(chunk_dict['chunk_3050'], '3050')

Scraping data for melbourne (3000)


  no_results = bs_object.find(text=re.compile("No results found", re.I))


No more results for melbourne on page 45.


100% successful: 100%|██████████| 874/874 [12:43<00:00,  1.14it/s]


Scraping data for melbourne (3001)
Error fetching https://www.domain.com.au/rent/melbourne-vic-3001/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for melbourne. Moving to the next suburb.
Scraping data for east-melbourne (3002)
No more results for east-melbourne on page 3.


100% successful: 100%|██████████| 37/37 [00:31<00:00,  1.18it/s]


Scraping data for west-melbourne (3003)
No more results for west-melbourne on page 6.


100% successful: 100%|██████████| 100/100 [01:26<00:00,  1.15it/s]


Scraping data for melbourne (3004)
No more results for melbourne on page 6.


100% successful: 100%|██████████| 94/94 [01:25<00:00,  1.10it/s]


Scraping data for st-kilda-road-central (3004)
Error fetching https://www.domain.com.au/rent/st-kilda-road-central-vic-3004/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for st-kilda-road-central. Moving to the next suburb.
Scraping data for st-kilda-road-melbourne (3004)
Error fetching https://www.domain.com.au/rent/st-kilda-road-melbourne-vic-3004/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for st-kilda-road-melbourne. Moving to the next suburb.
Scraping data for world-trade-centre (3005)
Error fetching https://www.domain.com.au/rent/world-trade-centre-vic-3005/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for world-trade-centre. Moving to the next suburb.
Scraping data for south-wharf (3006)
No more results for south-wharf on page 1.
No results for south-wharf. Moving to the next suburb.
Scraping data for southbank (3006)
No more results for southbank on page 21.


100% successful: 100%|██████████| 398/398 [05:32<00:00,  1.20it/s]


Scraping data for docklands (3008)
No more results for docklands on page 11.


100% successful: 100%|██████████| 199/199 [02:41<00:00,  1.23it/s]


Scraping data for university-of-melbourne (3010)
Error fetching https://www.domain.com.au/rent/university-of-melbourne-vic-3010/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for university-of-melbourne. Moving to the next suburb.
Scraping data for footscray (3011)
No more results for footscray on page 9.


100% successful: 100%|██████████| 143/143 [01:59<00:00,  1.20it/s]


Scraping data for seddon (3011)
No more results for seddon on page 2.


100% successful: 100%|██████████| 15/15 [00:16<00:00,  1.07s/it]


Scraping data for seddon-west (3011)
Error fetching https://www.domain.com.au/rent/seddon-west-vic-3011/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for seddon-west. Moving to the next suburb.
Scraping data for brooklyn (3012)
No more results for brooklyn on page 2.


100% successful: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]


Scraping data for kingsville (3012)
No more results for kingsville on page 2.


100% successful: 100%|██████████| 7/7 [00:04<00:00,  1.72it/s]


Scraping data for kingsville-west (3012)
Error fetching https://www.domain.com.au/rent/kingsville-west-vic-3012/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for kingsville-west. Moving to the next suburb.
Scraping data for maidstone (3012)
No more results for maidstone on page 3.


100% successful: 100%|██████████| 34/34 [00:26<00:00,  1.27it/s]


Scraping data for tottenham (3012)
No more results for tottenham on page 1.
No results for tottenham. Moving to the next suburb.
Scraping data for west-footscray (3012)
No more results for west-footscray on page 3.


100% successful: 100%|██████████| 34/34 [00:34<00:00,  1.01s/it]


Scraping data for yarraville (3013)
No more results for yarraville on page 4.


100% successful: 100%|██████████| 56/56 [00:49<00:00,  1.14it/s]


Scraping data for yarraville-west (3013)
Error fetching https://www.domain.com.au/rent/yarraville-west-vic-3013/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for yarraville-west. Moving to the next suburb.
Scraping data for newport (3015)
No more results for newport on page 3.


100% successful: 100%|██████████| 26/26 [00:20<00:00,  1.25it/s]


Scraping data for south-kingsville (3015)
No more results for south-kingsville on page 2.


100% successful: 100%|██████████| 11/11 [00:10<00:00,  1.00it/s]


Scraping data for spotswood (3015)
No more results for spotswood on page 2.


100% successful: 100%|██████████| 12/12 [00:10<00:00,  1.11it/s]


Scraping data for williamstown (3016)
No more results for williamstown on page 4.


100% successful: 100%|██████████| 43/43 [00:36<00:00,  1.17it/s]


Scraping data for williamstown-north (3016)
No more results for williamstown-north on page 2.


100% successful: 100%|██████████| 4/4 [00:06<00:00,  1.60s/it]


Scraping data for altona (3018)
No more results for altona on page 3.


100% successful: 100%|██████████| 26/26 [00:21<00:00,  1.21it/s]


Scraping data for seaholme (3018)
No more results for seaholme on page 2.


100% successful: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]


Scraping data for braybrook (3019)
No more results for braybrook on page 3.


100% successful: 100%|██████████| 22/22 [00:17<00:00,  1.27it/s]


Scraping data for braybrook-north (3019)
Error fetching https://www.domain.com.au/rent/braybrook-north-vic-3019/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for braybrook-north. Moving to the next suburb.
Scraping data for robinson (3019)
Error fetching https://www.domain.com.au/rent/robinson-vic-3019/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for robinson. Moving to the next suburb.
Scraping data for albion (3020)
No more results for albion on page 2.


100% successful: 100%|██████████| 20/20 [00:14<00:00,  1.37it/s]


Scraping data for glengala (3020)
Error fetching https://www.domain.com.au/rent/glengala-vic-3020/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for glengala. Moving to the next suburb.
Scraping data for sunshine (3020)
No more results for sunshine on page 2.


100% successful: 100%|██████████| 19/19 [00:16<00:00,  1.15it/s]


Scraping data for sunshine-north (3020)
No more results for sunshine-north on page 2.


100% successful: 100%|██████████| 13/13 [00:09<00:00,  1.33it/s]


Scraping data for sunshine-west (3020)
No more results for sunshine-west on page 3.


100% successful: 100%|██████████| 26/26 [00:19<00:00,  1.34it/s]


Scraping data for albanvale (3021)
No more results for albanvale on page 2.


100% successful: 100%|██████████| 3/3 [00:02<00:00,  1.34it/s]


Scraping data for kealba (3021)
No more results for kealba on page 2.


100% successful: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s]


Scraping data for kings-park (3021)
No more results for kings-park on page 2.


100% successful: 100%|██████████| 6/6 [00:04<00:00,  1.42it/s]


Scraping data for st-albans (3021)
No more results for st-albans on page 4.


100% successful: 100%|██████████| 41/41 [00:31<00:00,  1.28it/s]


Scraping data for ardeer (3022)
No more results for ardeer on page 2.


100% successful: 100%|██████████| 5/5 [00:04<00:00,  1.14it/s]


Scraping data for deer-park-east (3022)
Error fetching https://www.domain.com.au/rent/deer-park-east-vic-3022/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for deer-park-east. Moving to the next suburb.
Scraping data for burnside (3023)
No more results for burnside on page 2.


100% successful: 100%|██████████| 3/3 [00:02<00:00,  1.48it/s]


Scraping data for burnside-heights (3023)
No more results for burnside-heights on page 2.


100% successful: 100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


Scraping data for cairnlea (3023)
No more results for cairnlea on page 2.


100% successful: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]


Scraping data for caroline-springs (3023)
No more results for caroline-springs on page 3.


100% successful: 100%|██████████| 24/24 [00:19<00:00,  1.20it/s]


Scraping data for deer-park (3023)
No more results for deer-park on page 2.


100% successful: 100%|██████████| 10/10 [00:07<00:00,  1.32it/s]


Scraping data for deer-park-north (3023)
Error fetching https://www.domain.com.au/rent/deer-park-north-vic-3023/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for deer-park-north. Moving to the next suburb.
Scraping data for ravenhall (3023)
No more results for ravenhall on page 1.
No results for ravenhall. Moving to the next suburb.
Scraping data for fieldstone (3024)
No more results for fieldstone on page 1.
No results for fieldstone. Moving to the next suburb.
Scraping data for mambourin (3024)
No more results for mambourin on page 3.


100% successful: 100%|██████████| 28/28 [00:23<00:00,  1.21it/s]


Scraping data for manor-lakes (3024)
No more results for manor-lakes on page 5.


100% successful: 100%|██████████| 67/67 [00:55<00:00,  1.20it/s]


Scraping data for mount-cottrell (3024)
No more results for mount-cottrell on page 2.


100% successful: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it]


Scraping data for wyndham-vale (3024)
No more results for wyndham-vale on page 7.


100% successful: 100%|██████████| 101/101 [01:20<00:00,  1.26it/s]


Scraping data for altona-east (3025)
No more results for altona-east on page 1.
No results for altona-east. Moving to the next suburb.
Scraping data for altona-gate (3025)
Error fetching https://www.domain.com.au/rent/altona-gate-vic-3025/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for altona-gate. Moving to the next suburb.
Scraping data for altona-north (3025)
No more results for altona-north on page 3.


100% successful: 100%|██████████| 35/35 [00:29<00:00,  1.21it/s]


Scraping data for derrimut (3026)
No more results for derrimut on page 2.


100% successful: 100%|██████████| 8/8 [00:07<00:00,  1.10it/s]


Scraping data for laverton-north (3026)
No more results for laverton-north on page 1.
No results for laverton-north. Moving to the next suburb.
Scraping data for laverton-raaf (3027)
Error fetching https://www.domain.com.au/rent/laverton-raaf-vic-3027/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for laverton-raaf. Moving to the next suburb.
Scraping data for williams-landing (3027)
No more results for williams-landing on page 3.


100% successful: 100%|██████████| 27/27 [00:26<00:00,  1.03it/s]


Scraping data for williams-raaf (3027)
Error fetching https://www.domain.com.au/rent/williams-raaf-vic-3027/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for williams-raaf. Moving to the next suburb.
Scraping data for altona-meadows (3028)
No more results for altona-meadows on page 2.


100% successful: 100%|██████████| 17/17 [00:24<00:00,  1.46s/it]


Scraping data for laverton (3028)
No more results for laverton on page 2.


100% successful: 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]


Scraping data for seabrook (3028)
No more results for seabrook on page 2.


100% successful: 100%|██████████| 6/6 [00:03<00:00,  1.52it/s]


Scraping data for hoppers-crossing (3029)
No more results for hoppers-crossing on page 5.


100% successful: 100%|██████████| 64/64 [00:53<00:00,  1.20it/s]


Scraping data for tarneit (3029)
No more results for tarneit on page 11.


100% successful: 100%|██████████| 188/188 [02:37<00:00,  1.19it/s]


Scraping data for truganina (3029)
No more results for truganina on page 10.


100% successful: 100%|██████████| 165/165 [02:29<00:00,  1.10it/s]


Scraping data for chartwell (3030)
Error fetching https://www.domain.com.au/rent/chartwell-vic-3030/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for chartwell. Moving to the next suburb.
Scraping data for cocoroc (3030)
No more results for cocoroc on page 1.
No results for cocoroc. Moving to the next suburb.
Scraping data for point-cook (3030)
No more results for point-cook on page 8.


100% successful: 100%|██████████| 139/139 [01:51<00:00,  1.25it/s]


Scraping data for quandong (3030)
No more results for quandong on page 1.
No results for quandong. Moving to the next suburb.
Scraping data for werribee (3030)
No more results for werribee on page 7.


100% successful: 100%|██████████| 112/112 [01:37<00:00,  1.14it/s]


Scraping data for werribee-south (3030)
No more results for werribee-south on page 2.


100% successful: 100%|██████████| 12/12 [00:09<00:00,  1.31it/s]


Scraping data for flemington (3031)
No more results for flemington on page 3.


100% successful: 100%|██████████| 28/28 [00:27<00:00,  1.02it/s]


Scraping data for kensington (3031)
No more results for kensington on page 4.


100% successful: 100%|██████████| 43/43 [00:37<00:00,  1.14it/s]


Scraping data for ascot-vale (3032)
No more results for ascot-vale on page 3.


100% successful: 100%|██████████| 37/37 [00:34<00:00,  1.08it/s]


Scraping data for highpoint-city (3032)
Error fetching https://www.domain.com.au/rent/highpoint-city-vic-3032/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for highpoint-city. Moving to the next suburb.
Scraping data for maribyrnong (3032)
No more results for maribyrnong on page 4.


100% successful: 100%|██████████| 49/49 [00:50<00:00,  1.03s/it]


Scraping data for travancore (3032)
No more results for travancore on page 2.


100% successful: 100%|██████████| 14/14 [00:12<00:00,  1.16it/s]


Scraping data for keilor-east (3033)
No more results for keilor-east on page 3.


100% successful: 100%|██████████| 24/24 [00:19<00:00,  1.22it/s]


Scraping data for avondale-heights (3034)
No more results for avondale-heights on page 3.


100% successful: 100%|██████████| 29/29 [00:27<00:00,  1.06it/s]


Scraping data for keilor (3036)
No more results for keilor on page 2.


100% successful: 100%|██████████| 3/3 [00:02<00:00,  1.05it/s]


Scraping data for keilor-north (3036)
No more results for keilor-north on page 1.
No results for keilor-north. Moving to the next suburb.
Scraping data for calder-park (3037)
No more results for calder-park on page 1.
No results for calder-park. Moving to the next suburb.
Scraping data for delahey (3037)
No more results for delahey on page 2.


100% successful: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]


Scraping data for hillside (3037)
No more results for hillside on page 2.


100% successful: 100%|██████████| 9/9 [00:08<00:00,  1.09it/s]


Scraping data for plumpton (3037)
Error fetching https://www.domain.com.au/rent/plumpton-vic-3037/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for plumpton. Moving to the next suburb.
Scraping data for sydenham (3037)
No more results for sydenham on page 2.


100% successful: 100%|██████████| 11/11 [00:09<00:00,  1.19it/s]


Scraping data for taylors-hill (3037)
No more results for taylors-hill on page 2.


100% successful: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


Scraping data for keilor-downs (3038)
No more results for keilor-downs on page 2.


100% successful: 100%|██████████| 8/8 [00:06<00:00,  1.27it/s]


Scraping data for keilor-lodge (3038)
No more results for keilor-lodge on page 2.


100% successful: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]


Scraping data for taylors-lakes (3038)
No more results for taylors-lakes on page 2.


100% successful: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


Scraping data for watergardens (3038)
Error fetching https://www.domain.com.au/rent/watergardens-vic-3038/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for watergardens. Moving to the next suburb.
Scraping data for moonee-ponds (3039)
No more results for moonee-ponds on page 3.


100% successful: 100%|██████████| 40/40 [00:43<00:00,  1.08s/it]


Scraping data for aberfeldie (3040)
No more results for aberfeldie on page 2.


100% successful: 100%|██████████| 9/9 [00:09<00:00,  1.03s/it]


Scraping data for essendon (3040)
No more results for essendon on page 5.


100% successful: 100%|██████████| 70/70 [01:05<00:00,  1.07it/s]


Scraping data for essendon-west (3040)
No more results for essendon-west on page 2.


100% successful: 100%|██████████| 6/6 [00:04<00:00,  1.21it/s]


Scraping data for cross-keys (3041)
Error fetching https://www.domain.com.au/rent/cross-keys-vic-3041/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for cross-keys. Moving to the next suburb.
Scraping data for essendon-fields (3041)
No more results for essendon-fields on page 1.
No results for essendon-fields. Moving to the next suburb.
Scraping data for essendon-north (3041)
No more results for essendon-north on page 2.


100% successful: 100%|██████████| 16/16 [00:17<00:00,  1.07s/it]


Scraping data for strathmore (3041)
No more results for strathmore on page 2.


100% successful: 100%|██████████| 14/14 [00:16<00:00,  1.21s/it]


Scraping data for strathmore-heights (3041)
No more results for strathmore-heights on page 2.


100% successful: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


Scraping data for airport-west (3042)
No more results for airport-west on page 3.


100% successful: 100%|██████████| 22/22 [00:27<00:00,  1.25s/it]


Scraping data for keilor-park (3042)
No more results for keilor-park on page 2.


100% successful: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]


Scraping data for niddrie (3042)
No more results for niddrie on page 2.


100% successful: 100%|██████████| 11/11 [00:12<00:00,  1.14s/it]


Scraping data for niddrie-north (3042)
Error fetching https://www.domain.com.au/rent/niddrie-north-vic-3042/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for niddrie-north. Moving to the next suburb.
Scraping data for gladstone-park (3043)
No more results for gladstone-park on page 2.


100% successful: 100%|██████████| 7/7 [00:09<00:00,  1.37s/it]


Scraping data for gowanbrae (3043)
No more results for gowanbrae on page 2.


100% successful: 100%|██████████| 2/2 [00:01<00:00,  1.24it/s]


Scraping data for tullamarine (3043)
No more results for tullamarine on page 3.


100% successful: 100%|██████████| 23/23 [00:22<00:00,  1.03it/s]


Scraping data for pascoe-vale (3044)
No more results for pascoe-vale on page 3.


100% successful: 100%|██████████| 40/40 [00:40<00:00,  1.01s/it]


Scraping data for pascoe-vale-south (3044)
No more results for pascoe-vale-south on page 2.


100% successful: 100%|██████████| 12/12 [00:12<00:00,  1.00s/it]


Scraping data for melbourne-airport (3045)
No more results for melbourne-airport on page 1.
No results for melbourne-airport. Moving to the next suburb.
Scraping data for glenroy (3046)
No more results for glenroy on page 4.


100% successful: 100%|██████████| 60/60 [01:03<00:00,  1.05s/it]


Scraping data for hadfield (3046)
No more results for hadfield on page 2.


100% successful: 100%|██████████| 7/7 [00:06<00:00,  1.13it/s]


Scraping data for oak-park (3046)
No more results for oak-park on page 2.


100% successful: 100%|██████████| 12/12 [00:10<00:00,  1.12it/s]


Scraping data for broadmeadows (3047)
No more results for broadmeadows on page 3.


100% successful: 100%|██████████| 23/23 [00:25<00:00,  1.13s/it]


Scraping data for dallas (3047)
No more results for dallas on page 2.


100% successful: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]


Scraping data for jacana (3047)
No more results for jacana on page 2.


100% successful: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]


Scraping data for coolaroo (3048)
No more results for coolaroo on page 2.


100% successful: 100%|██████████| 2/2 [00:01<00:00,  1.16it/s]


Scraping data for meadow-heights (3048)
No more results for meadow-heights on page 2.


100% successful: 100%|██████████| 19/19 [00:22<00:00,  1.17s/it]


Scraping data for attwood (3049)
No more results for attwood on page 1.
No results for attwood. Moving to the next suburb.
Scraping data for westmeadows (3049)
No more results for westmeadows on page 2.


100% successful: 100%|██████████| 6/6 [00:07<00:00,  1.30s/it]

chunk finished





In [11]:
property_metadata.show(5)

24/09/13 10:45:58 WARN DAGScheduler: Broadcasting large task binary with size 28.3 MiB
24/09/13 10:45:59 WARN DAGScheduler: Broadcasting large task binary with size 28.3 MiB
24/09/13 10:46:00 WARN DAGScheduler: Broadcasting large task binary with size 28.3 MiB
24/09/13 10:46:06 WARN DAGScheduler: Broadcasting large task binary with size 28.3 MiB
[Stage 2:>                                                          (0 + 0) / 4]

+--------------------+--------+---------+--------------------+-------------+----+-----+-------+--------------------+
|                 url|postcode|   suburb|                name|    cost_text|beds|baths|parking|       property_type|
+--------------------+--------+---------+--------------------+-------------+----+-----+-------+--------------------+
|https://www.domai...|    3000|melbourne|1503/270 King Str...|$850 Per Week|   4|    1|      1|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|3408/138 Spencer ...|      $625 pw|   1|    1|      0|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|103/300 Swanston ...|      $620 pw|   1|    1|      1|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|201/23 Queens Roa...|         $600|   2|    1|      1|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|804/225 Elizabeth...|      $570.00|   2|    1|      0|Apartment / Unit ...|
+--------------------+--------+---------+--------------------+--

                                                                                

[Stage 3:>                                                          (0 + 4) / 4]

In [4]:
property_metadata.write.mode("overwrite").parquet("../data/raw/work_3050.parquet")

24/09/13 12:02:38 WARN DAGScheduler: Broadcasting large task binary with size 24.8 MiB
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/ulizuli/real_estate_data/real estate data/project-2-group-real-estate-industry-project-22/env/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ulizuli/real_estate_data/real estate data/project-2-group-real-estate-industry-project-22/env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ulizuli/real_estate_data/real estate data/project-2-group-real-estate-industry-project-22/env/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_comman

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 55108)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/home/ulizuli/real_estate_data/real estate data/project-2-group-real-estate-industry-project-22/env/lib/python3.10/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/home/ulizuli/real_estate_data/real estate data/project-2-group-real-estate-industry-project-22/env/lib/python3.10/site-packages/pyspark/accumulators.py", lin

Py4JError: An error occurred while calling o59850.parquet

AttributeError: 'SparkSession' object has no attribute 'eventLog'

In [42]:
temp = suburbs_df[suburbs_df['postcode'] >= 3950]
chunk_dict['chunk_3997'] = temp[temp['postcode'] < 3997]

In [31]:
for i in chunk_dict[1:]:
    start_scrape(chunk_dict[i])

TypeError: 'generator' object is not subscriptable

In [4]:

schema = StructType([
        StructField("url", StringType(), True),
        StructField("postcode", StringType(), True),
        StructField("suburb", StringType(), True),
        StructField("name", StringType(), True),
        StructField("cost_text", StringType(), True),
        StructField("beds", StringType(), True),  # Separate field for beds
        StructField("baths", StringType(), True),  # Separate field for baths
        StructField("parking", StringType(), True),  # Parking field
        StructField("property_type", StringType(), True),  # Property type field
    ])
work = spark.read.schema(schema).json('../data/raw/work_3050.json')

In [7]:
work.show()

+---+--------+------+----+---------+----+-----+-------+-------------+
|url|postcode|suburb|name|cost_text|beds|baths|parking|property_type|
+---+--------+------+----+---------+----+-----+-------+-------------+
+---+--------+------+----+---------+----+-----+-------+-------------+



In [49]:
import pandas as pd
import os

# Define the folder path containing JSON files
folder_path = '../data/raw/work.json'

# List all files in the directory
json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

# Initialize an empty list to store DataFrames
dataframes = []

# Read each JSON file into a DataFrame
for file in json_files:
    file_path = os.path.join(folder_path, file)
    # Read JSON file
    df = pd.read_json(file_path, lines=True)
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(combined_df.head(12))


                                                  url  postcode   suburb  \
0   https://www.domain.com.au/34-evadene-drive-tar...      3029  tarneit   
1   https://www.domain.com.au/434-bethany-road-tar...      3029  tarneit   
2   https://www.domain.com.au/58-antonio-road-tarn...      3029  tarneit   
3   https://www.domain.com.au/12-lindeman-street-t...      3029  tarneit   
4   https://www.domain.com.au/3-imatra-loop-tarnei...      3029  tarneit   
5   https://www.domain.com.au/84-lucania-crescent-...      3029  tarneit   
6   https://www.domain.com.au/8-keeping-terrace-ta...      3029  tarneit   
7   https://www.domain.com.au/40-kamala-drive-tarn...      3029  tarneit   
8   https://www.domain.com.au/48-riland-boulevard-...      3029  tarneit   
9   https://www.domain.com.au/9-ceremony-drive-tar...      3029  tarneit   
10  https://www.domain.com.au/11-ogawa-walk-tarnei...      3029  tarneit   
11  https://www.domain.com.au/tarneit-vic-3029-151...      3029  tarneit   

           