Jupyter notebook that scrapes domain.com.au and outputs the information into parquet files


In [18]:
def convert_to_parquet(filepath: str, output_path: str) -> None:
    """Convert a JSON file into a parquet file, then delete the JSON file.

    Parameters:
    filepath: path of the JSON file to read; its content is passed to
        change_json_format, which expects a dict keyed by URL
    output_path: path of the parquet file to write

    Return:
    None
    """
    # Imported locally: the notebook's import cell only brings in `dump` from
    # json, so a bare `load` raised NameError here.
    import json

    # JSON text is read as UTF-8 rather than the platform's default encoding.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)

    new_data = change_json_format(data)

    # Conversion from JSON -> DataFrame -> Parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    # Only reached when the parquet write did not raise, so the source JSON is
    # never removed before its replacement exists.
    delete_json_file(filepath)

def change_json_format(data: dict) -> dict:
    """Re-key the scraped records by the last path segment of their URL.

    Each key of ``data`` is treated as a URL: everything after its final '/'
    becomes the new key, and the full original key is stored in the record
    under "href" (replacing any existing "href" entry).

    Parameters:
    data: dict mapping URL -> dict of scraped fields

    Return:
    A new dict mapping the shortened key -> a copy of the record with "href"
    added. The input dict and its nested records are left unmodified. If two
    URLs share the same last segment, the later one wins.
    """
    new_data = {}
    for url, record in data.items():
        new_name = url.rsplit('/', 1)[-1]
        # Build a new dict instead of writing "href" into the caller's record
        new_data[new_name] = {**record, "href": url}
    return new_data

def delete_json_file(filepath: str) -> None:
    """Delete the JSON file at ``filepath``, reporting the outcome via print.

    Failures are reported rather than raised, so a missing or protected file
    does not interrupt the caller.

    Parameters:
    filepath: path of the file to remove

    Return:
    None
    """
    # Imported locally: `os` is not among the notebook's imports. Without it
    # the resulting NameError was swallowed by the generic handler below and
    # the file was never deleted.
    import os

    try:
        os.remove(filepath)
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        print(f"File '{filepath}' deleted successfully")
    
def get_chunks(suburbs_df, first_upper: int = 3048, stop: int = 3997, chunk_size: int = 25) -> dict:
    """Split the suburbs table into postcode ranges so that being kicked
    halfway through scraping only loses one range of progress.

    Each chunk holds the rows whose 'postcode' lies in the half-open range
    (upper - chunk_size, upper] and is stored under the key 'chunk_<upper>'.
    With the defaults the uppers are 3048, 3073, ..., 3973, i.e. ranges of 25
    postcode values each.

    Parameters:
    suburbs_df: DataFrame with a numeric 'postcode' column
    first_upper: inclusive upper postcode of the first chunk
    stop: chunks are produced while their upper postcode is below this value
    chunk_size: width of each postcode range

    Return:
    dict mapping 'chunk_<upper>' -> DataFrame slice of suburbs_df

    Raises:
    ValueError: if chunk_size is not positive
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of postcodes")

    postcodes = suburbs_df['postcode']
    chunk_dict = {}
    for upper in range(first_upper, stop, chunk_size):
        lower = upper - chunk_size  # exclusive lower bound
        chunk_dict['chunk_{}'.format(upper)] = suburbs_df[(postcodes > lower) & (postcodes <= upper)]

    return chunk_dict



In [19]:
# Working METHOD
import re
from json import dump
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import pandas as pd

# Initialize Spark session (local mode using all cores; driver/executor memory set explicitly)
spark = SparkSession.builder.master('local[*]') \
    .config("spark.driver.memory", "15g") \
    .config("spark.executor.memory", "16g") \
    .appName("PropertyScraper") \
    .getOrCreate()

# Constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 50)  # Result pages tried per suburb: 1 through 49 (range excludes 50)

# Load suburbs CSV and split it into postcode chunks.
# NOTE: get_chunks is defined in an earlier cell, which must be run first.
suburbs_df = pd.read_csv('postcodes.csv')  # start_scrape reads the 'locality' and 'postcode' columns of this CSV
chunk_dict = get_chunks(suburbs_df)

def _fetch_soup(url):
    """Download ``url`` and return it parsed by BeautifulSoup with the lxml parser.

    The HTTP response is closed as soon as it has been parsed, so connections
    are not left open across the many requests made per chunk. Any exception
    raised while fetching propagates to the caller.
    """
    request = Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})
    with urlopen(request) as response:
        return BeautifulSoup(response, "lxml")


def _collect_listing_urls(suburb, postcode):
    """Walk the paginated rental results of one suburb and gather listing URLs.

    Returns a tuple (url_links, page_found): the hrefs of links carrying the
    'address' class, and whether at least one page had a results list.
    Pagination stops at the first page without results or at the first
    fetch/parse error.
    """
    url_links = []
    page_found = False  # tracks whether any results page was found

    for page in N_PAGES:
        url = BASE_URL + f"/rent/{suburb}-vic-{postcode}/?ssubs=0&sort=suburb-asc&page={page}"
        try:
            bs_object = _fetch_soup(url)

            # Check if the page shows "No results found"
            # (`string=` replaces the deprecated `text=` keyword)
            if bs_object.find(string=re.compile("No results found", re.I)):
                print(f"No results found for {suburb} on page {page}. Stopping further scraping for this suburb.")
                break

            # Find the list holding the property links
            results_list = bs_object.find("ul", {"data-testid": "results"})
            if not results_list:
                print(f"No more results for {suburb} on page {page}.")
                break  # no results list means we ran past the last page

            page_found = True
            # Escape BASE_URL so its dots match literally
            for link in results_list.find_all("a", href=re.compile(re.escape(BASE_URL))):
                # Only links marked as a property address are listings
                if 'address' in link.get('class', []):
                    url_links.append(link['href'])

        except Exception as e:
            print(f"Error fetching {url}: {e}")
            break  # Stop if there's an issue with fetching the page

    return url_links, page_found


def _parse_property(bs_object):
    """Extract (name, cost_text, beds, baths, parking, property_type) from a listing page.

    Raises AttributeError when one of the expected page elements is missing.
    beds and baths stay None when no number is found; parking defaults to '0'.
    """
    name = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()
    cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text.strip()

    rooms = bs_object.find("div", {"data-testid": "property-features"}).find_all(
        "span", {"data-testid": "property-features-text-container"}
    )

    beds, baths, parking = None, None, '0'
    for feature in rooms:
        text = feature.text
        numbers = re.findall(r'\d+', text)
        if 'Bed' in text:
            if numbers:
                beds = numbers[0]
        elif 'Bath' in text:
            if numbers:
                baths = numbers[0]
        elif 'Car' in text or 'Parking' in text:
            if numbers:
                parking = numbers[0]

    property_type_container = bs_object.find("div", {"data-testid": "listing-summary-property-type"})
    property_type = property_type_container.get_text(strip=True)

    return name, cost_text, beds, baths, parking, property_type


def start_scrape(chunk, file_suffix):
    """Function that scrapes https://www.domain.com.au rental listings for one
    chunk of suburbs and writes the result to a parquet file.

    For every row of ``chunk`` the suburb's paginated result pages are walked
    to collect listing URLs, then each listing page is fetched and parsed.
    All rows are written in one go at the end to
    '../data/raw/work_<file_suffix>.parquet' (overwriting an existing file).
    A listing that cannot be fetched or parsed is reported and skipped.

    parameters:
    chunk: DataFrame slice of suburbs to scrape; must have 'locality' and 'postcode' columns
    file_suffix: what we want to title the end of our file when we write to parquet

    return:
    None
    """

    # Schema of the output; every field is stored as a nullable string
    schema = StructType([
        StructField("url", StringType(), True),
        StructField("postcode", StringType(), True),
        StructField("suburb", StringType(), True),
        StructField("name", StringType(), True),
        StructField("cost_text", StringType(), True),
        StructField("beds", StringType(), True),
        StructField("baths", StringType(), True),
        StructField("parking", StringType(), True),
        StructField("property_type", StringType(), True),
    ])

    # Rows are gathered in a plain list and turned into a Spark DataFrame once
    # at the end, instead of one union() per listing.
    scraped_rows = []

    # Loop through each suburb and its postcode
    for _, suburb_row in chunk.iterrows():
        suburb = suburb_row['locality'].lower().replace(' ', '-')  # Convert to lowercase and hyphenate
        postcode = suburb_row['postcode']

        print(f"Scraping data for {suburb} ({postcode})")

        url_links, page_found = _collect_listing_urls(suburb, postcode)
        if not page_found:
            print(f"No results for {suburb}. Moving to the next suburb.")
            continue

        # For each URL, scrape some basic metadata
        pbar = tqdm(url_links)
        success_count, total_count = 0, 0

        for property_url in pbar:
            # Counted before the request so failed fetches lower the success
            # rate and the percentage below never divides by zero.
            total_count += 1
            try:
                details = _parse_property(_fetch_soup(property_url))
                scraped_rows.append((property_url, postcode, suburb, *details))
                success_count += 1

            except AttributeError:
                print(f"Error scraping {property_url}: missing data")
            except Exception as e:
                # e.g. an HTTP or connection error: skip this listing rather
                # than lose everything scraped so far in the chunk
                print(f"Error fetching {property_url}: {e}")

            pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

    property_metadata = spark.createDataFrame(scraped_rows, schema)

    # Output to parquet file
    try:
        property_metadata.write.mode("overwrite").parquet('../data/raw/work_{}.parquet'.format(file_suffix))
        print("Data successfully written")
    except Exception as e:
        print(f"An error occured: {e}")

    # printed last so the end of a long cell output is easy to spot
    print("chunk finished")


In [16]:
def run_chunk(starting_chunk: int) -> None:
    """Scrape seven consecutive 25-postcode chunks of domain.com.au, beginning
    with the chunk keyed by ``starting_chunk`` (the work is split amongst
    group members, one call each).

    When the run ends exactly at chunk 3923, that chunk is scraped as well,
    followed by a final chunk covering postcodes 3924 to 3996, which is also
    stored in chunk_dict under 'chunk_3996'.

    Parameters:
    starting_chunk - upper postcode of the first chunk to scrape

    Return:
    None
    """
    # 7 chunks x 25 postcodes = offsets 0, 25, ..., 150
    for offset in range(0, 175, 25):
        upper = starting_chunk + offset
        start_scrape(chunk_dict["chunk_{}".format(upper)], upper)

    next_upper = starting_chunk + 175
    if next_upper == 3923:
        start_scrape(chunk_dict["chunk_3923"], 3923)
        # remaining postcodes above 3923 go into one last chunk
        postcodes = suburbs_df['postcode']
        chunk_dict['chunk_3996'] = suburbs_df[(postcodes >= next_upper + 1) & (postcodes < 3997)]
        start_scrape(chunk_dict['chunk_3996'], 3996)

In [20]:
#Davyn
# Runs chunk_3148 through chunk_3298.
# NOTE(review): the other members start at 3048 + k*175 and get_chunks' first key is chunk_3048;
# starting at 3148 leaves chunk_3048..chunk_3123 unassigned and overlaps chunk_3223..chunk_3298
# with the next member's cell - confirm 3148 is intended.
starting_chunk = 3148
run_chunk(starting_chunk)

Scraping data for auburn (3123)
Error fetching https://www.domain.com.au/rent/auburn-vic-3123/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for auburn. Moving to the next suburb.
Scraping data for hawthorn-east (3123)


  no_results = bs_object.find(text=re.compile("No results found", re.I))


No more results for hawthorn-east on page 6.


100% successful: 100%|██████████| 97/97 [01:16<00:00,  1.27it/s]


Scraping data for camberwell (3124)
No more results for camberwell on page 4.


100% successful: 100%|██████████| 48/48 [00:41<00:00,  1.17it/s]


Scraping data for camberwell-north (3124)
Error fetching https://www.domain.com.au/rent/camberwell-north-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for camberwell-north. Moving to the next suburb.
Scraping data for camberwell-south (3124)
Error fetching https://www.domain.com.au/rent/camberwell-south-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for camberwell-south. Moving to the next suburb.
Scraping data for camberwell-west (3124)
Error fetching https://www.domain.com.au/rent/camberwell-west-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for camberwell-west. Moving to the next suburb.
Scraping data for hartwell (3124)
No more results for hartwell on page 1.
No results for hartwell. Moving to the next suburb.
Scraping data for middle-camberwell (3124)
Error fetching https://www.domain.com.au/rent/middle-camberwell-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No 

100% successful: 100%|██████████| 60/60 [00:50<00:00,  1.18it/s]


Scraping data for surrey-hills-south (3125)
Error fetching https://www.domain.com.au/rent/surrey-hills-south-vic-3125/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for surrey-hills-south. Moving to the next suburb.
Scraping data for camberwell-east (3126)
No more results for camberwell-east on page 1.
No results for camberwell-east. Moving to the next suburb.
Scraping data for canterbury (3126)
No more results for canterbury on page 2.


100% successful: 100%|██████████| 16/16 [00:13<00:00,  1.20it/s]


Scraping data for mont-albert (3127)
No more results for mont-albert on page 2.


100% successful: 100%|██████████| 18/18 [00:14<00:00,  1.26it/s]


Scraping data for surrey-hills (3127)
No more results for surrey-hills on page 3.


100% successful: 100%|██████████| 29/29 [00:25<00:00,  1.15it/s]


Scraping data for surrey-hills-north (3127)
Error fetching https://www.domain.com.au/rent/surrey-hills-north-vic-3127/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for surrey-hills-north. Moving to the next suburb.
Scraping data for box-hill (3128)
No more results for box-hill on page 7.


100% successful: 100%|██████████| 116/116 [01:45<00:00,  1.10it/s]


Scraping data for box-hill-central (3128)
Error fetching https://www.domain.com.au/rent/box-hill-central-vic-3128/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for box-hill-central. Moving to the next suburb.
Scraping data for box-hill-south (3128)
No more results for box-hill-south on page 2.


100% successful: 100%|██████████| 10/10 [00:07<00:00,  1.40it/s]


Scraping data for houston (3128)
Error fetching https://www.domain.com.au/rent/houston-vic-3128/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for houston. Moving to the next suburb.
Scraping data for wattle-park (3128)
Error fetching https://www.domain.com.au/rent/wattle-park-vic-3128/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for wattle-park. Moving to the next suburb.
Scraping data for box-hill-north (3129)
No more results for box-hill-north on page 3.


100% successful: 100%|██████████| 30/30 [00:27<00:00,  1.08it/s]


Scraping data for kerrimuir (3129)
Error fetching https://www.domain.com.au/rent/kerrimuir-vic-3129/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for kerrimuir. Moving to the next suburb.
Scraping data for mont-albert-north (3129)
No more results for mont-albert-north on page 2.


100% successful: 100%|██████████| 13/13 [00:15<00:00,  1.21s/it]


Scraping data for blackburn (3130)
No more results for blackburn on page 3.


100% successful:  14%|█▍        | 4/28 [00:03<00:23,  1.01it/s]


KeyboardInterrupt: 

In [None]:
#Arpan
# Runs chunk_3223 through chunk_3373 (seven chunks of 25 postcodes)
starting_chunk = 3048 + 175
run_chunk(starting_chunk)

In [None]:
#Priscilla
# Runs chunk_3398 through chunk_3548 (seven chunks of 25 postcodes)
starting_chunk = 3048 + 350
run_chunk(starting_chunk)

In [None]:
#Rachel
# Runs chunk_3573 through chunk_3723 (seven chunks of 25 postcodes)
starting_chunk = 3048 + 525
run_chunk(starting_chunk)

In [None]:
#Nathan
# Runs chunk_3748 through chunk_3898; this start also triggers run_chunk's final branch,
# which scrapes chunk_3923 and then postcodes 3924-3996 as chunk_3996
starting_chunk = 3048 + 700
run_chunk(starting_chunk)

In [7]:
#scrape malvern
# Rebuild chunk_3148 with postcode 3123 included (>= 3123); get_chunks builds it with a
# strict lower bound (> 3123). Then scrape just that chunk.
# NOTE(review): this overwrites chunk_dict['chunk_3148'] in place, so any later run_chunk call
# in the same kernel sees the widened chunk - re-run the get_chunks cell to restore it.
temp = suburbs_df[suburbs_df['postcode'] >= 3123]
chunk_dict['chunk_3148'] = temp[temp['postcode'] <= 3148]
start_scrape(chunk_dict['chunk_3148'], 3148)

Scraping data for auburn (3123)
Error fetching https://www.domain.com.au/rent/auburn-vic-3123/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for auburn. Moving to the next suburb.
Scraping data for hawthorn-east (3123)


  no_results = bs_object.find(text=re.compile("No results found", re.I))


No more results for hawthorn-east on page 6.


100% successful: 100%|██████████| 97/97 [00:58<00:00,  1.66it/s]


Scraping data for camberwell (3124)
No more results for camberwell on page 4.


100% successful: 100%|██████████| 50/50 [00:27<00:00,  1.85it/s]


Scraping data for camberwell-north (3124)
Error fetching https://www.domain.com.au/rent/camberwell-north-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for camberwell-north. Moving to the next suburb.
Scraping data for camberwell-south (3124)
Error fetching https://www.domain.com.au/rent/camberwell-south-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for camberwell-south. Moving to the next suburb.
Scraping data for camberwell-west (3124)
Error fetching https://www.domain.com.au/rent/camberwell-west-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for camberwell-west. Moving to the next suburb.
Scraping data for hartwell (3124)
No more results for hartwell on page 1.
No results for hartwell. Moving to the next suburb.
Scraping data for middle-camberwell (3124)
Error fetching https://www.domain.com.au/rent/middle-camberwell-vic-3124/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No 

100% successful: 100%|██████████| 60/60 [00:35<00:00,  1.71it/s]


Scraping data for surrey-hills-south (3125)
Error fetching https://www.domain.com.au/rent/surrey-hills-south-vic-3125/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for surrey-hills-south. Moving to the next suburb.
Scraping data for camberwell-east (3126)
No more results for camberwell-east on page 1.
No results for camberwell-east. Moving to the next suburb.
Scraping data for canterbury (3126)
No more results for canterbury on page 2.


100% successful: 100%|██████████| 17/17 [00:10<00:00,  1.67it/s]


Scraping data for mont-albert (3127)
No more results for mont-albert on page 2.


100% successful: 100%|██████████| 18/18 [00:10<00:00,  1.69it/s]


Scraping data for surrey-hills (3127)
No more results for surrey-hills on page 3.


100% successful: 100%|██████████| 29/29 [00:16<00:00,  1.75it/s]


Scraping data for surrey-hills-north (3127)
Error fetching https://www.domain.com.au/rent/surrey-hills-north-vic-3127/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for surrey-hills-north. Moving to the next suburb.
Scraping data for box-hill (3128)
No more results for box-hill on page 7.


100% successful: 100%|██████████| 113/113 [01:08<00:00,  1.66it/s]


Scraping data for box-hill-central (3128)
Error fetching https://www.domain.com.au/rent/box-hill-central-vic-3128/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for box-hill-central. Moving to the next suburb.
Scraping data for box-hill-south (3128)
No more results for box-hill-south on page 2.


100% successful: 100%|██████████| 11/11 [00:07<00:00,  1.45it/s]


Scraping data for houston (3128)
Error fetching https://www.domain.com.au/rent/houston-vic-3128/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for houston. Moving to the next suburb.
Scraping data for wattle-park (3128)
Error fetching https://www.domain.com.au/rent/wattle-park-vic-3128/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for wattle-park. Moving to the next suburb.
Scraping data for box-hill-north (3129)
No more results for box-hill-north on page 3.


100% successful: 100%|██████████| 30/30 [00:17<00:00,  1.76it/s]


Scraping data for kerrimuir (3129)
Error fetching https://www.domain.com.au/rent/kerrimuir-vic-3129/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for kerrimuir. Moving to the next suburb.
Scraping data for mont-albert-north (3129)
No more results for mont-albert-north on page 2.


100% successful: 100%|██████████| 13/13 [00:07<00:00,  1.78it/s]


Scraping data for blackburn (3130)
No more results for blackburn on page 3.


100% successful: 100%|██████████| 30/30 [00:17<00:00,  1.76it/s]


Scraping data for blackburn-north (3130)
No more results for blackburn-north on page 2.


100% successful: 100%|██████████| 3/3 [00:02<00:00,  1.46it/s]


Scraping data for blackburn-south (3130)
No more results for blackburn-south on page 2.


100% successful: 100%|██████████| 11/11 [00:06<00:00,  1.57it/s]


Scraping data for laburnum (3130)
Error fetching https://www.domain.com.au/rent/laburnum-vic-3130/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for laburnum. Moving to the next suburb.
Scraping data for brentford-square (3131)
Error fetching https://www.domain.com.au/rent/brentford-square-vic-3131/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for brentford-square. Moving to the next suburb.
Scraping data for forest-hill (3131)
No more results for forest-hill on page 3.


100% successful: 100%|██████████| 24/24 [00:13<00:00,  1.74it/s]


Scraping data for nunawading (3131)
No more results for nunawading on page 3.


100% successful: 100%|██████████| 24/24 [00:14<00:00,  1.67it/s]


Scraping data for mitcham (3132)
No more results for mitcham on page 3.


100% successful: 100%|██████████| 33/33 [00:18<00:00,  1.81it/s]


Scraping data for mitcham-north (3132)
Error fetching https://www.domain.com.au/rent/mitcham-north-vic-3132/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for mitcham-north. Moving to the next suburb.
Scraping data for rangeview (3132)
Error fetching https://www.domain.com.au/rent/rangeview-vic-3132/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for rangeview. Moving to the next suburb.
Scraping data for vermont (3133)
No more results for vermont on page 2.


100% successful: 100%|██████████| 11/11 [00:08<00:00,  1.23it/s]


Scraping data for vermont-south (3133)
No more results for vermont-south on page 3.


100% successful: 100%|██████████| 21/21 [00:14<00:00,  1.49it/s]


Scraping data for heathwood (3134)
Error fetching https://www.domain.com.au/rent/heathwood-vic-3134/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for heathwood. Moving to the next suburb.
Scraping data for ringwood (3134)
No more results for ringwood on page 3.


100% successful: 100%|██████████| 33/33 [00:19<00:00,  1.71it/s]


Scraping data for ringwood-north (3134)
No more results for ringwood-north on page 2.


100% successful: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s]


Scraping data for warrandyte-south (3134)
No more results for warrandyte-south on page 1.
No results for warrandyte-south. Moving to the next suburb.
Scraping data for warranwood (3134)
No more results for warranwood on page 2.


100% successful: 100%|██████████| 2/2 [00:01<00:00,  1.78it/s]


Scraping data for bedford-road (3135)
Error fetching https://www.domain.com.au/rent/bedford-road-vic-3135/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for bedford-road. Moving to the next suburb.
Scraping data for heathmont (3135)
No more results for heathmont on page 2.


100% successful: 100%|██████████| 4/4 [00:02<00:00,  1.64it/s]


Scraping data for ringwood-east (3135)
No more results for ringwood-east on page 2.


100% successful: 100%|██████████| 15/15 [00:10<00:00,  1.44it/s]


Scraping data for croydon (3136)
No more results for croydon on page 3.


100% successful:  71%|███████   | 17/24 [00:09<00:03,  1.80it/s]