In [3]:
## METHOD 1: convert dictionary to spark dataframe and append to initialized sdf
# built-in imports
import re
from json import dump, load
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd  
import os
# Import Spark modules
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Domain Scraper") \
    .getOrCreate()

#### create a spark data frame

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2)  # Update this to your liking

#Scrape suburb from the address
def extract_suburb(address: str) -> str:
    """Extract the suburb name from the property address."""
    match = re.search(r'(?<=, )\w+', address)
    if match:
        return match.group(0)
    return "Unknown"


def start_scrape() -> None:
    """ Function that scrapes https://www.domain.com.au and outputs the data into a json file

    Parameters:
    None

    Returns:
    None
    """
    schema = StructType([
    StructField("url", StringType(), True),
    StructField("name", StringType(), True),
    StructField("cost_text", StringType(), True),
    StructField("rooms", StringType(), True),
    StructField("desc", StringType(), True),
    StructField("parking", StringType(), True),
    StructField("street", StringType(), True),
    StructField("suburb", StringType(), True),
    StructField("postcode", StringType(), True),
    StructField("propertyType", StringType(), True),
    StructField("school", StringType(), True),
    StructField("features", StringType(), True),
    ])

    # begin code
    url_links = []
    property_metadata = defaultdict(dict)
    sdf = spark.createDataFrame([],schema)
    

    # generate list of urls to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

        # find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.
        index_links = bs_object.find("ul", {"data-testid": "results"}).findAll(
            "a", href=re.compile(f"{BASE_URL}/*")  # the `*` denotes wildcard any
        )

        for link in index_links:
            # if it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # for each url, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
            total_count += 1

            property_page = urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"}))
            property_soup = BeautifulSoup(property_page, "lxml")

            # looks for the header class to get property name
            property_metadata[property_url]['name'] = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

            # looks for the div containing a summary title for cost
            property_metadata[property_url]['cost_text'] = bs_object.find(
                "div", {"data-testid": "listing-details__summary-title"}
            ).text.strip()


            # get rooms and parking
            rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                "span", {"data-testid": "property-features-text-container"}
            )

            # rooms
            property_metadata[property_url]['rooms'] = ", ".join(
                [re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Bed' in feature.text or 'Bath' in feature.text]
            )

            # parking
            property_metadata[property_url]['parking'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Parking' in feature.text]
            )

            # desc
            property_metadata[property_url]['desc'] = ", ".join(
                [re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'desc' in feature.text]
            )
            
            # listingID:
            property_metadata[property_url]['listingid'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'listingId' in feature.text]
            )

            # street:
            property_metadata[property_url]['street'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'street' in feature.text]
            )

            # suburb:
            property_metadata[property_url]['suburb'] = extract_suburb(property_metadata[property_url]['name'])

            
            # postcode:
            property_metadata[property_url]['postcode'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'postcode' in feature.text]
            )

            # property type:
            property_metadata[property_url]['propertyType'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'apartment' in feature.text 
                 or 'unit' in feature.text or 'house' in feature.text or 'flat' in feature.text]
            )

            # schools:
            property_metadata[property_url]['school'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'school' in feature.text]
            )

            # features:
            property_metadata[property_url]['features'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'feature' in feature.text]
            )

            # loanfinder:
            property_metadata[property_url]['loan'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'loan' in feature.text]
            )

            # listingSummary:
            property_metadata[property_url]['listingsummary'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'summary' in feature.text]
            )

            # suburb insights:
            property_metadata[property_url]['suburbInsights'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'suburbInsights' in feature.text]
            )

            # property description
            property_metadata[property_url]['desc'] = bs_object.find("p").text.strip() if bs_object.find("p") else "N/A"


            # Scrape property description
            property_metadata[property_url]['desc'] = re.sub(r'<br\/>', '\n', str(property_soup.find("p"))).strip('</p>')
           
            """
            # Write each row to the CSV
            writer.writerow([
                property_url,
                property_metadata[property_url]['name'],
                property_metadata[property_url]['cost_text'],
                property_metadata[property_url]['rooms'],
                property_metadata[property_url]['parking'],
                property_metadata[property_url]['desc'],
                property_metadata[property_url]['listingid'],
                property_metadata[property_url]['street'],
                property_metadata[property_url]['suburb'],
                property_metadata[property_url]['postcode'],
                property_metadata[property_url]['propertyType'],
                property_metadata[property_url]['school'],
                property_metadata[property_url]['features'],
                property_metadata[property_url]['loan'],
                property_metadata[property_url]['listingsummary'],
                property_metadata[property_url]['suburbInsights']
            ])
            """
            success_count += 1
            temp_sdf = spark.createDataFrame(property_metadata)
            sdf.union(temp_sdf)

        except AttributeError:
            print(f"Issue with {property_url}")

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

        # output to example json in data/raw/
    with open('../data/raw/example.json', 'w') as f:
        dump(property_metadata, f)

def convert_to_parquet(filepath: str, output_path: str) -> None:
    """ Function converts a json file into a parquet file

    Parameters:
    filepath (str): the filepath that locates our json data

    output_path (str): the filepath that we will place our new parquet file into

    Returns:
    None
    """
    with open(filepath) as f:
        data = load(f)

    new_data = change_json_format(data)

    # conversion from json -> dataframe -> parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    delete_json_file(filepath)

# function that changes the formatting of the json file
def change_json_format(data: dict) -> dict:
    """ Function grabs the renames the json keys to the words after the last backslash in the url and adds the url as an item

    Parameters:
    data (dict): json dictionary we are changing

    Returns:
    dict: our new json dictionary
    
    """
    new_data = {}
    for i in data.keys():
        new_name = i.rsplit('/', 1)[-1]
        new_data[new_name] = data[i]
        new_data[new_name]["href"] = i
    return new_data

def delete_json_file(filepath: str) -> None:
    """ Function deletes the json file we are converting from

    Parameters:
    filepath (string): filepath to the json file we are deleting

    Returns:
    None
    """
    try:
        os.remove(filepath)
        print(f"File '{filepath}' deleted successfully")
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")

24/09/14 16:42:27 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [1]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

from pyarrow import json
import pyarrow.parquet as pq

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/14 17:12:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
def convert_to_parquet(filepath: str, output_path: str) -> None:
    """ Function converts a JSON file into a parquet file """
    with open(filepath) as f:
        data = load(f)

    new_data = change_json_format(data)

    # Conversion from JSON -> DataFrame -> Parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    delete_json_file(filepath)

def change_json_format(data: dict) -> dict:
    """ Function renames JSON keys and adds the URL as an item """
    new_data = {}
    for i in data.keys():
        new_name = i.rsplit('/', 1)[-1]
        new_data[new_name] = data[i]
        new_data[new_name]["href"] = i
    return new_data

def delete_json_file(filepath: str) -> None:
    """ Function deletes the JSON file """
    try:
        os.remove(filepath)
        print(f"File '{filepath}' deleted successfully")
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")
    
def get_chunks(suburbs_df) -> dict:
    """function that splits up postcodes into chunks of 50 so that if we are kicked halfway during scraping we don't lose too much progress
    """
    chunk_dict = {}
    
    i = 3048
    j = 3023  
    while i < 3997:
        temp = suburbs_df[suburbs_df['postcode'] >= j]
        chunk_dict['chunk_{}'.format(i)] = temp[temp['postcode'] <= i]
        j += 25
        i += 25

    return chunk_dict

In [7]:
chunk_dict = get_chunks(suburbs_df)
for i in chunk_dict:
    print(i)

chunk_3048
chunk_3073
chunk_3098
chunk_3123
chunk_3148
chunk_3173
chunk_3198
chunk_3223
chunk_3248
chunk_3273
chunk_3298
chunk_3323
chunk_3348
chunk_3373
chunk_3398
chunk_3423
chunk_3448
chunk_3473
chunk_3498
chunk_3523
chunk_3548
chunk_3573
chunk_3598
chunk_3623
chunk_3648
chunk_3673
chunk_3698
chunk_3723
chunk_3748
chunk_3773
chunk_3798
chunk_3823
chunk_3848
chunk_3873
chunk_3898
chunk_3923
chunk_3948
chunk_3973


1. Run cell above
2. Run cell below 
3. Run cell below the cell below
4. Run property_metadata.write

In [35]:
# Working METHOD
import re
from json import dump
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import pandas as pd

# Initialize Spark session
spark = SparkSession.builder.master('local[*]') \
    .config("spark.driver.memory", "15g") \
    .config("spark.executor.memory", "16g") \
    .appName("PropertyScraper") \
    .getOrCreate()

# Constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 50)  # Max number of pages you want to scrape  

# Load suburbs CSV
suburbs_df = pd.read_csv('postcodes.csv')  # Ensure this CSV contains 'suburb' and 'postcode' columns
chunk_dict = get_chunks(suburbs_df)

def start_scrape(chunk, file_suffix):
    """Function that scrapes https://www.domain.com.au and outputs the data into a JSON file
    
    parameters:
    chunk: chunk of 50 postcodes we will scrape
    file_suffix: what we want to title the end of our files when we write to json
    """

    # Define schema for the Spark DataFrame
    schema = StructType([
        StructField("url", StringType(), True),
        StructField("postcode", StringType(), True),
        StructField("suburb", StringType(), True),
        StructField("name", StringType(), True),
        StructField("cost_text", StringType(), True),
        StructField("beds", StringType(), True),  # Separate field for beds
        StructField("baths", StringType(), True),  # Separate field for baths
        StructField("parking", StringType(), True),  # Parking field
        StructField("property_type", StringType(), True),  # Property type field
    ])


    # Initialize an empty DataFrame with the schema
    property_metadata = spark.createDataFrame([], schema)

    # Loop through each suburb and its postcode
    for index, row in chunk.iterrows():
        suburb = row['locality'].lower().replace(' ', '-')  # Convert to lowercase and hyphenate
        postcode = row['postcode']

        print(f"Scraping data for {suburb} ({postcode})")

        url_links = []
        page_found = False  # This flag will help us track whether any results are found

        # Generate list of URLs to visit
        for page in N_PAGES:
            url = BASE_URL + f"/rent/{suburb}-vic-{postcode}/?ssubs=0&sort=suburb-asc&page={page}"
            try:
                bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

                # Check if the page has results or shows "No results found"
                no_results = bs_object.find(text=re.compile("No results found", re.I))
                if no_results:
                    print(f"No results found for {suburb} on page {page}. Stopping further scraping for this suburb.")
                    break  # Exit the pagination loop if no results are found

                # Find property links
                index_links = bs_object.find("ul", {"data-testid": "results"})
                if not index_links:
                    print(f"No more results for {suburb} on page {page}.")
                    break  # Exit pagination if no results list is found (end of pages)

                index_links = index_links.findAll("a", href=re.compile(f"{BASE_URL}/*"))
                page_found = True  # At least one result was found on this page

                for link in index_links:
                    # If it's a property address, add it to the list
                    if 'address' in link.get('class', []):
                        url_links.append(link['href'])

            except Exception as e:
                print(f"Error fetching {url}: {e}")
                break  # Stop if there's an issue with fetching the page

        if not page_found:
            print(f"No results for {suburb}. Moving to the next suburb.")
            continue  # Skip to the next suburb if no pages were found for this one

        # For each URL, scrape some basic metadata
        pbar = tqdm(url_links)
        success_count, total_count = 0, 0

        for property_url in pbar:
            try:
                bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
                total_count += 1

                # Get property name
                name = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

                # Get cost text
                cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text.strip()

                # Get rooms (beds and baths)
                rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                    "span", {"data-testid": "property-features-text-container"}
                )

                # Initialize variables
                beds, baths, parking = None, None, '0'  # Default value for parking is '0 Car'

                for feature in rooms:
                    text = feature.text
                    if 'Bed' in text:
                        beds_match = re.findall(r'\d+', text)
                        if beds_match:
                            beds = beds_match[0]  # Extract the number of beds
                    elif 'Bath' in text:
                        baths_match = re.findall(r'\d+', text)
                        if baths_match:
                            baths = baths_match[0]  # Extract the number of baths
                    elif 'Car' in text or 'Parking' in text:
                        parking_match = re.findall(r'\d+', text)
                        if parking_match:
                            parking = parking_match[0]  # Extract the number of parking spaces

                property_type_container = bs_object.find("div", {"data-testid": "listing-summary-property-type"})
                property_type = property_type_container.get_text(strip=True)

                # Create a row and append it to the DataFrame
                row = [(property_url, postcode, suburb, name, cost_text, beds, baths, parking, property_type)]
                row_df = spark.createDataFrame(row, schema)
                property_metadata = property_metadata.union(row_df)
                success_count += 1

            except AttributeError:
                print(f"Error scraping {property_url}: missing data")

            pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

        # Show the DataFrame to ensure data is being appended
        #property_metadata.show()

    # Output to parquet file
    try:
        property_metadata.write.mode("overwrite").parquet('../data/raw/work_{}.parquet'.format(file_suffix))
        print(f"Data successfully written")
    except Exception as e:
       print(f"An error occured: {e}")

    #added this print statement so that the cell output can be scrollable - it's getting annoying to click the scroll bar >:(
    print("chunk finished")
    #return property_metadata



In [8]:
# start scraping by chunks of 50
#for i in chunk_dict:
 #   start_scrape(chunk_dict[i], i.split("_")[1])
property_metadata = start_scrape(chunk_dict['chunk_3000'], '3000')  ## changed 3050 , 3050

Scraping data for melbourne (3000)


  no_results = bs_object.find(text=re.compile("No results found", re.I))


No more results for melbourne on page 45.


100% successful: 100%|██████████| 869/869 [08:26<00:00,  1.72it/s]

chunk finished





In [9]:
property_metadata.show(5)

24/09/14 17:28:53 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
24/09/14 17:28:55 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
24/09/14 17:28:55 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
24/09/14 17:28:57 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB


+--------------------+--------+---------+--------------------+-------------+----+-----+-------+--------------------+
|                 url|postcode|   suburb|                name|    cost_text|beds|baths|parking|       property_type|
+--------------------+--------+---------+--------------------+-------------+----+-----+-------+--------------------+
|https://www.domai...|    3000|melbourne|3113/639 Lonsdale...|    $1,200.00|   3|    2|      1|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|1503/270 King Str...|$850 Per Week|   4|    1|      1|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|403/639 Lonsdale ...| $750per week|   2|    2|      0|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|3409/138 Spencer ...|      $625 pw|   1|    1|      0|Apartment / Unit ...|
|https://www.domai...|    3000|melbourne|103/300 Swanston ...|      $620 pw|   1|    1|      1|Apartment / Unit ...|
+--------------------+--------+---------+--------------------+--

In [10]:
property_metadata.write.mode("overwrite").parquet("../data/raw/work_3000.parquet") 

24/09/14 17:29:53 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB
                                                                                

In [11]:
sdf = spark.read.parquet("../data/raw/work_3000.parquet")

In [12]:
sdf

url,postcode,suburb,name,cost_text,beds,baths,parking,property_type
https://www.domai...,3000,melbourne,4307/639 Little L...,$600 and Fully Fu...,1,1,0,Apartment / Unit ...
https://www.domai...,3000,melbourne,2213/27 Little Co...,$750 a week and F...,2,2,0,Apartment / Unit ...
https://www.domai...,3000,melbourne,4207/371 Little L...,$720 per week opp...,2,1,0,Apartment / Unit ...
https://www.domai...,3000,melbourne,99 Franklin Stree...,"Furnished, bills,...",1,1,0,Apartment / Unit ...
https://www.domai...,3000,melbourne,1302/279-283 La T...,$650 and Fully Fu...,2,1,0,Apartment / Unit ...
https://www.domai...,3000,melbourne,103/19 Exploratio...,$540 Per Week Inc...,1,1,0,Studio
https://www.domai...,3000,melbourne,913/22-24 Jane Be...,$520 and Fully Fu...,1,1,1,Apartment / Unit ...
https://www.domai...,3000,melbourne,1202/601 Little C...,"$620 per week, $2...",2,1,0,Apartment / Unit ...
https://www.domai...,3000,melbourne,4211/371 Little L...,$750 Per Week,2,1,0,Apartment / Unit ...
https://www.domai...,3000,melbourne,1308/138 Spencer ...,$800 and Fully Fu...,2,2,0,Apartment / Unit ...


In [5]:
temp = suburbs_df[suburbs_df['postcode'] >= 3950]
chunk_dict['chunk_3997'] = temp[temp['postcode'] < 3997]

In [None]:
postcode = list(range(3001, 4000))  # List of postcodes
chunk_size = 50  # Define the chunk size

# Loop over the postcodes in chunks of 50
for i in range(0, len(postcode), chunk_size):
    # Extract a chunk of 50 postcodes
    chunk = postcode[i:i + chunk_size]
    
    # Convert chunk to string or appropriate format for your function
    chunk_name = f'chunk_{i // chunk_size + 1}'
    
    # Call start_scrape function with the chunk
    property_metadata = start_scrape(chunk_dict[chunk_name], f'{chunk}')


In [6]:
for i in chunk_dict[1:]:
    start_scrape(chunk_dict[i])

TypeError: unhashable type: 'slice'

In [7]:

schema = StructType([
        StructField("url", StringType(), True),
        StructField("postcode", StringType(), True),
        StructField("suburb", StringType(), True),
        StructField("name", StringType(), True),
        StructField("cost_text", StringType(), True),
        StructField("beds", StringType(), True),  # Separate field for beds
        StructField("baths", StringType(), True),  # Separate field for baths
        StructField("parking", StringType(), True),  # Parking field
        StructField("property_type", StringType(), True),  # Property type field
    ])
work = spark.read.schema(schema).json('../data/raw/work_3050.json')

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Users/davynr06/Documents/MAST30034/project-2-group-real-estate-industry-project-22/data/raw/work_3050.json.

In [7]:
work.show()

+---+--------+------+----+---------+----+-----+-------+-------------+
|url|postcode|suburb|name|cost_text|beds|baths|parking|property_type|
+---+--------+------+----+---------+----+-----+-------+-------------+
+---+--------+------+----+---------+----+-----+-------+-------------+



In [49]:
import pandas as pd
import os

# Define the folder path containing JSON files
folder_path = '../data/raw/work.json'

# List all files in the directory
json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

# Initialize an empty list to store DataFrames
dataframes = []

# Read each JSON file into a DataFrame
for file in json_files:
    file_path = os.path.join(folder_path, file)
    # Read JSON file
    df = pd.read_json(file_path, lines=True)
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(combined_df.head(12))


                                                  url  postcode   suburb  \
0   https://www.domain.com.au/34-evadene-drive-tar...      3029  tarneit   
1   https://www.domain.com.au/434-bethany-road-tar...      3029  tarneit   
2   https://www.domain.com.au/58-antonio-road-tarn...      3029  tarneit   
3   https://www.domain.com.au/12-lindeman-street-t...      3029  tarneit   
4   https://www.domain.com.au/3-imatra-loop-tarnei...      3029  tarneit   
5   https://www.domain.com.au/84-lucania-crescent-...      3029  tarneit   
6   https://www.domain.com.au/8-keeping-terrace-ta...      3029  tarneit   
7   https://www.domain.com.au/40-kamala-drive-tarn...      3029  tarneit   
8   https://www.domain.com.au/48-riland-boulevard-...      3029  tarneit   
9   https://www.domain.com.au/9-ceremony-drive-tar...      3029  tarneit   
10  https://www.domain.com.au/11-ogawa-walk-tarnei...      3029  tarneit   
11  https://www.domain.com.au/tarneit-vic-3029-151...      3029  tarneit   

           

Please run these code chunks after first running create_chunk(), start_scrape() and run_chunk() methods: \
1st Cell: Davyn \
2nd Cell: Arpan \
3rd Cell: Priscilla \
4th Cell: Rachel \
5th Cell: Nathan 



In [37]:
def run_chunk(starting_chunk):
    i = starting_chunk
    # we are running chunks of 25 postcodes 7 times each
    while i < starting_chunk + 175:
        start_scrape(chunk_dict["chunk_{}".format(i)], i) #i.split("_")[1])
        i += 25
    if i == 3923:
        temp = suburbs_df[suburbs_df['postcode'] >= i + 1]
        chunk_dict['chunk_3996'] = temp[temp['postcode'] < 3997]
        start_scrape(chunk_dict['chunk_3996'], 3996)
        
    


In [None]:
#Davyn
starting_chunk = 3048
run_chunk(starting_chunk)

In [None]:
#Arpan
starting_chunk = 3048 + 175
run_chunk(starting_chunk)

In [None]:
#Priscilla
starting_chunk = 3048 + 350
run_chunk(starting_chunk)

In [None]:
#Rachel
starting_chunk = 3048 + 525
run_chunk(starting_chunk)

In [38]:
#Nathan
starting_chunk = 3048 + 700
run_chunk(starting_chunk)

Scraping data for archerton (3723)


  no_results = bs_object.find(text=re.compile("No results found", re.I))


No more results for archerton on page 1.
No results for archerton. Moving to the next suburb.
Scraping data for barjarg (3723)
No more results for barjarg on page 1.
No results for barjarg. Moving to the next suburb.
Scraping data for boorolite (3723)
No more results for boorolite on page 1.
No results for boorolite. Moving to the next suburb.
Scraping data for bridge-creek (3723)
No more results for bridge-creek on page 1.
No results for bridge-creek. Moving to the next suburb.
Scraping data for delatite (3723)
No more results for delatite on page 1.
No results for delatite. Moving to the next suburb.
Scraping data for enochs-point (3723)
No more results for enochs-point on page 1.
No results for enochs-point. Moving to the next suburb.
Scraping data for gaffneys-creek (3723)
No more results for gaffneys-creek on page 1.
No results for gaffneys-creek. Moving to the next suburb.
Scraping data for goughs-bay (3723)
No more results for goughs-bay on page 1.
No results for goughs-bay. Mov

100% successful: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s]


Scraping data for mount-buller (3723)
No more results for mount-buller on page 1.
No results for mount-buller. Moving to the next suburb.
Scraping data for mountain-bay (3723)
No more results for mountain-bay on page 1.
No results for mountain-bay. Moving to the next suburb.
Scraping data for nillahcootie (3723)
Error fetching https://www.domain.com.au/rent/nillahcootie-vic-3723/?ssubs=0&sort=suburb-asc&page=1: HTTP Error 404: Not Found
No results for nillahcootie. Moving to the next suburb.
Scraping data for piries (3723)
No more results for piries on page 1.
No results for piries. Moving to the next suburb.
Scraping data for sawmill-settlement (3723)
No more results for sawmill-settlement on page 1.
No results for sawmill-settlement. Moving to the next suburb.
Scraping data for tolmie (3723)
No more results for tolmie on page 1.
No results for tolmie. Moving to the next suburb.
Scraping data for woods-point (3723)
No more results for woods-point on page 1.
No results for woods-point.

100% successful: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]


Scraping data for major-plains (3725)
No more results for major-plains on page 1.
No results for major-plains. Moving to the next suburb.
Scraping data for stewarton (3725)
No more results for stewarton on page 1.
No results for stewarton. Moving to the next suburb.
Scraping data for bungeet (3726)
No more results for bungeet on page 1.
No results for bungeet. Moving to the next suburb.
Scraping data for bungeet-west (3726)
No more results for bungeet-west on page 1.
No results for bungeet-west. Moving to the next suburb.
Scraping data for devenish (3726)
No more results for devenish on page 1.
No results for devenish. Moving to the next suburb.
Scraping data for thoona (3726)
No more results for thoona on page 1.
No results for thoona. Moving to the next suburb.
Scraping data for almonds (3727)
No more results for almonds on page 1.
No results for almonds. Moving to the next suburb.
Scraping data for lake-rowan (3727)
No more results for lake-rowan on page 1.
No results for lake-rowan

100% successful: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s]


Scraping data for wilby (3728)
No more results for wilby on page 1.
No results for wilby. Moving to the next suburb.
Scraping data for youarang (3728)
No more results for youarang on page 1.
No results for youarang. Moving to the next suburb.
Scraping data for bathumi (3730)
No more results for bathumi on page 1.
No results for bathumi. Moving to the next suburb.
Scraping data for boosey (3730)
No more results for boosey on page 1.
No results for boosey. Moving to the next suburb.
Scraping data for bundalong (3730)
No more results for bundalong on page 2.


100% successful: 100%|██████████| 2/2 [00:01<00:00,  1.86it/s]


Scraping data for bundalong-south (3730)
No more results for bundalong-south on page 1.
No results for bundalong-south. Moving to the next suburb.
Scraping data for burramine (3730)
No more results for burramine on page 1.
No results for burramine. Moving to the next suburb.
Scraping data for burramine-south (3730)
No more results for burramine-south on page 1.
No results for burramine-south. Moving to the next suburb.
Scraping data for esmond (3730)
No more results for esmond on page 1.
No results for esmond. Moving to the next suburb.
Scraping data for telford (3730)
No more results for telford on page 1.
No results for telford. Moving to the next suburb.
Scraping data for yarrawonga (3730)
No more results for yarrawonga on page 2.


100% successful:  39%|███▉      | 7/18 [00:03<00:05,  1.97it/s]


KeyboardInterrupt: 

In [27]:
3048+175 + 175

3398

chunk_3048
chunk_3073
chunk_3098
chunk_3123
chunk_3148
chunk_3173
chunk_3198
chunk_3223
chunk_3248
chunk_3273
chunk_3298
chunk_3323
chunk_3348
chunk_3373
chunk_3398
chunk_3423
chunk_3448
chunk_3473
chunk_3498
chunk_3523
chunk_3548
chunk_3573
chunk_3598
chunk_3623
chunk_3648
chunk_3673
chunk_3698
chunk_3723
chunk_3748
chunk_3773
chunk_3798
chunk_3823
chunk_3848
chunk_3873
chunk_3898
chunk_3923
chunk_3948
chunk_3973
