In [70]:
## METHOD 1: convert dictionary to spark dataframe and append to initialized sdf
# built-in imports
import re
from json import dump, load
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import pandas as pd  
import os
# Import Spark modules
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Domain Scraper") \
    .getOrCreate()

#### create a spark data frame

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2)  # Update this to your liking

#Scrape suburb from the address
def extract_suburb(address: str) -> str:
    """Extract the suburb name from the property address."""
    match = re.search(r'(?<=, )\w+', address)
    if match:
        return match.group(0)
    return "Unknown"


def start_scrape() -> None:
    """ Function that scrapes https://www.domain.com.au and outputs the data into a json file

    Parameters:
    None

    Returns:
    None
    """
    schema = StructType([
    StructField("url", StringType(), True),
    StructField("name", StringType(), True),
    StructField("cost_text", StringType(), True),
    StructField("rooms", StringType(), True),
    StructField("desc", StringType(), True),
    StructField("parking", StringType(), True),
    StructField("street", StringType(), True),
    StructField("suburb", StringType(), True),
    StructField("postcode", StringType(), True),
    StructField("propertyType", StringType(), True),
    StructField("school", StringType(), True),
    StructField("features", StringType(), True),
    ])

    # begin code
    url_links = []
    property_metadata = defaultdict(dict)
    sdf = spark.createDataFrame([],schema)
    

    # generate list of urls to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

        # find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.
        index_links = bs_object.find("ul", {"data-testid": "results"}).findAll(
            "a", href=re.compile(f"{BASE_URL}/*")  # the `*` denotes wildcard any
        )

        for link in index_links:
            # if it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # for each url, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
            total_count += 1

            property_page = urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"}))
            property_soup = BeautifulSoup(property_page, "lxml")

            # looks for the header class to get property name
            property_metadata[property_url]['name'] = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

            # looks for the div containing a summary title for cost
            property_metadata[property_url]['cost_text'] = bs_object.find(
                "div", {"data-testid": "listing-details__summary-title"}
            ).text.strip()


            # get rooms and parking
            rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                "span", {"data-testid": "property-features-text-container"}
            )

            # rooms
            property_metadata[property_url]['rooms'] = ", ".join(
                [re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Bed' in feature.text or 'Bath' in feature.text]
            )

            # parking
            property_metadata[property_url]['parking'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Parking' in feature.text]
            )

            # desc
            property_metadata[property_url]['desc'] = ", ".join(
                [re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'desc' in feature.text]
            )
            
            # listingID:
            property_metadata[property_url]['listingid'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'listingId' in feature.text]
            )

            # street:
            property_metadata[property_url]['street'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'street' in feature.text]
            )

            # suburb:
            property_metadata[property_url]['suburb'] = extract_suburb(property_metadata[property_url]['name'])

            
            # postcode:
            property_metadata[property_url]['postcode'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'postcode' in feature.text]
            )

            # property type:
            property_metadata[property_url]['propertyType'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'apartment' in feature.text 
                 or 'unit' in feature.text or 'house' in feature.text or 'flat' in feature.text]
            )

            # schools:
            property_metadata[property_url]['school'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'school' in feature.text]
            )

            # features:
            property_metadata[property_url]['features'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'feature' in feature.text]
            )

            # loanfinder:
            property_metadata[property_url]['loan'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'loan' in feature.text]
            )

            # listingSummary:
            property_metadata[property_url]['listingsummary'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'summary' in feature.text]
            )

            # suburb insights:
            property_metadata[property_url]['suburbInsights'] = ", ".join(
                [re.findall(r'\S+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'suburbInsights' in feature.text]
            )

            # property description
            property_metadata[property_url]['desc'] = bs_object.find("p").text.strip() if bs_object.find("p") else "N/A"


            # Scrape property description
            property_metadata[property_url]['desc'] = re.sub(r'<br\/>', '\n', str(property_soup.find("p"))).strip('</p>')
           
            """
            # Write each row to the CSV
            writer.writerow([
                property_url,
                property_metadata[property_url]['name'],
                property_metadata[property_url]['cost_text'],
                property_metadata[property_url]['rooms'],
                property_metadata[property_url]['parking'],
                property_metadata[property_url]['desc'],
                property_metadata[property_url]['listingid'],
                property_metadata[property_url]['street'],
                property_metadata[property_url]['suburb'],
                property_metadata[property_url]['postcode'],
                property_metadata[property_url]['propertyType'],
                property_metadata[property_url]['school'],
                property_metadata[property_url]['features'],
                property_metadata[property_url]['loan'],
                property_metadata[property_url]['listingsummary'],
                property_metadata[property_url]['suburbInsights']
            ])
            """
            success_count += 1
            temp_sdf = spark.createDataFrame(property_metadata)
            sdf.union(temp_sdf)

        except AttributeError:
            print(f"Issue with {property_url}")

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")

        # output to example json in data/raw/
    with open('../data/raw/example.json', 'w') as f:
        dump(property_metadata, f)

def convert_to_parquet(filepath: str, output_path: str) -> None:
    """ Function converts a json file into a parquet file

    Parameters:
    filepath (str): the filepath that locates our json data

    output_path (str): the filepath that we will place our new parquet file into

    Returns:
    None
    """
    with open(filepath) as f:
        data = load(f)

    new_data = change_json_format(data)

    # conversion from json -> dataframe -> parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    delete_json_file(filepath)

# function that changes the formatting of the json file
def change_json_format(data: dict) -> dict:
    """ Function grabs the renames the json keys to the words after the last backslash in the url and adds the url as an item

    Parameters:
    data (dict): json dictionary we are changing

    Returns:
    dict: our new json dictionary
    
    """
    new_data = {}
    for i in data.keys():
        new_name = i.rsplit('/', 1)[-1]
        new_data[new_name] = data[i]
        new_data[new_name]["href"] = i
    return new_data

def delete_json_file(filepath: str) -> None:
    """ Function deletes the json file we are converting from

    Parameters:
    filepath (string): filepath to the json file we are deleting

    Returns:
    None
    """
    try:
        os.remove(filepath)
        print(f"File '{filepath}' deleted successfully")
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")

24/09/09 22:40:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [71]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)

from pyarrow import json
import pyarrow.parquet as pq

24/09/09 22:40:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [72]:
def convert_to_parquet(filepath: str, output_path: str) -> None:
    """ Function converts a JSON file into a parquet file """
    with open(filepath) as f:
        data = load(f)

    new_data = change_json_format(data)

    # Conversion from JSON -> DataFrame -> Parquet
    df = pd.DataFrame(new_data)
    df.to_parquet(output_path, engine='pyarrow')

    delete_json_file(filepath)

def change_json_format(data: dict) -> dict:
    """ Function renames JSON keys and adds the URL as an item """
    new_data = {}
    for i in data.keys():
        new_name = i.rsplit('/', 1)[-1]
        new_data[new_name] = data[i]
        new_data[new_name]["href"] = i
    return new_data

def delete_json_file(filepath: str) -> None:
    """ Function deletes the JSON file """
    try:
        os.remove(filepath)
        print(f"File '{filepath}' deleted successfully")
    except FileNotFoundError:
        print(f"File '{filepath}' not found")
    except PermissionError:
        print(f"Permission denied: '{filepath}'")
    except Exception as e:
        print(f"An error occurred: {e}")

In [73]:
#Method 2 : create a new spark df with the values that we have just read in. append it to the global sdf
# sdf was correctly initialised. Had problems with appending to the sdf

# built-in imports
import re
from json import dump
from tqdm import tqdm
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Initialize Spark session
spark = SparkSession.builder.appName("PropertyScraper").getOrCreate()

# constants
BASE_URL = "https://www.domain.com.au"
N_PAGES = range(1, 2)  # Update this to your liking

def start_scrape():
    """Function that scrapes https://www.domain.com.au and outputs the data into a JSON file"""

# This is the one that works
    schema = StructType([
        StructField("url", StringType(), True),
        StructField("name", StringType(), True),
        StructField("cost_text", StringType(), True),
        StructField("rooms", StringType(), True),
        
    ])

    url_links = []
    property_metadata = spark.createDataFrame([], schema)

    # Generate list of URLs to visit
    for page in N_PAGES:
        url = BASE_URL + f"/rent/?excludedeposittaken=1&state=vic&page={page}"
        print(f"Visiting {url}")
        bs_object = BeautifulSoup(urlopen(Request(url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")

        # Find the unordered list (ul) elements which are the results, then
        # find all href (a) tags that are from the base_url website.
        index_links = bs_object.find("ul", {"data-testid": "results"}).findAll(
            "a", href=re.compile(f"{BASE_URL}/*")  # the `*` denotes wildcard any
        )

        for link in index_links:
            # If it's a property address, add it to the list
            if 'address' in link.get('class', []):
                url_links.append(link['href'])

    # For each URL, scrape some basic metadata
    pbar = tqdm(url_links)
    success_count, total_count = 0, 0

    for property_url in pbar:
        try:
            bs_object = BeautifulSoup(urlopen(Request(property_url, headers={'User-Agent': "PostmanRuntime/7.6.0"})), "lxml")
            total_count += 1

            # Get property name
            name = bs_object.find("h1", {"class": "css-164r41r"}).text.strip()

            # Get cost text
            cost_text = bs_object.find("div", {"data-testid": "listing-details__summary-title"}).text.strip()

            # Get rooms
            rooms = bs_object.find("div", {"data-testid": "property-features"}).findAll(
                "span", {"data-testid": "property-features-text-container"}
            )
            rooms = ", ".join([re.findall(r'\d+\s[A-Za-z]+', feature.text)[0] for feature in rooms if 'Bed' in feature.text or 'Bath' in feature.text])

            # Create a row and append it to the DataFrame
            row = [(property_url, name, cost_text, rooms)]
            row_df = spark.createDataFrame(row, schema)
            property_metadata = property_metadata.union(row_df)
            success_count += 1

        except AttributeError:
            print(f"Issue with {property_url}")

        pbar.set_description(f"{(success_count / total_count * 100):.0f}% successful")
        property_metadata.show()

    # Output to JSON file
    property_metadata.write.json('../data/raw/work.json', mode='overwrite')

# Start scraping
start_scrape()


# # Output to example JSON in data/raw/
#     with open('../data/raw/work', 'w') as f:
#         dump(property_metadata, f)


24/09/09 22:40:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Visiting https://www.domain.com.au/rent/?excludedeposittaken=1&state=vic&page=1


100% successful:   5%|▍         | 1/21 [00:01<00:23,  1.18s/it]

+--------------------+--------------------+---------+---------------+
|                 url|                name|cost_text|          rooms|
+--------------------+--------------------+---------+---------------+
|https://www.domai...|2/184 Timor Stree...|     $550|2 Beds, 2 Baths|
+--------------------+--------------------+---------+---------------+



100% successful:  10%|▉         | 2/21 [00:02<00:29,  1.54s/it]

+--------------------+--------------------+---------+---------------+
|                 url|                name|cost_text|          rooms|
+--------------------+--------------------+---------+---------------+
|https://www.domai...|2/184 Timor Stree...|     $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|  $680.00|2 Beds, 2 Baths|
+--------------------+--------------------+---------+---------------+



100% successful:  14%|█▍        | 3/21 [00:04<00:27,  1.55s/it]

+--------------------+--------------------+---------+---------------+
|                 url|                name|cost_text|          rooms|
+--------------------+--------------------+---------+---------------+
|https://www.domai...|2/184 Timor Stree...|     $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|  $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|  $650.00|2 Beds, 2 Baths|
+--------------------+--------------------+---------+---------------+



100% successful:  19%|█▉        | 4/21 [00:06<00:29,  1.72s/it]

+--------------------+--------------------+---------+---------------+
|                 url|                name|cost_text|          rooms|
+--------------------+--------------------+---------+---------------+
|https://www.domai...|2/184 Timor Stree...|     $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|  $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|  $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|   $500pw| 3 Beds, 1 Bath|
+--------------------+--------------------+---------+---------------+



100% successful:  24%|██▍       | 5/21 [00:08<00:29,  1.83s/it]

+--------------------+--------------------+---------+---------------+
|                 url|                name|cost_text|          rooms|
+--------------------+--------------------+---------+---------------+
|https://www.domai...|2/184 Timor Stree...|     $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|  $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|  $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|   $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|  $550 pw|  1 Bed, 1 Bath|
+--------------------+--------------------+---------+---------------+



100% successful:  29%|██▊       | 6/21 [00:10<00:28,  1.87s/it]                 

+--------------------+--------------------+---------+---------------+
|                 url|                name|cost_text|          rooms|
+--------------------+--------------------+---------+---------------+
|https://www.domai...|2/184 Timor Stree...|     $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|  $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|  $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|   $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|  $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|  $650 pw| 2 Beds, 1 Bath|
+--------------------+--------------------+---------+---------------+



100% successful:  33%|███▎      | 7/21 [00:12<00:27,  2.00s/it]                 

+--------------------+--------------------+---------+---------------+
|                 url|                name|cost_text|          rooms|
+--------------------+--------------------+---------+---------------+
|https://www.domai...|2/184 Timor Stree...|     $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|  $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|  $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|   $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|  $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|  $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|     $520|  1 Bed, 1 Bath|
+--------------------+--------------------+---------+---------------+



100% successful:  38%|███▊      | 8/21 [00:15<00:28,  2.17s/it]                 

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
+--------------------+--------------------+-------------+---------------+



100% successful:  43%|████▎     | 9/21 [00:18<00:28,  2.40s/it]                 

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
+--------------------+--------------------+-------------+---------------+



100% successful:  48%|████▊     | 10/21 [00:21<00:28,  2.61s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
+--------------------+----------------

100% successful:  52%|█████▏    | 11/21 [00:24<00:27,  2.75s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  57%|█████▋    | 12/21 [00:27<00:26,  2.98s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  62%|██████▏   | 13/21 [00:31<00:24,  3.11s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  67%|██████▋   | 14/21 [00:35<00:24,  3.49s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  71%|███████▏  | 15/21 [00:40<00:23,  3.93s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  76%|███████▌  | 16/21 [00:45<00:21,  4.32s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  81%|████████  | 17/21 [00:50<00:17,  4.46s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  86%|████████▌ | 18/21 [00:56<00:14,  4.74s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  90%|█████████ | 19/21 [01:01<00:10,  5.03s/it]                

+--------------------+--------------------+-------------+---------------+
|                 url|                name|    cost_text|          rooms|
+--------------------+--------------------+-------------+---------------+
|https://www.domai...|2/184 Timor Stree...|         $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|      $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|      $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|       $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|      $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|      $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|         $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|$450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|         $740|2 Beds, 2 Baths|
|https://www.domai...|105/88 Beaconsfie...|$1,200 weekly|3 Beds, 2 Baths|
|https://www.domai...|206/38 Inkerman 

100% successful:  95%|█████████▌| 20/21 [01:07<00:05,  5.18s/it]                

+--------------------+--------------------+--------------------+---------------+
|                 url|                name|           cost_text|          rooms|
+--------------------+--------------------+--------------------+---------------+
|https://www.domai...|2/184 Timor Stree...|                $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|             $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|             $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|              $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|             $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|             $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|                $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|       $450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|                $740|2 Beds, 2 Baths|
|https://www.domai...|105/88

100% successful: 100%|██████████| 21/21 [01:13<00:00,  3.48s/it]                


+--------------------+--------------------+--------------------+---------------+
|                 url|                name|           cost_text|          rooms|
+--------------------+--------------------+--------------------+---------------+
|https://www.domai...|2/184 Timor Stree...|                $550|2 Beds, 2 Baths|
|https://www.domai...|G03/50 Seymour Gr...|             $680.00|2 Beds, 2 Baths|
|https://www.domai...|G03/129 Douglas P...|             $650.00|2 Beds, 2 Baths|
|https://www.domai...|41 Rainsford Driv...|              $500pw| 3 Beds, 1 Bath|
|https://www.domai...|7/24 Fitzgerald S...|             $550 pw|  1 Bed, 1 Bath|
|https://www.domai...|1/25 Byron Street...|             $650 pw| 2 Beds, 1 Bath|
|https://www.domai...|129/631 Victoria ...|                $520|  1 Bed, 1 Bath|
|https://www.domai...|114/18 Queen Stre...|       $450 per week|  1 Bed, 1 Bath|
|https://www.domai...|418/627 Victoria ...|                $740|2 Beds, 2 Baths|
|https://www.domai...|105/88

                                                                                

In [74]:
import pandas as pd
import os

# Define the folder path containing JSON files
folder_path = '../data/raw/work.json'

# List all files in the directory
json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

# Initialize an empty list to store DataFrames
dataframes = []

# Read each JSON file into a DataFrame
for file in json_files:
    file_path = os.path.join(folder_path, file)
    # Read JSON file
    df = pd.read_json(file_path, lines=True)
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(combined_df.head(12))


                                                  url  \
0   https://www.domain.com.au/17-henderson-street-...   
1   https://www.domain.com.au/206-38-inkerman-stre...   
2   https://www.domain.com.au/1-7-hennessy-street-...   
3   https://www.domain.com.au/1-25-byron-street-el...   
4   https://www.domain.com.au/41-rainsford-drive-n...   
5   https://www.domain.com.au/418-627-victoria-str...   
6   https://www.domain.com.au/6a-and-6b-312-320-mo...   
7   https://www.domain.com.au/613-70-batesford-roa...   
8   https://www.domain.com.au/105-88-beaconsfield-...   
9   https://www.domain.com.au/409-10-burnley-stree...   
10  https://www.domain.com.au/6-56-stockade-avenue...   
11  https://www.domain.com.au/114-18-queen-street-...   

                                                 name  \
0        17 Henderson Street, Brunswick West VIC 3055   
1           206/38 Inkerman Street, St Kilda VIC 3182   
2      1/7 Hennessy Street Street, Chadstone VIC 3148   
3                  1/25 Byron 