In [104]:
import json
import os
import re
import pandas as pd
import geopandas as gpd

In [105]:
# Relative path of the directory that is scanned for scraped .json files
DATA_DIRECTORY = "../scrape_data/"

In [106]:
# os.listdir returns entries in arbitrary order; sort the names so the row
# order of the combined dataset is the same on every run and machine.
data_files = sorted(file for file in os.listdir(DATA_DIRECTORY) if file.endswith(".json"))

In [107]:
# Accumulator for the flattened listing dicts (one entry per successfully parsed row)
data_flat = []

In [108]:
def extract_number(text):
    """Return the bond amount quoted in ``text`` as an int, or None.

    Matches the literal "Bond", one whitespace character, a dollar sign and a
    run of digits that may be grouped with thousands separators, e.g.
    "Bond $3259" or "Bond $3,259".

    Args:
        text: string to search.

    Returns:
        The amount as an int, or None when ``text`` contains no bond amount.
    """
    match = re.search(r'Bond\s\$(\d+(?:,\d{3}(?!\d))*)', text)
    if match is None:
        return None
    # Strip thousands separators so "3,259" is read as 3259 rather than 3
    return int(match.group(1).replace(",", ""))

In [109]:
def flatten_row(row):
    """Flatten one scraped listing into a single-level dict.

    Copies the scalar fields (url, price, address, property_type, latitude,
    longitude), turns every two-token "<value> <name>" entry of
    ``row["features"]`` into a ``name -> value`` pair, and stores the bond
    amount found in ``row["summary"]`` under ``"bond"`` (None when no summary
    item mentions a bond).

    Args:
        row: dict for one listing; must provide every key read below.

    Returns:
        The flattened dict, or None when the row cannot be processed. In that
        case the exception is printed and the caller is expected to skip the row.
    """
    try:
        row_flat = {
            key: row[key]
            for key in ("url", "price", "address", "property_type", "latitude", "longitude")
        }

        for feature in row["features"]:
            if len(feature.split()) != 2:
                continue
            tokens = feature.replace("- ", "").split()
            # Removing "- " can leave a single token (e.g. "- Parking"); skip
            # that feature rather than let an unpacking error drop the listing.
            if len(tokens) != 2:
                continue
            feature_value, feature_name = tokens
            row_flat[feature_name] = feature_value

        # Take the first summary item that mentions a bond so that later,
        # unrelated items cannot overwrite the value with None.
        row_flat["bond"] = next(
            (bond for bond in map(extract_number, row["summary"]) if bond is not None),
            None,
        )

        return row_flat
    except Exception as e:
        # Best-effort parsing: report the problem and let the caller skip the row
        print(e)
        return None

In [110]:
# Start from an empty list so re-running this cell does not append duplicate rows
data_flat = []

for file in data_files:
    # Read explicitly as UTF-8 instead of relying on the platform default encoding
    # (assumes the scraper wrote UTF-8 or ASCII-escaped JSON; confirm against the scraper)
    with open(os.path.join(DATA_DIRECTORY, file), encoding="utf-8") as f:
        content = json.load(f)

    for row in content:
        flat_row = flatten_row(row)
        # flatten_row returns None for rows it could not parse; skip those
        if flat_row is not None:
            data_flat.append(flat_row)

print(len(data_flat))

'url'
9828


In [111]:
# One DataFrame row per flattened listing; feature names become columns
df = pd.DataFrame(data_flat)
df.head()

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,Bath,Bed
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4,3.0,2,9125.0,,
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.791251,145.175649,4,2.0,−,3259.0,,
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.797232,145.181264,5,2.0,2,5649.0,,
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.792402,145.174323,3,,5,3585.0,1.0,
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.781012,145.180705,3,2.0,2,2955.0,,


In [112]:
COUNT_COLUMNS = ("Beds", "Baths", "Parking")

# The scrape uses the Unicode minus sign "−" (U+2212) where a count is absent; treat it as zero
for column in COUNT_COLUMNS:
    df[column] = df[column].replace(["−"], 0)

# Some listings carry the singular label ("Bed"/"Bath") instead of the plural one.
# Fold those values into the plural column, then drop the singular column.
# The membership check keeps the cell re-runnable after the column is gone.
for plural, singular in (("Beds", "Bed"), ("Baths", "Bath")):
    if singular in df.columns:
        df[plural] = df[plural].fillna(df[singular])
        df = df.drop(columns=[singular])

# Anything that still is not a number becomes NaN
for column in COUNT_COLUMNS:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [113]:
# Capture the first amount that follows a "$" in the price text; thousands
# separators and a two-digit decimal part are both optional.
price_pattern = r'\$(\d+(?:,\d+)*(?:\.\d{2})?)'
df['extracted_price'] = df['price'].str.extract(price_pattern, expand=False)

In [114]:
def extract_all_numbers(text):
    """Return the first number found in ``text`` as a string, or None.

    A number is a run of digits, optionally grouped with thousands separators
    and optionally followed by a decimal part (e.g. "750", "1,400", "12.50").
    Despite the function name, only the first match is returned.

    Args:
        text: value to search; anything that is not a string (None, NaN)
            yields None instead of raising.

    Returns:
        The matched text, commas included, or None when there is no number.
    """
    if not isinstance(text, str):
        return None
    # ",ddd" groups are only accepted when exactly three digits follow the comma
    match = re.search(r'\d+(?:,\d{3}(?!\d))*(?:\.\d+)?', text)
    return match.group(0) if match else None

# Where no "$" amount was found, fall back to the first bare number in the price text
fallback_price = df['price'].apply(extract_all_numbers)
df['extracted_price'] = df['extracted_price'].fillna(fallback_price)

In [115]:
# Drop thousands separators, then cast the remaining text to float
price_without_commas = df['extracted_price'].str.replace(',', '')
df['extracted_price'] = price_without_commas.astype(float)

In [116]:
# Show the cleaned listings (the rendered frame is truncated by pandas)
df

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,extracted_price
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4.0,3.0,2.0,9125.0,1400.0
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.791251,145.175649,4.0,2.0,0.0,3259.0,750.0
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.797232,145.181264,5.0,2.0,2.0,5649.0,1300.0
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.792402,145.174323,3.0,1.0,5.0,3585.0,825.0
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.781012,145.180705,3.0,2.0,2.0,2955.0,680.0
...,...,...,...,...,...,...,...,...,...,...,...
9823,https://www.domain.com.au/brighton-vic-3186-96...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Apartment / Unit / Flat,-37.904400,144.999740,3.0,2.0,2.0,,
9824,https://www.domain.com.au/brighton-vic-3186-10...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Townhouse,-37.904400,144.999740,3.0,2.0,2.0,,
9825,https://www.domain.com.au/brighton-vic-3186-92...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Apartment / Unit / Flat,-37.904400,144.999740,3.0,2.0,2.0,,
9826,https://www.domain.com.au/brighton-vic-3186-91...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Apartment / Unit / Flat,-37.904400,144.999740,2.0,2.0,1.0,,


In [117]:
# Load the Victorian boundary polygons; the preview shows one row per SA2 area
# together with its SA3/SA4/GCC/state attributes.
victoria_gdf = gpd.read_file('../data/landing/boundaries/Victoria/vic_dist_boundaries.shp')
victoria_gdf.head()

Unnamed: 0,sa2_code,sa2_name,chg_flag,chg_lbl,sa3_code,sa3_name,sa4_code,sa4_name,gcc_code,gcc_name,ste_code,ste_name,aus_code,aus_name,areasqkm,loci_uri,geometry
0,201011001,Alfredton,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,52.7109,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.78281 -37.56667, 143.75557 -37.5..."
1,201011002,Ballarat,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,12.3787,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5..."
2,201011005,Buninyong,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,51.5855,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."
3,201011006,Delacombe,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,34.1607,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.75049 -37.5912, 143.75044 -37.59..."
4,201011007,Smythes Creek,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,104.7274,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.73295 -37.62334, 143.73262 -37.6..."


In [118]:
# Turn each listing's longitude/latitude into a point, tag the points with the
# boundary layer's CRS, and left-join the boundary attributes of the polygon
# each point falls in.
listing_points = gpd.points_from_xy(df['longitude'], df['latitude'])
gdf = gpd.GeoDataFrame(df, geometry=listing_points, crs=victoria_gdf.crs)
gdf = gpd.sjoin(gdf, victoria_gdf, how='left')
gdf.head()

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,...,sa4_code,sa4_name,gcc_code,gcc_name,ste_code,ste_name,aus_code,aus_name,areasqkm,loci_uri
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4.0,3.0,2.0,9125.0,...,211,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.791251,145.175649,4.0,2.0,0.0,3259.0,...,211,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.797232,145.181264,5.0,2.0,2.0,5649.0,...,211,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.792402,145.174323,3.0,1.0,5.0,3585.0,...,211,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.781012,145.180705,3.0,2.0,2.0,2955.0,...,211,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...


In [119]:
# Keep a reprojected copy of each point in EPSG:3112 alongside the original geometry.
# NOTE(review): EPSG:3112 should be GDA94 / Geoscience Australia Lambert (metres) - confirm.
gdf["geometry_proj"] = gdf["geometry"].to_crs('EPSG:3112')
gdf["geometry_proj"]

0       POINT (989259.417 -4291898.087)
1        POINT (988607.824 -4293742.53)
2       POINT (989043.875 -4294451.666)
3       POINT (988479.464 -4293860.035)
4       POINT (989155.458 -4292643.926)
                     ...               
9823    POINT (971982.558 -4304953.329)
9824    POINT (971982.558 -4304953.329)
9825    POINT (971982.558 -4304953.329)
9826    POINT (971982.558 -4304953.329)
9827    POINT (971982.558 -4304953.329)
Name: geometry_proj, Length: 9828, dtype: geometry

In [120]:
# Show the joined frame, which now also carries the geometry_proj column
gdf

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,...,sa4_name,gcc_code,gcc_name,ste_code,ste_name,aus_code,aus_name,areasqkm,loci_uri,geometry_proj
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4.0,3.0,2.0,9125.0,...,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (989259.417 -4291898.087)
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.791251,145.175649,4.0,2.0,0.0,3259.0,...,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (988607.824 -4293742.53)
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.797232,145.181264,5.0,2.0,2.0,5649.0,...,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (989043.875 -4294451.666)
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.792402,145.174323,3.0,1.0,5.0,3585.0,...,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (988479.464 -4293860.035)
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.781012,145.180705,3.0,2.0,2.0,2955.0,...,Melbourne - Outer East,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,20.8028,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (989155.458 -4292643.926)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9823,https://www.domain.com.au/brighton-vic-3186-96...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Apartment / Unit / Flat,-37.904400,144.999740,3.0,2.0,2.0,,...,Melbourne - Inner South,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,8.2004,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (971982.558 -4304953.329)
9824,https://www.domain.com.au/brighton-vic-3186-10...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Townhouse,-37.904400,144.999740,3.0,2.0,2.0,,...,Melbourne - Inner South,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,8.2004,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (971982.558 -4304953.329)
9825,https://www.domain.com.au/brighton-vic-3186-92...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Apartment / Unit / Flat,-37.904400,144.999740,3.0,2.0,2.0,,...,Melbourne - Inner South,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,8.2004,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (971982.558 -4304953.329)
9826,https://www.domain.com.au/brighton-vic-3186-91...,Fine Furnished Residences @ Westprecinct,Brighton VIC 3186,Apartment / Unit / Flat,-37.904400,144.999740,2.0,2.0,1.0,,...,Melbourne - Inner South,2GMEL,Greater Melbourne,2,Victoria,AUS,Australia,8.2004,http://linked.data.gov.au/dataset/asgsed3/SA2/...,POINT (971982.558 -4304953.329)


In [121]:
# Save the joined listings as CSV (the row index is written as the first column by default)
gdf.to_csv("../data/raw/domain.csv")