# Data filter (NSW -> Sydney only)

#! So we have data for 2005-2025. We want to filter this down to just Sydney. We could do so with a suburb / district code list, or preferably with a numeric range (if Sydney fell between, say, 1000 and 3000).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn's white grid theme and a wide default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# OUTPUT_DIR holds the per-year parquet files ("<year>.parquet") that the
# loading cell reads; YEARS is the set of years it iterates over.
DATA_DIR = "../data"
OUTPUT_DIR = "../data/parquet"
YEARS = range(2005, 2026)  # 2005 through 2025 inclusive

Libraries imported successfully!


In [None]:
# Load one parquet file per year from OUTPUT_DIR, tag each frame with its year,
# and stack everything into a single frame (df_all).
print("Loading parquet files...")

all_data = []
for year in YEARS:
    parquet_file = f"{OUTPUT_DIR}/{year}.parquet"

    # Missing years are reported and skipped rather than treated as an error.
    if not Path(parquet_file).exists():
        print(f"  Skipped {year}: file not found")
        continue

    try:
        df = pd.read_parquet(parquet_file, engine='fastparquet')
    except Exception as exc:
        # Only genuine errors trigger the fallback (a bare `except:` would also
        # swallow KeyboardInterrupt). Say why, then let pandas pick its default engine.
        print(f"  fastparquet failed for {year} ({type(exc).__name__}); retrying with default engine")
        df = pd.read_parquet(parquet_file)

    df['year'] = year
    all_data.append(df)
    print(f"  Loaded {year}: {len(df):,} records")

# NOTE(review): the saved output of this cell shows 2019 with ~9.8k records versus
# roughly 170k-270k for every other year -- the 2019 file looks incomplete; verify the source.

# Combine all years
if all_data:
    df_all = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal records: {len(df_all):,}")
    print(f"Date range: {df_all['settlement_date'].min()} to {df_all['settlement_date'].max()}")
    print(f"\nDataFrame shape: {df_all.shape}")
    print(f"\nColumns: {list(df_all.columns)}")
else:
    # df_all is NOT defined in this case, so every later cell will fail.
    print("No data files found!")

Loading parquet files...
  Loaded 2005: 181,502 records
  Loaded 2006: 182,983 records
  Loaded 2007: 209,360 records
  Loaded 2008: 191,223 records
  Loaded 2009: 200,013 records
  Loaded 2010: 181,513 records
  Loaded 2011: 172,995 records
  Loaded 2012: 172,434 records
  Loaded 2013: 197,363 records
  Loaded 2014: 227,113 records
  Loaded 2015: 224,136 records
  Loaded 2016: 268,475 records
  Loaded 2017: 218,506 records
  Loaded 2018: 211,521 records
  Loaded 2019: 9,835 records
  Loaded 2020: 191,854 records
  Loaded 2021: 237,483 records
  Loaded 2022: 195,378 records
  Loaded 2023: 184,866 records
  Loaded 2024: 214,931 records
  Loaded 2025: 207,571 records

Total records: 4,081,055
Date range: 2005-01-01 00:00:00 to 2025-11-20 00:00:00

DataFrame shape: (4081055, 27)

Columns: ['record_type', 'district_code', 'property_id', 'sale_counter', 'download_timestamp', 'property_name', 'property_unit_number', 'property_house_number', 'property_street_name', 'property_locality', 'prope

In [None]:
# checking what we've got - districts, suburbs, postcodes
# This runs on df_all, i.e. BEFORE the Sydney filter, so the heading says so.
print("Summary of full dataset (before Sydney filter):")

all_district_codes = df_all['district_code'].unique()
all_suburbs = df_all['property_locality'].unique()
all_postcodes = df_all['property_post_code'].unique()
all_nature_of_properties = df_all['nature_of_property'].unique()

# For each field: how many distinct values there are, plus the first few as a sample.
overview = [
    ("district codes", all_district_codes),
    ("suburbs", all_suburbs),
    ("postcodes", all_postcodes),
    ("nature of properties", all_nature_of_properties),
]
for label, values in overview:
    print(f"Total unique {label}: {len(values)}")
    print(values[:10])

Summary of Sydney data:
Total unique district codes: 181
Total unique suburbs: 5224
Total unique postcodes: 666
Total unique nature of properties: 3
Total unique districts: 181
['187' '206' '575' '103' '207' '081' '224' '014' '171' '216']
Total unique suburbs: 5224
['GUNNEDAH' 'MARRICKVILLE' 'WAGGA WAGGA' 'KEIRAVILLE' 'SOUTH COOGEE'
 'ANNANGROVE' 'PENRITH' 'BATEAU BAY' 'TWEED HEADS' 'BLACKHEATH']
Total unique postcodes: 666
['2386' '2204' '2650' '2500' '2034' '2156' '2750' '2261' '2485' '2785']
Total unique nature of properties: 3
['3' 'R' 'V']


In [None]:
#! Okay, so with postcodes, Sydney has a range: 2000 to 2234 covers Sydney's metro area.
#! Unfortunately, Greater Sydney's postcodes are broken up between the 2000s, the 2100s,
#! and parts of the 2500s and 2700s.

#! I chose to just grab a list of all Sydney suburbs (in ./data/sydney_burbs.json) and use that to filter.

import json

# JSON format: { "suburbs": [<suburb_name>, ...] }
# NOTE(review): this path is relative to the working directory and starts with "./data",
# whereas DATA_DIR in the config cell is "../data" -- confirm this is the intended location.
with open('./data/sydney_burbs.json', encoding='utf-8') as suburbs_file:
    static_data = json.load(suburbs_file)
sydney_suburbs = static_data['suburbs']

# lowercase for matching
sydney_suburbs = [sub.lower() for sub in sydney_suburbs]

print(f"Total unique suburbs: {len(sydney_suburbs)}")
print(sydney_suburbs[0:5])
print()

# Keep rows whose locality (case-insensitively) is a Sydney suburb AND whose
# nature_of_property is 'R' (residential). Both conditions are applied as one
# mask so only the final result is copied, and df_sydney is an independent frame.
is_sydney_suburb = df_all['property_locality'].str.lower().isin(sydney_suburbs)
is_residential = df_all['nature_of_property'] == 'R'
df_sydney = df_all[is_sydney_suburb & is_residential].copy()

print("\nFiltered Sydney data summary:")
print(f"Total records: {len(df_sydney):,}")
print(f"Percentage of original data: {len(df_sydney)/len(df_all)*100:.2f}%")
print(f"Date range: {df_sydney['settlement_date'].min()} to {df_sydney['settlement_date'].max()}")
print(f"\nUnique Sydney suburbs in filtered data: {df_sydney['property_locality'].nunique()}")
print(f"Unique postcodes: {df_sydney['property_post_code'].nunique()}")
print(f"Unique district codes: {df_sydney['district_code'].nunique()}")

print("\nSample of filtered Sydney data:")
print(df_sydney.head())



Total unique suburbs: 660
['abbotsbury', 'abbotsford', 'acacia gardens', 'agnes banks', 'airds']


Filtered Sydney data summary:
Total records: 1,932,053
Percentage of original data: 47.34%
Date range: 2005-01-01 00:00:00 to 2025-11-20 00:00:00

Unique Sydney suburbs in filtered data: 652
Unique postcodes: 248
Unique district codes: 62

Sample of filtered Sydney data:
   record_type district_code property_id sale_counter download_timestamp  \
5            B           207     2004828           56     20050216 14:07   
6            B           081      595321           26     20050216 13:58   
12           B           087      766674           31     20050301 09:35   
14           B           144     1589880           37     20050216 14:04   
15           B           210     2103704           77     20050201 15:04   

   property_name property_unit_number property_house_number  \
5                                   3                   165   
6                                             

In [None]:
import os

# Make sure the Sydney output folder exists before anything is written into it.
sydney_output_dir = "../data/sydney"
os.makedirs(sydney_output_dir, exist_ok=True)

# Persist the filtered Sydney frame as a single parquet file (row index not stored).
output_file = f"{sydney_output_dir}/full.parquet"
df_sydney.to_parquet(output_file, engine='fastparquet', index=False)

# Report where the file went and how large it is on disk, in MiB.
size_mb = os.path.getsize(output_file) / (1024**2)
print(
    f"\nSydney data saved to: {output_file}",
    f"File size: {size_mb:.2f} MB",
    sep="\n",
)



Sydney data saved to: ../data/sydney/full.parquet
File size: 216.34 MB


In [None]:
# figuring out how to split houses vs units
print("="*60)
print("INSPECTING DATA FOR HOUSE/UNIT SPLIT")
print("="*60)

# A field counts as "present" only when it is neither null nor an empty string.
# Empty strings are not null, so notna() on its own counts them as present --
# the masks below are computed once and reused for every count and sample.
unit_number = df_sydney['property_unit_number']
strata_lot = df_sydney['strata_lot_number']
has_unit_num = unit_number.notna() & (unit_number != '')
has_strata = strata_lot.notna() & (strata_lot != '')

print("\n1. Property Unit Number Analysis:")
print(f"   Total records: {len(df_sydney):,}")
print(f"   Records with unit number (non-null, non-empty): {has_unit_num.sum():,}")
print(f"   Records with empty unit number: {(~has_unit_num).sum():,}")

# Sample of records with unit numbers
print("\n   Sample records WITH unit numbers:")
units_sample = df_sydney[has_unit_num].head(3)
for idx, row in units_sample.iterrows():
    print(f"     - Unit: {row['property_unit_number']}, Street: {row['property_street_name']}, Suburb: {row['property_locality']}")

# Sample of records without unit numbers
print("\n   Sample records WITHOUT unit numbers:")
houses_sample = df_sydney[~has_unit_num].head(3)
for idx, row in houses_sample.iterrows():
    print(f"     - House: {row['property_house_number']}, Street: {row['property_street_name']}, Suburb: {row['property_locality']}")

# Check strata_lot_number column
print("\n2. Strata Lot Number Analysis:")
print(f"   Records with strata lot number (non-null, non-empty): {has_strata.sum():,}")
print(f"   Records without strata lot number: {(~has_strata).sum():,}")

# Check primary_purpose column
print("\n3. Primary Purpose Analysis:")
print(df_sydney['primary_purpose'].value_counts())

# Check if there's overlap between unit_number and strata_lot_number
print("\n4. Overlap Analysis:")
print(f"   Records with unit number: {has_unit_num.sum():,}")
print(f"   Records with strata lot: {has_strata.sum():,}")
print(f"   Records with both: {(has_unit_num & has_strata).sum():,}")
print(f"   Records with either: {(has_unit_num | has_strata).sum():,}")

# Define split logic: Units have unit_number OR strata_lot_number, Houses have neither
print("\n5. Proposed Split Logic:")
units_mask = has_unit_num | has_strata
houses_mask = ~units_mask
print(f"   Units (has unit_number OR strata_lot_number): {units_mask.sum():,}")
print(f"   Houses (has neither): {houses_mask.sum():,}")
print(f"   Total check: {units_mask.sum() + houses_mask.sum():,} (should equal {len(df_sydney):,})")


INSPECTING DATA FOR HOUSE/UNIT SPLIT

1. Property Unit Number Analysis:
   Total records: 1,932,053
   Records with unit number (non-null, non-empty): 1,932,053
   Records with empty unit number: 988,270

   Sample records WITH unit numbers:
     - Unit: 3, Street: MALABAR RD, Suburb: SOUTH COOGEE
     - Unit: 6, Street: GLANDORE ST, Suburb: WOOLOOWARE
     - Unit: 415, Street: JAQUES AVE, Suburb: BONDI BEACH

   Sample records WITHOUT unit numbers:
     - House: 16, Street: HILL CLIMB DR, Suburb: ANNANGROVE
     - House: 7, Street: SUPERBA PDE, Suburb: MOSMAN
     - House: 14, Street: WARREN RD, Suburb: EDGECLIFF

2. Strata Lot Number Analysis:
   Records with strata lot number (non-null, non-empty): 1,932,053
   Records without strata lot number: 878,029

3. Primary Purpose Analysis:
primary_purpose
RESIDENCE    1932053
Name: count, dtype: int64

4. Overlap Analysis:
   Records with unit number: 943,783
   Records with strata lot: 1,054,024
   Records with both: 939,632
   Records wi

In [None]:
# splitting houses and units
import os


def _save_parquet(frame, path, label):
    """Write `frame` to `path` with fastparquet, then print its row count and size on disk."""
    frame.to_parquet(path, engine='fastparquet', index=False)
    print(f"\nSaved {label} data to: {path}")
    print(f"  Records: {len(frame):,}")
    print(f"  File size: {os.path.getsize(path) / (1024**2):.2f} MB")


print("="*60)
print("SPLITTING HOUSES AND UNITS")
print("="*60)

# A field is "present" when it is neither null nor an empty string.
# The test is done on the raw columns: casting to str first would turn
# None / pd.NA into the non-empty strings 'None' / '<NA>' and misclassify
# those rows as units.
unit_num_series = df_sydney['property_unit_number']
strata_series = df_sydney['strata_lot_number']

# units have unit_number OR strata_lot_number; houses have neither
has_unit_num = unit_num_series.notna() & (unit_num_series != '')
has_strata = strata_series.notna() & (strata_series != '')
units_mask = has_unit_num | has_strata
houses_mask = ~units_mask

full_units = df_sydney[units_mask].copy()
full_houses = df_sydney[houses_mask].copy()

print("\nSplit results:")
print(f"  Units: {len(full_units):,} records ({len(full_units)/len(df_sydney)*100:.2f}%)")
print(f"  Houses: {len(full_houses):,} records ({len(full_houses)/len(df_sydney)*100:.2f}%)")
print(f"  Total: {len(full_units) + len(full_houses):,} records (should equal {len(df_sydney):,})")

# The two masks are complements, so this is a sanity check on the split itself.
if len(full_units) + len(full_houses) != len(df_sydney):
    print("\n⚠️  WARNING: Split doesn't match total records!")
else:
    print("\n✓ Split verification passed!")

print("\nSample unit record:")
if len(full_units) > 0:
    sample_unit = full_units.iloc[0]
    print(f"  Unit: {sample_unit['property_unit_number']}, Strata: {sample_unit['strata_lot_number']}")
    print(f"  Address: {sample_unit['property_house_number']} {sample_unit['property_street_name']}, {sample_unit['property_locality']}")

print("\nSample house record:")
if len(full_houses) > 0:
    sample_house = full_houses.iloc[0]
    print(f"  Unit: {sample_house['property_unit_number']}, Strata: {sample_house['strata_lot_number']}")
    print(f"  Address: {sample_house['property_house_number']} {sample_house['property_street_name']}, {sample_house['property_locality']}")

# Write each half next to the combined Sydney file.
sydney_output_dir = "../data/sydney"
os.makedirs(sydney_output_dir, exist_ok=True)

units_file = f"{sydney_output_dir}/full_units.parquet"
_save_parquet(full_units, units_file, "units")

houses_file = f"{sydney_output_dir}/full_houses.parquet"
_save_parquet(full_houses, houses_file, "houses")

print("\n" + "="*60)
print("DATA SPLIT COMPLETE")
print("="*60)
print("\nFiles created:")
print(f"  1. {units_file}")
print(f"  2. {houses_file}")
print(f"\nOriginal file: {sydney_output_dir}/full.parquet")


SPLITTING HOUSES AND UNITS

Split results:
  Units: 1,058,175 records (54.77%)
  Houses: 873,878 records (45.23%)
  Total: 1,932,053 records (should equal 1,932,053)

 Split verification passed!

Sample unit record:
  Unit: 3, Strata: 3
  Address: 165 MALABAR RD, SOUTH COOGEE

Sample house record:
  Unit: , Strata: 
  Address: 16 HILL CLIMB DR, ANNANGROVE

 Saved units data to: ../data/sydney/full_units.parquet
  Records: 1,058,175
  File size: 103.28 MB

 Saved houses data to: ../data/sydney/full_houses.parquet
  Records: 873,878
  File size: 102.01 MB

DATA SPLIT COMPLETE

Files created:
  1. ../data/sydney/full_units.parquet
  2. ../data/sydney/full_houses.parquet

Original file: ../data/sydney/full.parquet
