In [1]:
import json
from pathlib import Path

# Load the reference list of Sydney suburb names.
# JSON text is UTF-8, so the encoding is stated explicitly rather than
# inherited from the platform's locale default.
with open('data/sydney_burbs.json', 'r', encoding='utf-8') as f:
    sydney_data = json.load(f)

# Normalise every name to upper case with no surrounding whitespace -- the same
# normalisation applied to GeoJSON suburb names when they are matched later on,
# so membership tests against this set compare like with like.
sydney_suburbs = {suburb.upper().strip() for suburb in sydney_data['suburbs']}
print(f"Loaded {len(sydney_suburbs)} Sydney suburbs")

# Display a small sample. A set has no defined order, so which ten names are
# shown is arbitrary.
print("\nFirst 10 Sydney suburbs:")
for suburb in list(sydney_suburbs)[:10]:
    print(f"  {suburb}")


Loaded 660 Sydney suburbs

First 10 Sydney suburbs:
  HILLSDALE
  PADDINGTON
  DRUMMOYNE
  WOODBINE
  COMO
  SEAFORTH
  FRENCHS FOREST
  ELIZABETH HILLS
  SPRING FARM
  GYMEA


In [2]:
# Load the full NSW suburb-boundary GeoJSON file into memory.
print("Loading NSW GeoJSON file...")
# GeoJSON is UTF-8 encoded JSON; pass the encoding explicitly so that decoding
# does not depend on the platform's locale default.
with open('data/nsw-administrative-boundaries-theme-suburb.geojson', 'r', encoding='utf-8') as f:
    nsw_geojson = json.load(f)

# Quick sanity summary of what was loaded.
print(f"Total features in NSW GeoJSON: {len(nsw_geojson['features'])}")
print(f"GeoJSON type: {nsw_geojson['type']}")


Loading NSW GeoJSON file...
Total features in NSW GeoJSON: 5188
GeoJSON type: FeatureCollection


In [3]:
# Look at the first feature's properties to work out which key carries the
# suburb name. Nothing is printed when the collection has no features.
if nsw_geojson['features']:
    sample_feature = nsw_geojson['features'][0]
    props = sample_feature['properties']

    print("Sample feature properties:")
    print(json.dumps(props, indent=2))

    # Any key whose name mentions "suburb" or "name" (case-insensitively) is
    # listed as a candidate, together with its value.
    print("\nPossible suburb name fields:")
    for field_name, field_value in props.items():
        lowered = field_name.lower()
        if any(token in lowered for token in ('suburb', 'name')):
            print(f"  {field_name}: {field_value}")


Sample feature properties:
{
  "lganame": [
    "CENTRAL COAST"
  ],
  "councilnam": [
    "CENTRAL COAST COUNCIL"
  ],
  "rid": 25,
  "cadid": 108029559,
  "createdate": "1995-07-28T12:00:00+10:00",
  "modifieddate": "2015-01-21T22:11:22+11:00",
  "suburbname": "Box Head",
  "postcode": "2257",
  "state": 2,
  "startdate": "2015-01-21T22:12:12+11:00",
  "enddate": "3000-01-01T12:00:00+11:00",
  "lastupdate": "2015-01-21T22:12:31.110000+11:00",
  "msoid": 25,
  "centroidid": null,
  "shapeuuid": "94228ccc-c062-3839-ae8d-b6aa02e4e6a5",
  "changetype": "M",
  "processstate": null,
  "urbanity": "U",
  "shape_length": 24493.945040378658,
  "shape_area": 21615301.238526534,
  "geo_point_2d": {
    "lon": 151.3268987955763,
    "lat": -33.54572260706558
  },
  "lga_name": "Central Coast",
  "lga_suburb": "Central Coast-Box Head",
  "suburb": "BOX HEAD"
}

Possible suburb name fields:
  lganame: ['CENTRAL COAST']
  suburbname: Box Head
  lga_name: Central Coast
  lga_suburb: Central Coast-Box Head
  suburb: BOX HEAD

In [4]:
# Filter features to only include Sydney suburbs.
# The sample feature shows the name under 'suburbname'; 'suburb' and 'SUBURB'
# are accepted as fallbacks, checked in that order.
suburb_name_keys = ('suburbname', 'suburb', 'SUBURB')

sydney_features = []        # features whose normalised name is in sydney_suburbs
not_found_suburbs = set()   # normalised names seen in the GeoJSON but not in sydney_suburbs
found_suburbs = set()       # normalised names that matched at least one feature

for feature in nsw_geojson['features']:
    props = feature['properties']

    # The first candidate key that is present decides the name, even if its
    # value turns out to be empty (in which case the feature is skipped).
    suburb_name = next((props[key] for key in suburb_name_keys if key in props), None)
    if not suburb_name:
        continue

    # str() guards against non-string property values, matching the coercion
    # used when the suburb name is normalised for export.
    suburb_upper = str(suburb_name).upper().strip()
    if suburb_upper in sydney_suburbs:
        sydney_features.append(feature)
        found_suburbs.add(suburb_upper)
    else:
        not_found_suburbs.add(suburb_upper)

# A suburb name can match several features, so the feature count may exceed
# the number of unique names.
print(f"Found {len(sydney_features)} Sydney suburb features")
print(f"Unique Sydney suburbs found: {len(found_suburbs)}")
print("\nFirst 10 found suburbs:")
# Set order is arbitrary; this is only a sample.
for suburb in list(found_suburbs)[:10]:
    print(f"  {suburb}")


Found 814 Sydney suburb features
Unique Sydney suburbs found: 660

First 10 found suburbs:
  HILLSDALE
  PADDINGTON
  DRUMMOYNE
  WOODBINE
  COMO
  SEAFORTH
  FRENCHS FOREST
  ELIZABETH HILLS
  SPRING FARM
  GYMEA


In [5]:
# Report reference suburbs that matched no GeoJSON feature.
missing_suburbs = sydney_suburbs.difference(found_suburbs)
if not missing_suburbs:
    print("\nAll Sydney suburbs from our list were found!")
else:
    print(f"\n{len(missing_suburbs)} Sydney suburbs from our list were not found in the GeoJSON:")
    print("First 20 missing suburbs:")
    # Show at most twenty names, one per line, indented by two spaces.
    print("\n".join(f"  {name}" for name in list(missing_suburbs)[:20]))



All Sydney suburbs from our list were found!


In [6]:
# Normalize suburb property names for consistent matching:
# every exported feature carries an upper-case, whitespace-trimmed 'suburb'
# property, alongside all of its original properties.
suburb_name_keys = ('suburbname', 'suburb', 'SUBURB')

normalized_features = []
for feature in sydney_features:
    props = feature['properties']

    # The first candidate key that is present decides the name.
    suburb_name = next((props[key] for key in suburb_name_keys if key in props), None)
    if not suburb_name:
        # No usable name: the feature is left out of the export.
        continue

    # Build a new feature dict with a new properties dict instead of mutating
    # the loaded data. Only the two dicts that change are copied; the geometry
    # and other values are shared with the source feature (they are only read
    # here), which avoids re-serialising every polygon just to copy it.
    normalized_feature = {
        **feature,
        'properties': {**props, 'suburb': str(suburb_name).upper().strip()},
    }
    normalized_features.append(normalized_feature)

# Create filtered GeoJSON with normalized properties
sydney_geojson = {
    "type": "FeatureCollection",
    "features": normalized_features,
}

# Save the filtered GeoJSON (encoding stated explicitly; GeoJSON is UTF-8).
output_path = 'data/sydney_suburbs.geojson'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(sydney_geojson, f)

print(f"Saved filtered GeoJSON to {output_path}")
print(f"Total features: {len(sydney_geojson['features'])}")
print("\nSample normalized feature properties:")
if normalized_features:
    sample_props = normalized_features[0]['properties']
    print(f"  suburb: {sample_props.get('suburb')}")
    print(f"  suburbname: {sample_props.get('suburbname')}")


Saved filtered GeoJSON to data/sydney_suburbs.geojson
Total features: 814

Sample normalized feature properties:
  suburb: COASTERS RETREAT
  suburbname: Coasters Retreat
