In [1]:
# Debug: Match GeoJSON suburbs with Database suburbs
# This notebook helps identify mismatches between GeoJSON suburb names
# and database suburb names that cause missing suburbs in the frontend map

import json
import sqlite3
from pathlib import Path
from collections import defaultdict
import difflib

print("="*80)
print("GEOJSON vs DATABASE SUBURB MATCHING DEBUG")
print("="*80)


GEOJSON vs DATABASE SUBURB MATCHING DEBUG


In [2]:
# Load GeoJSON file
geojson_path = Path('data/sydney_suburbs.geojson')
print(f"Loading GeoJSON from: {geojson_path.absolute()}")

with open(geojson_path, 'r') as f:
    geojson_data = json.load(f)

# Extract all suburb names from GeoJSON
geojson_suburbs = set()
geojson_suburb_details = {}  # Store original names for debugging

for feature in geojson_data['features']:
    props = feature['properties']
    
    # Try to get suburb name (same logic as frontend getSuburbName)
    suburb_name = None
    if 'suburb' in props:
        suburb_name = props['suburb']
    elif 'suburbname' in props:
        suburb_name = props['suburbname']
    elif 'SUBURB' in props:
        suburb_name = props['SUBURB']
    
    if suburb_name:
        suburb_upper = str(suburb_name).strip().upper()
        geojson_suburbs.add(suburb_upper)
        geojson_suburb_details[suburb_upper] = {
            'original': suburb_name,
            'suburb_prop': props.get('suburb'),
            'suburbname_prop': props.get('suburbname'),
        }

print(f"\nFound {len(geojson_suburbs)} unique suburbs in GeoJSON")
print(f"Total features: {len(geojson_data['features'])}")
print(f"\nFirst 10 GeoJSON suburbs:")
for i, suburb in enumerate(sorted(list(geojson_suburbs))[:10]):
    print(f"  {i+1}. {suburb}")


Loading GeoJSON from: /home/korij/development/web/business/housing_affordability_crisis/final/backend/notebooks/data/sydney_suburbs.geojson

Found 660 unique suburbs in GeoJSON
Total features: 814

First 10 GeoJSON suburbs:
  1. ABBOTSBURY
  2. ABBOTSFORD
  3. ACACIA GARDENS
  4. AGNES BANKS
  5. AIRDS
  6. ALEXANDRIA
  7. ALFORDS POINT
  8. ALLAMBIE HEIGHTS
  9. ALLAWAH
  10. AMBARVALE


In [3]:
# Connect to database and get all suburbs
db_path = Path('../src/db/database.sqlite')
print(f"Connecting to database: {db_path.absolute()}")

# Initialize variables
db_suburbs = set()
db_suburbs_raw = []

if not db_path.exists():
    print(f"ERROR: Database not found at {db_path}")
    print("Please check the database path.")
else:
    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()
    
    # Get all unique suburbs from suburb_analytics table
    cursor.execute("SELECT DISTINCT suburb FROM suburb_analytics ORDER BY suburb")
    db_suburbs_raw = [row[0] for row in cursor.fetchall()]
    
    # Normalize to uppercase (same as frontend)
    db_suburbs = set(suburb.upper().strip() for suburb in db_suburbs_raw)
    
    print(f"\nFound {len(db_suburbs)} unique suburbs in database")
    print(f"\nFirst 10 database suburbs:")
    for i, suburb in enumerate(sorted(list(db_suburbs))[:10]):
        print(f"  {i+1}. {suburb}")
    
    conn.close()


Connecting to database: /home/korij/development/web/business/housing_affordability_crisis/final/backend/notebooks/../src/db/database.sqlite

Found 645 unique suburbs in database

First 10 database suburbs:
  1. ABBOTSBURY
  2. ABBOTSFORD
  3. ACACIA GARDENS
  4. AGNES BANKS
  5. AIRDS
  6. ALEXANDRIA
  7. ALFORDS POINT
  8. ALLAMBIE HEIGHTS
  9. ALLAWAH
  10. AMBARVALE


In [4]:
# Compare GeoJSON suburbs with database suburbs
print("="*80)
print("COMPARISON RESULTS")
print("="*80)

# Suburbs in GeoJSON but not in database
geojson_only = geojson_suburbs - db_suburbs
print(f"\n1. Suburbs in GeoJSON but NOT in database: {len(geojson_only)}")
if geojson_only:
    print("   These suburbs will appear on the map but have no data (gray)")
    print("   First 20 missing suburbs:")
    for i, suburb in enumerate(sorted(list(geojson_only))[:20]):
        print(f"     {i+1}. {suburb}")
    if len(geojson_only) > 20:
        print(f"     ... and {len(geojson_only) - 20} more")

# Suburbs in database but not in GeoJSON
db_only = db_suburbs - geojson_suburbs
print(f"\n2. Suburbs in database but NOT in GeoJSON: {len(db_only)}")
if db_only:
    print("   These suburbs have data but won't appear on the map!")
    print("   First 20 missing suburbs:")
    for i, suburb in enumerate(sorted(list(db_only))[:20]):
        print(f"     {i+1}. {suburb}")
    if len(db_only) > 20:
        print(f"     ... and {len(db_only) - 20} more")

# Suburbs in both (matched)
matched = geojson_suburbs & db_suburbs
print(f"\n3. Suburbs in BOTH GeoJSON and database: {len(matched)}")
print(f"   These will display correctly on the map with data")

# Match percentage
total_geojson = len(geojson_suburbs)
match_percentage = (len(matched) / total_geojson * 100) if total_geojson > 0 else 0
print(f"\n4. Match rate: {len(matched)}/{total_geojson} ({match_percentage:.1f}%)")


COMPARISON RESULTS

1. Suburbs in GeoJSON but NOT in database: 15
   These suburbs will appear on the map but have no data (gray)
   First 20 missing suburbs:
     1. BADGERYS CREEK
     2. BANKSTOWN AERODROME
     3. CAMELLIA
     4. CURRAWONG BEACH
     5. KU-RING-GAI CHASE
     6. LEN WATERS ESTATE
     7. LUCAS HEIGHTS
     8. MILSONS PASSAGE
     9. MOORE PARK
     10. MORNING BAY
     11. PORT BOTANY
     12. RICHARDS
     13. ROOKWOOD
     14. SINGLETONS MILL
     15. VARROVILLE

2. Suburbs in database but NOT in GeoJSON: 0

3. Suburbs in BOTH GeoJSON and database: 645
   These will display correctly on the map with data

4. Match rate: 645/660 (97.7%)


In [5]:
# Try to find potential matches using fuzzy matching
# This helps identify suburbs that might be the same but with different names
print("="*80)
print("FUZZY MATCHING ANALYSIS")
print("="*80)

if db_only:
    print(f"\nTrying to find potential matches for {len(db_only)} database suburbs not in GeoJSON:")
    print("(Using similarity threshold of 0.8)")
    
    potential_matches = []
    for db_suburb in sorted(list(db_only))[:50]:  # Limit to first 50 for performance
        best_match = None
        best_ratio = 0
        
        for geojson_suburb in geojson_suburbs:
            ratio = difflib.SequenceMatcher(None, db_suburb, geojson_suburb).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = geojson_suburb
        
        if best_ratio >= 0.8:
            potential_matches.append({
                'db_suburb': db_suburb,
                'geojson_suburb': best_match,
                'similarity': best_ratio
            })
    
    if potential_matches:
        print(f"\nFound {len(potential_matches)} potential matches:")
        for match in potential_matches[:20]:
            print(f"  DB: '{match['db_suburb']}' <-> GeoJSON: '{match['geojson_suburb']}' (similarity: {match['similarity']:.2f})")
    else:
        print("\nNo close matches found (similarity < 0.8)")
else:
    print("\nNo database-only suburbs to match")


FUZZY MATCHING ANALYSIS

No database-only suburbs to match


In [6]:
# Check for common naming issues
print("="*80)
print("COMMON NAMING ISSUES ANALYSIS")
print("="*80)

if not db_path.exists():
    print("Database not found. Skipping naming issues analysis.")
else:
    # Check for suburbs with different casing variations
    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()

    # Get all suburbs with their original casing
    cursor.execute("SELECT DISTINCT suburb FROM suburb_analytics ORDER BY suburb")
    db_suburbs_original = [row[0] for row in cursor.fetchall()]

    # Check for suburbs that might have casing issues
    casing_variations = defaultdict(list)
    for db_suburb in db_suburbs_original:
        upper = db_suburb.upper().strip()
        casing_variations[upper].append(db_suburb)

    # Find suburbs with multiple casing variations
    multiple_casings = {k: v for k, v in casing_variations.items() if len(v) > 1}
    if multiple_casings:
        print(f"\nFound {len(multiple_casings)} suburbs with multiple casing variations:")
        for upper, variations in list(multiple_casings.items())[:10]:
            print(f"  {upper}: {variations}")

    # Check for common prefixes/suffixes that might differ
    print("\nChecking for common naming patterns...")

    # Check for "THE" prefix differences
    db_with_the = [s for s in db_suburbs if s.startswith('THE ')]
    geojson_with_the = [s for s in geojson_suburbs if s.startswith('THE ')]
    if db_with_the or geojson_with_the:
        print(f"\nSuburbs with 'THE' prefix:")
        print(f"  Database: {len(db_with_the)}")
        print(f"  GeoJSON: {len(geojson_with_the)}")

    # Check for apostrophes
    db_with_apostrophe = [s for s in db_suburbs if "'" in s]
    geojson_with_apostrophe = [s for s in geojson_suburbs if "'" in s]
    if db_with_apostrophe or geojson_with_apostrophe:
        print(f"\nSuburbs with apostrophes:")
        print(f"  Database: {len(db_with_apostrophe)}")
        print(f"  GeoJSON: {len(geojson_with_apostrophe)}")
        if db_with_apostrophe[:5]:
            print(f"  Examples (DB): {db_with_apostrophe[:5]}")
        if geojson_with_apostrophe[:5]:
            print(f"  Examples (GeoJSON): {geojson_with_apostrophe[:5]}")

    conn.close()


COMMON NAMING ISSUES ANALYSIS

Checking for common naming patterns...

Suburbs with 'THE' prefix:
  Database: 2
  GeoJSON: 2


In [7]:
# Detailed analysis: Check specific examples of mismatches
print("="*80)
print("DETAILED MISMATCH ANALYSIS")
print("="*80)

if not db_path.exists():
    print("Database not found. Skipping detailed mismatch analysis.")
elif db_only:
    print(f"\nAnalyzing {min(10, len(db_only))} database suburbs not found in GeoJSON:")
    
    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()
    
    for i, db_suburb in enumerate(sorted(list(db_only))[:10]):
        print(f"\n{i+1}. Database suburb: '{db_suburb}'")
        
        # Get sample data for this suburb
        cursor.execute("""
            SELECT property_type, current_median_price, current_num_sales 
            FROM suburb_analytics 
            WHERE UPPER(TRIM(suburb)) = ?
            LIMIT 2
        """, (db_suburb,))
        
        rows = cursor.fetchall()
        if rows:
            print(f"   Has data: {len(rows)} property type(s)")
            for row in rows:
                print(f"     - {row[0]}: ${row[1]:,.0f} median, {row[2]} sales")
        
        # Try to find similar names in GeoJSON
        similar = []
        for geojson_suburb in geojson_suburbs:
            ratio = difflib.SequenceMatcher(None, db_suburb, geojson_suburb).ratio()
            if ratio >= 0.7:
                similar.append((geojson_suburb, ratio))
        
        if similar:
            similar.sort(key=lambda x: x[1], reverse=True)
            print(f"   Similar GeoJSON suburbs:")
            for name, ratio in similar[:3]:
                print(f"     - '{name}' (similarity: {ratio:.2f})")
        else:
            print(f"   No similar suburbs found in GeoJSON")
    
    conn.close()
else:
    print("\nNo database-only suburbs found - all database suburbs are in GeoJSON!")


DETAILED MISMATCH ANALYSIS

No database-only suburbs found - all database suburbs are in GeoJSON!


In [None]:
# Summary and recommendations
print("="*80)
print("SUMMARY AND RECOMMENDATIONS")
print("="*80)

print(f"\nTotal GeoJSON suburbs: {len(geojson_suburbs)}")
print(f"Total Database suburbs: {len(db_suburbs)}")
print(f"Matched suburbs: {len(matched)}")
print(f"GeoJSON-only suburbs: {len(geojson_only)}")
print(f"Database-only suburbs: {len(db_only)}")

if len(db_only) > 0:
    print(f"\n⚠️  ISSUE FOUND: {len(db_only)} suburbs have data but won't appear on the map!")
    print("\nPossible causes:")
    print("  1. Suburb names in database don't match GeoJSON suburb names")
    print("  2. Casing differences (though both are normalized to uppercase)")
    print("  3. Special characters or punctuation differences")
    print("  4. Suburbs renamed or merged in GeoJSON data")
    print("\nRecommendations:")
    print("  1. Check the fuzzy matching results above for potential fixes")
    print("  2. Verify suburb names in database match GeoJSON exactly")
    print("  3. Consider normalizing suburb names in database to match GeoJSON")
    print("  4. Update GeoJSON if database has correct suburb names")
else:
    print("\n✅ All database suburbs are present in GeoJSON!")

if len(geojson_only) > 0:
    print(f"\nℹ️  Note: {len(geojson_only)} suburbs are in GeoJSON but not in database.")
    print("   These will appear on the map but show as gray (no data).")
    print("   This is expected if some suburbs don't have sales data.")

print("\n" + "="*80)


In [8]:
# Specific search for VAUCLUSE and POINT PIPER
print("="*80)
print("SPECIFIC SUBURB SEARCH: VAUCLUSE and POINT PIPER")
print("="*80)

target_suburbs = ['VAUCLUSE', 'POINT PIPER', 'VAUCLUSE', 'POINTPIPER', 'POINT PIPPER']
# Also check variations
target_suburbs_lower = [s.lower() for s in target_suburbs]
target_suburbs_mixed = ['Vaucluse', 'Point Piper', 'PointPiper']

print("\n1. Searching in GeoJSON...")
geojson_found = {}
for suburb_variant in target_suburbs + target_suburbs_lower + target_suburbs_mixed:
    suburb_upper = suburb_variant.upper().strip()
    if suburb_upper in geojson_suburbs:
        geojson_found[suburb_upper] = True
        # Get details
        for feature in geojson_data['features']:
            props = feature['properties']
            suburb_name = None
            if 'suburb' in props:
                suburb_name = props['suburb']
            elif 'suburbname' in props:
                suburb_name = props['suburbname']
            
            if suburb_name and str(suburb_name).upper().strip() == suburb_upper:
                print(f"\n   Found '{suburb_upper}' in GeoJSON:")
                print(f"     - suburb property: {props.get('suburb')}")
                print(f"     - suburbname property: {props.get('suburbname')}")
                print(f"     - All properties with 'suburb' in name:")
                for key in props.keys():
                    if 'suburb' in key.lower():
                        print(f"       {key}: {props[key]}")
                break

if not geojson_found:
    print("   ❌ Neither VAUCLUSE nor POINT PIPER found in GeoJSON!")
    # Try fuzzy search
    print("\n   Trying fuzzy search in GeoJSON...")
    for target in ['VAUCLUSE', 'POINT PIPER']:
        best_match = None
        best_ratio = 0
        for geojson_suburb in geojson_suburbs:
            ratio = difflib.SequenceMatcher(None, target, geojson_suburb).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = geojson_suburb
        if best_ratio > 0.7:
            print(f"   '{target}' -> closest match: '{best_match}' (similarity: {best_ratio:.2f})")
else:
    print(f"   ✅ Found {len(geojson_found)} suburb(s) in GeoJSON: {list(geojson_found.keys())}")

print("\n2. Searching in Database...")
if db_path.exists():
    conn = sqlite3.connect(str(db_path))
    cursor = conn.cursor()
    
    db_found = {}
    for suburb_variant in target_suburbs + target_suburbs_lower + target_suburbs_mixed:
        suburb_upper = suburb_variant.upper().strip()
        
        # Try exact match
        cursor.execute("SELECT DISTINCT suburb FROM suburb_analytics WHERE UPPER(TRIM(suburb)) = ?", (suburb_upper,))
        rows = cursor.fetchall()
        if rows:
            db_found[suburb_upper] = rows[0][0]  # Store original casing
            print(f"\n   Found '{suburb_upper}' in database (original: '{rows[0][0]}'):")
            
            # Get analytics data
            cursor.execute("""
                SELECT property_type, current_median_price, current_num_sales, 
                       growth_5yr_percentage, current_avg_ctsd
                FROM suburb_analytics 
                WHERE UPPER(TRIM(suburb)) = ?
            """, (suburb_upper,))
            analytics = cursor.fetchall()
            for row in analytics:
                print(f"     - {row[0]}: ${row[1]:,.0f} median, {row[2]} sales, {row[3]:.1f}% 5yr growth")
    
    if not db_found:
        print("   ❌ Neither VAUCLUSE nor POINT PIPER found in database!")
        # Try fuzzy search
        print("\n   Trying fuzzy search in database...")
        cursor.execute("SELECT DISTINCT suburb FROM suburb_analytics")
        all_db_suburbs = [row[0] for row in cursor.fetchall()]
        
        for target in ['VAUCLUSE', 'POINT PIPER']:
            best_match = None
            best_ratio = 0
            for db_suburb in all_db_suburbs:
                ratio = difflib.SequenceMatcher(None, target, db_suburb.upper()).ratio()
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_match = db_suburb
            if best_ratio > 0.7:
                print(f"   '{target}' -> closest match: '{best_match}' (similarity: {best_ratio:.2f})")
    else:
        print(f"   ✅ Found {len(db_found)} suburb(s) in database: {list(db_found.keys())}")
    
    conn.close()
else:
    print("   Database not found!")

print("\n3. Comparison Summary:")
print("="*80)
for target in ['VAUCLUSE', 'POINT PIPER']:
    target_upper = target.upper()
    in_geojson = target_upper in geojson_suburbs
    in_db = target_upper in db_suburbs if db_path.exists() else False
    
    print(f"\n{target}:")
    print(f"  GeoJSON: {'✅ Found' if in_geojson else '❌ Not found'}")
    print(f"  Database: {'✅ Found' if in_db else '❌ Not found'}")
    
    if in_db and not in_geojson:
        print(f"  ⚠️  ISSUE: Has data but won't appear on map!")
    elif in_geojson and not in_db:
        print(f"  ℹ️  Will appear on map but no data (gray)")
    elif in_db and in_geojson:
        print(f"  ✅ Should display correctly on map")


SPECIFIC SUBURB SEARCH: VAUCLUSE and POINT PIPER

1. Searching in GeoJSON...

   Found 'VAUCLUSE' in GeoJSON:
     - suburb property: VAUCLUSE
     - suburbname property: Vaucluse
     - All properties with 'suburb' in name:
       suburbname: Vaucluse
       lga_suburb: Unincorporated-Vaucluse
       suburb: VAUCLUSE

   Found 'POINT PIPER' in GeoJSON:
     - suburb property: POINT PIPER
     - suburbname property: Point Piper
     - All properties with 'suburb' in name:
       suburbname: Point Piper
       lga_suburb: Woollahra-Point Piper
       suburb: POINT PIPER

   Found 'VAUCLUSE' in GeoJSON:
     - suburb property: VAUCLUSE
     - suburbname property: Vaucluse
     - All properties with 'suburb' in name:
       suburbname: Vaucluse
       lga_suburb: Unincorporated-Vaucluse
       suburb: VAUCLUSE

   Found 'VAUCLUSE' in GeoJSON:
     - suburb property: VAUCLUSE
     - suburbname property: Vaucluse
     - All properties with 'suburb' in name:
       suburbname: Vaucluse
     

## Root Cause Found: API Pagination Limit

**Issue**: VAUCLUSE and POINT PIPER were missing from the frontend map because:

1. The `fetchSuburbSummaries()` function was calling `fetchAnalytics({ limit: 1000 })`
2. The database has **1,157 total records** (645 suburbs × ~2 property types)
3. Records are sorted alphabetically by suburb name
4. **VAUCLUSE is at position 1,057** - beyond the 1,000 record limit!
5. POINT PIPER is at position 857 - within limit, but may have been affected

**Fix Applied**: Updated `fetchSuburbSummaries()` in `final/frontend/app/lib/api.ts` to:
- Fetch all records via pagination (multiple requests of 1000 records each)
- Continue fetching until all records are retrieved
- This ensures all suburbs, including VAUCLUSE and POINT PIPER, are included

**Verification**:
- ✅ Both suburbs exist in database with data
- ✅ Both suburbs exist in GeoJSON
- ✅ Both suburbs are now included in API responses (after fix)
