## OSM Data Collection Pipeline

In [18]:
import time
import requests
from shapely.geometry import shape, box

# Overpass API interpreter endpoint that request_overpass() POSTs its queries to
OVERPASS_URL = "https://overpass-api.de/api/interpreter"

def subdivide_bbox(s, w, n, e, side_deg=0.02):
    """
    Split a bounding box into a grid of tiles at most `side_deg` degrees per side.

    Tiles are generated row by row from south to north and, within a row, from
    west to east. The last tile of each row/column is clipped to the outer
    bbox, so it can be smaller than `side_deg`.

    Args:
        s, w, n, e: South, west, north and east edges of the bbox, in degrees.
        side_deg: Maximum tile side length in degrees (default 0.02).

    Returns:
        list[tuple]: (south, west, north, east) tuples, one per tile. Empty when
        the bbox has no extent (n <= s or e <= w).

    Raises:
        ValueError: If `side_deg` is not a positive number; the grid loops
            would otherwise never advance.
    """
    # `not (x > 0)` also rejects NaN, which would stall the loops just like 0
    if not side_deg > 0:
        raise ValueError(f"side_deg must be positive, got {side_deg!r}")

    boxes = []
    lat = s
    while lat < n:
        # Clip the row to the northern edge of the bbox
        next_lat = min(lat + side_deg, n)
        lon = w
        while lon < e:
            # Clip the tile to the eastern edge of the bbox
            next_lon = min(lon + side_deg, e)
            boxes.append((lat, lon, next_lat, next_lon))
            lon = next_lon
        lat = next_lat
    return boxes

def make_query(s, w, n, e):
    """
    Build the Overpass QL query for swimming pools inside a bounding box.

    Nodes, ways and relations are matched on either `leisure=swimming_pool`
    or `swimming_pool=yes`, all restricted to the (south, west, north, east)
    bbox. The query requests JSON output with a 120 s server-side timeout and
    `out center tags`.
    """
    bbox = f"({s},{w},{n},{e})"
    tag_filters = ('["leisure"="swimming_pool"]', '["swimming_pool"="yes"]')
    element_kinds = ("node", "way", "relation")

    # One statement per (tag filter, element kind), grouped by tag filter
    statements = [
        f"      {kind}{tag_filter}{bbox};"
        for tag_filter in tag_filters
        for kind in element_kinds
    ]

    # The leading empty entry and the trailing indent reproduce the layout of
    # a triple-quoted block that opens and closes on its own lines.
    query_lines = [
        "",
        "    [out:json][timeout:120];",
        "    (",
        *statements,
        "    );",
        "    out center tags;",
        "    ",
    ]
    return "\n".join(query_lines)

def request_overpass(s, w, n, e, max_retries=5, max_split_depth=4):
    """
    Query Overpass for swimming pools inside a bounding box.

    Handling of server responses:
      * HTTP 429 (rate limited): sleep 5 s and retry, at most `max_retries`
        times, then give up on this bbox.
      * HTTP 504 (gateway timeout): split the bbox into four quadrants and
        query each one, at most `max_split_depth` levels deep.
      * Any other error status is raised via `raise_for_status()`.

    Args:
        s, w, n, e: South, west, north and east edges of the bbox, in degrees.
        max_retries: Maximum number of retries after a 429 response.
        max_split_depth: Maximum number of times the bbox may be quartered
            in response to 504 responses.

    Returns:
        list: The "elements" list of the Overpass JSON response. An empty list
        when the request fails at the transport level, the retry or split
        budget is exhausted, or the response body cannot be parsed.

    Raises:
        requests.exceptions.HTTPError: For error statuses other than 429/504.
    """
    query = make_query(s, w, n, e)

    # Bounded retry loop: a persistently rate-limited server can no longer
    # drive the function into unbounded recursion.
    attempt = 0
    while True:
        try:
            r = requests.post(OVERPASS_URL, data={"data": query}, timeout=180)
        except requests.exceptions.RequestException as err:
            print(f"    !!! Request failed: {err}")
            return []

        if r.status_code != 429:
            break

        if attempt >= max_retries:
            print(f"    !!! Still rate limited after {max_retries} retries. Giving up on this bbox.")
            return []
        attempt += 1
        print("    !!! Rate limited (429). Sleeping 5s...")
        time.sleep(5)

    if r.status_code == 504:
        if max_split_depth <= 0:
            # Stop quartering: a server that keeps timing out would otherwise
            # make the bbox shrink forever.
            print("    !!! Gateway Timeout (504) and split limit reached. Skipping this bbox.")
            return []

        print("    !!! Gateway Timeout (504). Splitting request...")
        mid_lat = (s + n) / 2
        mid_lon = (w + e) / 2
        quadrants = (
            (s, w, mid_lat, mid_lon),
            (s, mid_lon, mid_lat, e),
            (mid_lat, w, n, mid_lon),
            (mid_lat, mid_lon, n, e),
        )
        elements = []
        for q_s, q_w, q_n, q_e in quadrants:
            elements.extend(
                request_overpass(q_s, q_w, q_n, q_e, max_retries, max_split_depth - 1)
            )
        return elements

    r.raise_for_status()

    # Best effort: an unparsable body yields no elements instead of aborting.
    # The exception gets its own name so it does not shadow the `e` (east) parameter.
    try:
        data = r.json()
        return data.get("elements", [])
    except Exception as parse_err:
        print(f"    !!! Failed to parse JSON response: {parse_err}")
        return []

def extract_pools(elements):
    """
    Convert raw Overpass elements into simple pool records.

    An element's own "lat"/"lon" are used when both are present; otherwise
    the coordinates of its "center" are used. Elements with neither are
    skipped. Each record has "lat", "lon" and "tags" (an empty dict when the
    element carries no tags).
    """
    pools = []
    for element in elements:
        if "lat" in element and "lon" in element:
            coords = element
        elif "center" in element:
            coords = element["center"]
        else:
            # No usable position on this element
            continue

        record = {
            "lat": coords["lat"],
            "lon": coords["lon"],
            "tags": element.get("tags", {}),
        }
        pools.append(record)
    return pools

def get_pools(geojson):
    """
    Collect de-duplicated swimming-pool points for a GeoJSON geometry.

    The geometry's bounding box is split into tiles, tiles that do not
    intersect the geometry are dropped, and each remaining tile is queried
    through Overpass. Pools are de-duplicated on their exact (lat, lon) pair.
    Results are not clipped to the geometry itself, so a tile that straddles
    the boundary contributes all of its pools. Progress is printed as it goes.

    Returns:
        list[dict]: Pool records with "lat", "lon" and "tags".
    """
    poly = shape(geojson)
    # shapely reports bounds as (minx, miny, maxx, maxy) = (west, south, east, north)
    w, s, e, n = poly.bounds
    print(f"Polygon bounds: South={s:.4f}, West={w:.4f}, North={n:.4f}, East={e:.4f}")

    tiles = subdivide_bbox(s, w, n, e)
    print(f"Subdivided area into {len(tiles)} tiles.")

    # Filter up front so the progress counter only counts tiles that get queried
    tiles_to_process = [
        (ts, tw, tn, te)
        for (ts, tw, tn, te) in tiles
        if poly.intersects(box(tw, ts, te, tn))
    ]
    print(f"Processing {len(tiles_to_process)} tiles intersecting the polygon...")

    all_pools = []
    seen = set()

    for i, (ts, tw, tn, te) in enumerate(tiles_to_process, 1):
        print(f"  [Tile {i}/{len(tiles_to_process)}] Querying bbox ({ts:.4f}, {tw:.4f}, {tn:.4f}, {te:.4f})...")

        elements = request_overpass(ts, tw, tn, te)
        pools = extract_pools(elements)

        if elements:
            print(f"    -> Found {len(elements)} raw elements, extracted {len(pools)} pools.")

        # Append only pools whose coordinates have not been seen in any tile so far
        count_before = len(all_pools)
        for p in pools:
            key = (p["lat"], p["lon"])
            if key in seen:
                continue
            seen.add(key)
            all_pools.append(p)
        new_count = len(all_pools) - count_before

        if new_count > 0:
            print(f"    -> Added {new_count} new unique pools.")

    return all_pools


In [19]:
# Sample area of interest: a closed ring of [lon, lat] pairs
# (the first and last points are identical).
sample_polygon = [
    [-81.01262, 43.48260],
    [-81.08653, 43.35806],
    [-80.99489, 43.23542],
    [-80.60763, 43.25911],
    [-80.45686, 43.38814],
    [-80.78204, 43.46758],
    [-81.01262, 43.48260],
]

# Wrap the ring in a GeoJSON Polygon geometry, which get_pools() turns into a shapely shape
sample_geojson = {"type":"Polygon","coordinates":[sample_polygon]}

# Collect the pools for the sample area and print every record
if __name__ == "__main__":
    pools = get_pools(sample_geojson)
    print(f"Found {len(pools)} pools")
    for p in pools:
        print(p)


Polygon bounds: South=43.2354, West=-81.0865, North=43.4826, East=-80.4569
Subdivided area into 416 tiles.
Processing 311 tiles intersecting the polygon...
  [Tile 1/311] Querying bbox (43.2354, -81.0265, 43.2554, -81.0065)...
  [Tile 2/311] Querying bbox (43.2354, -81.0065, 43.2554, -80.9865)...
  [Tile 3/311] Querying bbox (43.2354, -80.9865, 43.2554, -80.9665)...
  [Tile 4/311] Querying bbox (43.2354, -80.9665, 43.2554, -80.9465)...
  [Tile 5/311] Querying bbox (43.2354, -80.9465, 43.2554, -80.9265)...
  [Tile 6/311] Querying bbox (43.2354, -80.9265, 43.2554, -80.9065)...
  [Tile 7/311] Querying bbox (43.2354, -80.9065, 43.2554, -80.8865)...
  [Tile 8/311] Querying bbox (43.2354, -80.8865, 43.2554, -80.8665)...
  [Tile 9/311] Querying bbox (43.2354, -80.8665, 43.2554, -80.8465)...
    !!! Gateway Timeout (504). Splitting request...
  [Tile 10/311] Querying bbox (43.2354, -80.8465, 43.2554, -80.8265)...
  [Tile 11/311] Querying bbox (43.2354, -80.8265, 43.2554, -80.8065)...
  [Tile 1

In [20]:
import pandas as pd
# Load the collected pool records into a DataFrame and preview the first rows
pools_df = pd.DataFrame(pools)
print(pools_df.head())

         lat        lon                                               tags
0  43.312810 -80.611900  {'access': 'private', 'building': 'yes', 'leis...
1  43.329218 -80.809291                       {'leisure': 'swimming_pool'}
2  43.328912 -80.807952                       {'leisure': 'swimming_pool'}
3  43.368118 -81.001000                       {'leisure': 'swimming_pool'}
4  43.370764 -80.975889   {'leisure': 'swimming_pool', 'natural': 'water'}


In [21]:
# Save the pools to CSV (without the index column) so a later cell can reload them
pools_df.to_csv("osm_sample_output.csv", index=False)

In [32]:
import pandas as pd
# Reload the saved pools from CSV and convert each row to a plain dict.
# NOTE: after the CSV round trip the 'tags' values are strings rather than
# dicts (visible in the printed records below).
pools_df = pd.read_csv("osm_sample_output.csv")
pools_dict = pools_df.to_dict(orient='records')
print(pools_dict[:5])

[{'lat': 43.31281, 'lon': -80.6119001, 'tags': "{'access': 'private', 'building': 'yes', 'leisure': 'swimming_pool'}"}, {'lat': 43.3292184, 'lon': -80.8092907, 'tags': "{'leisure': 'swimming_pool'}"}, {'lat': 43.3289124, 'lon': -80.8079521, 'tags': "{'leisure': 'swimming_pool'}"}, {'lat': 43.3681181, 'lon': -81.0010002, 'tags': "{'leisure': 'swimming_pool'}"}, {'lat': 43.3707635, 'lon': -80.9758888, 'tags': "{'leisure': 'swimming_pool', 'natural': 'water'}"}]


In [33]:
# Use the reloaded records as `pools`, the name the geocoding step reads
pools = pools_dict

## Step 2: Address Option Collection

In [None]:
import os
import requests
import time
import json

def _parse_address_components(raw_components):
    """Reduce Google's address_components list to the flat fields this pipeline uses."""
    parsed_components = {
        "address_number": None,
        "street_name": None,
        "locality": None,
        "county": None,
        "province_state": None,
        "postal_code": None,
        "country": None
    }

    # (Google component type, output field) in priority order: the first type
    # found on a component decides which single field that component fills.
    type_to_field = (
        ("street_number", "address_number"),
        ("route", "street_name"),
        ("locality", "locality"),
        ("administrative_area_level_2", "county"),
        ("administrative_area_level_1", "province_state"),
        ("postal_code", "postal_code"),
        ("country", "country"),
    )

    for comp in raw_components:
        types = comp.get("types", [])
        for google_type, field in type_to_field:
            if google_type in types:
                parsed_components[field] = comp["long_name"]
                break
        else:
            # Sometimes level 3 can be a city if locality is missing
            if "administrative_area_level_3" in types and not parsed_components["locality"]:
                parsed_components["locality"] = comp["long_name"]

    return parsed_components


def _enrich_geocode_result(result):
    """Build one enriched address object from a single Geocoding API result."""
    geometry = result.get("geometry", {})
    location = geometry.get("location", {})
    raw_components = result.get("address_components", [])
    return {
        "formatted_address": result.get("formatted_address"),
        "address_components": _parse_address_components(raw_components),  # Simplified dictionary
        "raw_components": raw_components,                                 # Original list
        "place_id": result.get("place_id"),
        "location_type": geometry.get("location_type"),
        "types": result.get("types"),
        "lat": location.get("lat"),
        "lon": location.get("lng")
    }


def get_possible_addresses(lat, lon, api_key, timeout=10):
    """
    Reverse geocode a lat/lon using the Google Maps Geocoding API.

    Args:
        lat, lon: Coordinates to reverse geocode.
        api_key: Google Maps API key.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single stalled connection cannot hang a whole batch run.

    Returns:
        list[dict]: One enriched address object per API result, each with
        "formatted_address", "address_components" (simplified dict),
        "raw_components", "place_id", "location_type", "types", "lat" and
        "lon". An empty list when the API reports no results or an error
        status, or when the request or parsing fails (the problem is printed).
    """
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "latlng": f"{lat},{lon}",
        "key": api_key
    }

    # Best effort by design: any failure is reported and yields no addresses,
    # so one bad lookup does not abort the caller's loop.
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        status = data.get("status")
        if status == "ZERO_RESULTS":
            print(f"  No address found for {lat}, {lon}")
            return []
        if status != "OK":
            print(f"  Geocoding error for {lat}, {lon}: {status}")
            if "error_message" in data:
                print(f"  Error message: {data['error_message']}")
            return []

        return [_enrich_geocode_result(result) for result in data["results"]]

    except Exception as e:
        print(f"  Request error: {e}")
        return []

def collect_addresses_for_pools(pools, api_key):
    """
    Reverse geocode every pool and attach the candidate addresses.

    Each returned item is a shallow copy of the input pool with an extra
    'possible_addresses' key holding the geocoding results (possibly empty).
    Progress is printed per pool, and the loop pauses 0.1 s between lookups.
    """
    total = len(pools)
    print(f"Starting address collection for {total} pools...")

    enriched_pools = []
    for i, pool in enumerate(pools, 1):
        lat, lon = pool["lat"], pool["lon"]
        print(f"[{i}/{total}] Geocoding {lat:.5f}, {lon:.5f}...")

        address_results = get_possible_addresses(lat, lon, api_key)

        # Copy first so the caller's pool dicts are not modified
        enriched = pool.copy()
        enriched["possible_addresses"] = address_results
        enriched_pools.append(enriched)

        if not address_results:
            print("  -> No addresses returned.")
        else:
            best = address_results[0]
            print(f"  -> Found {len(address_results)} options. Best match: {best.get('formatted_address')}")

        # Respect API rate limits
        time.sleep(0.1)

    return enriched_pools

# Retrieve API key from environment variable
GOOGLE_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")

# Optional cap on how many pools get geocoded. None geocodes every pool;
# set a small number (e.g. 3) during development to limit API usage/cost.
GEOCODE_SAMPLE_LIMIT = None

if __name__ == "__main__":
    # An unset variable yields "", so test for emptiness; "NOTHING" is still
    # treated as a placeholder value.
    if not GOOGLE_API_KEY or GOOGLE_API_KEY == "NOTHING":
        print("WARNING: Please set your GOOGLE_MAPS_API_KEY environment variable or replace the placeholder string.")
    elif 'pools' in globals() and pools:
        # 'pools' comes from the earlier OSM collection cells
        if GEOCODE_SAMPLE_LIMIT is None:
            pools_to_geocode = pools
        else:
            pools_to_geocode = pools[:GEOCODE_SAMPLE_LIMIT]
        # Report what is actually being geocoded
        print(f"Geocoding {len(pools_to_geocode)} of {len(pools)} pools:")

        enriched_sample = collect_addresses_for_pools(pools_to_geocode, GOOGLE_API_KEY)

        print("\n--- Results ---")
        for p in enriched_sample:
            print(f"Pool: {p['lat']}, {p['lon']}")
            if p.get('possible_addresses'):
                first_match = p['possible_addresses'][0]
                print(f"Best Address: {first_match['formatted_address']}")
                print(f"Address Coords: {first_match.get('lat')}, {first_match.get('lon')}")

                comps = first_match['address_components']
                print("Parsed Components:")
                print(f"  Number: {comps.get('address_number')}")
                print(f"  Street: {comps.get('street_name')}")
                print(f"  City:   {comps.get('locality')}")
                print(f"  State:  {comps.get('province_state')}")
            else:
                print("No address found")
            print("-" * 20)
    else:
        print("No 'pools' variable found. Run the OSM collection steps first.")

Test run on first 3 pools:
Starting address collection for 117 pools...
[1/117] Geocoding 43.31281, -80.61190...
  -> Found 12 options. Best match: 876764 Hofstetter Rd, Plattsville, ON N0J 1S0, Canada
[2/117] Geocoding 43.32922, -80.80929...
  -> Found 12 options. Best match: 657185 15th Line, Tavistock, ON N0B 2R0, Canada
[3/117] Geocoding 43.32891, -80.80795...
  -> Found 12 options. Best match: 657174 15th Line, Tavistock, ON N0B 2R0, Canada
[4/117] Geocoding 43.36812, -81.00100...
  -> Found 14 options. Best match: 639 W Gore St, Stratford, ON N5A 1L4, Canada
[5/117] Geocoding 43.37076, -80.97589...
  -> Found 14 options. Best match: 176 Albert St, Stratford, ON N5A 3K6, Canada
[6/117] Geocoding 43.37290, -80.98428...
  -> Found 13 options. Best match: 15 William St, Stratford, ON N5A 4X9, Canada
[7/117] Geocoding 43.37278, -80.98457...
  -> Found 14 options. Best match: 15 William St, Stratford, ON N5A 4X9, Canada
[8/117] Geocoding 43.37134, -80.93338...
  -> Found 12 options. Be

In [36]:
# Extract every candidate address into its own DataFrame so it can be passed to the validation system.


def extract_all_addresses(pools):
    """
    Flatten the candidate addresses of all pools into unique address records.

    Addresses without an address number, or whose number contains a range
    marker ("-" or "–", e.g. "123-125"), are dropped. Of the remaining
    records, only the first one per formatted_address is kept. A summary line
    with both counts is printed.

    Returns:
        list[dict]: Records with lat, lon, formatted_address, address_number,
        street_name, locality, province_state, postal_code and country.
    """
    component_fields = (
        "address_number",
        "street_name",
        "locality",
        "province_state",
        "postal_code",
        "country",
    )
    range_markers = "-–"

    all_addresses = []
    for pool in pools:
        for addr in pool.get("possible_addresses", []):
            components = addr.get("address_components", {})
            record = {
                "lat": addr.get("lat"),
                "lon": addr.get("lon"),
                "formatted_address": addr.get("formatted_address"),
            }
            for field in component_fields:
                record[field] = components.get(field)

            # Skip records with no house number or with a number range
            number = record["address_number"]
            if not number:
                continue
            if any(marker in number for marker in range_markers):
                continue
            all_addresses.append(record)

    # Keep the first record for each formatted address; dicts preserve
    # insertion order, so the original ordering is retained.
    first_by_formatted = {}
    for record in all_addresses:
        first_by_formatted.setdefault(record["formatted_address"], record)
    unique_addresses = list(first_by_formatted.values())
    print(f"Extracted {len(all_addresses)} addresses, {len(unique_addresses)} unique after filtering.")

    return unique_addresses
# Flatten the geocoded pools into unique address records and preview them as a DataFrame
addresses = extract_all_addresses(enriched_sample)
addresses_df = pd.DataFrame(addresses)
addresses_df.head(10)

Extracted 351 addresses, 280 unique after filtering.


Unnamed: 0,lat,lon,formatted_address,address_number,street_name,locality,province_state,postal_code,country
0,43.312626,-80.612285,"876764 Hofstetter Rd, Plattsville, ON N0J 1S0,...",876764,Hofstetter Road,Plattsville,Ontario,N0J 1S0,Canada
1,43.313193,-80.612254,"876768 Hofstetter Rd, Plattsville, ON N0J 1S0,...",876768,Hofstetter Road,Plattsville,Ontario,N0J 1S0,Canada
2,43.312711,-80.612412,"876746 Hofstetter Rd, Plattsville, ON N0J 1S0,...",876746,Hofstetter Road,Plattsville,Ontario,N0J 1S0,Canada
3,43.310711,-80.613081,"48 English Cres, Blandford-Blenheim, ON N0J 1S...",48,English Crescent,Blandford-Blenheim,Ontario,N0J 1S0,Canada
4,43.329267,-80.809103,"657185 15th Line, Tavistock, ON N0B 2R0, Canada",657185,15th Line,Tavistock,Ontario,N0B 2R0,Canada
5,43.329071,-80.809737,"985420 Perth-Oxford Rd, Tavistock, ON N0B 2R0,...",985420,Perth-Oxford Road,Tavistock,Ontario,N0B 2R0,Canada
6,43.329242,-80.80834,"985434 Perth-Oxford Rd, Tavistock, ON N0B 2R0,...",985434,Perth-Oxford Road,Tavistock,Ontario,N0B 2R0,Canada
7,43.329374,-80.808733,"657185 15th Line, East Zorra-Tavistock, ON N0B...",657185,15th Line,East Zorra-Tavistock,Ontario,N0B 2R0,Canada
8,43.328785,-80.808074,"657174 15th Line, Tavistock, ON N0B 2R0, Canada",657174,15th Line,Tavistock,Ontario,N0B 2R0,Canada
9,43.328644,-80.808334,"657175 15th Line, East Zorra-Tavistock, ON N0B...",657175,15th Line,East Zorra-Tavistock,Ontario,N0B 2R0,Canada


In [46]:
from shapely.geometry import Point

def upsert_addresses_to_stage_db(address_df, conn):
    """
    Insert every row of `address_df` into the `address` table in one transaction.

    Despite the name, this issues plain INSERTs; deduplication is expected to
    have happened before the call. The point geometry is built by the database
    from lon/lat (ST_MakePoint with SRID 4326), so no client-side geometry is
    needed. The caller's DataFrame is not modified.

    Args:
        address_df: DataFrame with columns address_number, lat, lon,
            postal_code, street_name, province_state, country and either
            `locality` or `municipality`.
        conn: Open DB-API style connection providing cursor(), commit() and
            rollback() (e.g. a psycopg2 connection).

    Returns:
        int: Number of rows inserted.

    Raises:
        KeyError: If a required column is missing.
        Exception: Any error raised while inserting is re-raised after the
            transaction has been rolled back.
    """
    # Select and order columns for insertion (no address_id - DB generates it)
    columns_address_df = ['address_number', 'lat', 'lon', 'postal_code',
                          'street_name', 'province_state', 'country', 'municipality']

    # rename() without inplace returns a new frame, so the caller's DataFrame
    # keeps its original column names; `locality` becomes `municipality`.
    rows = address_df.rename(columns={"locality": "municipality"})[columns_address_df]

    # Insert into address table (deduplication handled pre-insert)
    cursor = conn.cursor()
    inserted_count = 0

    insert_sql = """
    INSERT INTO address (address_number, lat, lon, postal_code, street_name, 
                        province_state, country, geom, municipality)
    VALUES (%s, %s, %s, %s, %s, %s, %s, ST_SetSRID(ST_MakePoint(%s, %s), 4326), %s)
    RETURNING id;
    """

    try:
        for _, row in rows.iterrows():
            cursor.execute(insert_sql, (
                row['address_number'],
                row['lat'],
                row['lon'],
                row['postal_code'],
                row['street_name'],
                row['province_state'],
                row['country'],
                row['lon'],  # For ST_MakePoint (x = longitude)
                row['lat'],  # For ST_MakePoint (y = latitude)
                row['municipality']
            ))
            cursor.fetchone()  # Consume the returned id
            inserted_count += 1

        conn.commit()
        print(f"Inserted {inserted_count} addresses successfully.")
        return inserted_count

    except Exception as e:
        # All-or-nothing: undo every insert from this call before re-raising
        conn.rollback()
        print(f"Error during insert: {e}")
        raise
    finally:
        cursor.close()

def _pool_type_from_tags(tags):
    """Derive a pool type from OSM tags given as a dict or as a dict's string repr."""
    import ast  # local import: only needed to decode stringified tags

    if isinstance(tags, str):
        # Tags that went through a CSV round trip arrive as the repr of a dict.
        # literal_eval only evaluates Python literals, never arbitrary code.
        try:
            tags = ast.literal_eval(tags)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return 'unknown'

    if isinstance(tags, dict):
        return tags.get('leisure', tags.get('swimming_pool', 'unknown'))
    return 'unknown'


def upsert_pools_to_stage_db(pools_df, conn):
    """
    Insert pools into the pool table (deduplication handled pre-insert).

    Despite the name, this issues plain INSERTs inside one transaction. The
    caller's DataFrame is not modified.

    Args:
        pools_df: DataFrame with 'lat' and 'lon', and optionally 'pool_type'
            or 'tags'. When 'pool_type' is absent it is derived from 'tags'
            (dicts or their string repr): the 'leisure' value, else the
            'swimming_pool' value, else 'unknown'. Without either column every
            pool gets 'unknown'.
        conn: Open DB-API style connection providing cursor(), commit() and
            rollback() (e.g. a psycopg2 connection).

    Returns:
        int: Number of rows inserted.

    Raises:
        KeyError: If 'lat' or 'lon' is missing.
        Exception: Any error raised while inserting is re-raised after the
            transaction has been rolled back.
    """
    # Extract pool_type from tags if it exists, otherwise default to 'unknown'
    if 'pool_type' in pools_df.columns:
        pool_types = pools_df['pool_type']
    elif 'tags' in pools_df.columns:
        pool_types = pools_df['tags'].apply(_pool_type_from_tags)
    else:
        pool_types = 'unknown'

    # assign() returns a new frame, so no column is added to the caller's DataFrame.
    # Columns for insertion: lat, lon, pool_type (no pool_id - DB generates it)
    rows = pools_df[['lat', 'lon']].assign(pool_type=pool_types)

    # Insert into pool table
    cursor = conn.cursor()
    inserted_count = 0

    insert_sql = """
    INSERT INTO pool (lat, lon, pool_type)
    VALUES (%s, %s, %s)
    RETURNING id;
    """

    try:
        for _, row in rows.iterrows():
            cursor.execute(insert_sql, (
                row['lat'],
                row['lon'],
                row['pool_type']
            ))
            cursor.fetchone()  # Consume the returned id
            inserted_count += 1

        conn.commit()
        print(f"Inserted {inserted_count} pools successfully.")
        return inserted_count

    except Exception as e:
        # All-or-nothing: undo every insert from this call before re-raising
        conn.rollback()
        print(f"Error during insert: {e}")
        raise
    finally:
        cursor.close()

In [None]:
import os

import psycopg2

# Connection string for the Neon Postgres database. Read from the environment
# so credentials never need to be pasted into the notebook; stays an empty
# string when NEON_DSN is unset.
NEON_DSN = os.environ.get("NEON_DSN", '')

# Stays None when connect() fails, so the cleanup below never touches an
# undefined name or a stale connection left over from an earlier run.
conn = None
try:
    conn = psycopg2.connect(NEON_DSN)
    print("Connected to Neon database successfully.")

    # Insert addresses first to ensure we have address_ids for any future relationships
    if 'addresses_df' in globals():
        upsert_addresses_to_stage_db(addresses_df, conn)
    else:
        print("No addresses_df found. Skipping address upsert.")

    # Insert pools
    if 'pools_df' in globals():
        upsert_pools_to_stage_db(pools_df, conn)
    else:
        print("No pools_df found. Skipping pool upsert.")

except Exception as e:
    print(f"Database connection or upsert error: {e}")
finally:
    # Only close (and report closing) a connection that was actually opened
    if conn is not None:
        conn.close()
        print("Database connection closed.")

Connected to Neon database successfully.
Inserted 280 addresses successfully.
Inserted 117 pools successfully.
Database connection closed.


In [38]:
# Preview the first rows of the pools DataFrame
pools_df.head()

Unnamed: 0,lat,lon,tags
0,43.31281,-80.6119,"{'access': 'private', 'building': 'yes', 'leis..."
1,43.329218,-80.809291,{'leisure': 'swimming_pool'}
2,43.328912,-80.807952,{'leisure': 'swimming_pool'}
3,43.368118,-81.001,{'leisure': 'swimming_pool'}
4,43.370764,-80.975889,"{'leisure': 'swimming_pool', 'natural': 'water'}"
