# Geocoding Step for Hospital Data

This notebook adds latitude and longitude coordinates to hospital addresses using the Nominatim geocoding service.

**Important Notes:**
- This process respects OpenStreetMap's usage policy (1 request per second)
- Results are cached to avoid re-geocoding the same addresses
- The process can be resumed if interrupted
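- Estimated time: at least ~75 minutes for 4088 uncached addresses (one request per ~1.1 seconds); retries and fallback queries for hard-to-match addresses can make a run several times longer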

In [43]:
# Import required libraries
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
import time
from tqdm import tqdm
import pickle
import os
import re

print("Libraries imported successfully!")

Libraries imported successfully!


In [44]:
# Configuration
# All three paths are relative to the notebook's working directory.
CACHE_FILE = "geocode_cache.pkl"  # pickle file holding the query -> (lat, lon) cache
INPUT_FILE = "us_er_transformed.csv"
# NOTE: same path as INPUT_FILE, so the geocoded results overwrite the input CSV.
# The load step looks for existing lat/lon columns in INPUT_FILE, so a re-run
# only geocodes the rows that are still missing coordinates.
OUTPUT_FILE = "us_er_transformed.csv"

print(f"Input file: {INPUT_FILE}")
print(f"Output file: {OUTPUT_FILE}")
print(f"Cache file: {CACHE_FILE}")

Input file: us_er_transformed.csv
Output file: us_er_transformed.csv
Cache file: geocode_cache.pkl


In [45]:
# Helper functions for caching
def load_cache():
    """Load the geocoding cache from ``CACHE_FILE``.

    Returns:
        dict: The cache dictionary previously pickled to ``CACHE_FILE``.
        An empty dict is returned when the file does not exist, cannot be
        unpickled (e.g. it was truncated by an interrupted save), or does
        not contain a dict.
    """
    if not os.path.exists(CACHE_FILE):
        print("[*] No existing cache found, starting fresh")
        return {}

    try:
        with open(CACHE_FILE, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only point
            # CACHE_FILE at a file that this notebook wrote itself.
            cache = pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as e:
        # A damaged cache only costs re-geocoding time; it should not stop the run.
        print(f"[!] Cache file {CACHE_FILE} is unreadable ({e}); starting fresh")
        return {}

    if not isinstance(cache, dict):
        print(f"[!] Cache file {CACHE_FILE} does not contain a dict; starting fresh")
        return {}

    print(f"[+] Loaded cache with {len(cache)} entries")
    return cache

def save_cache(cache):
    """Save the geocoding cache to ``CACHE_FILE``.

    The cache is pickled to a temporary sibling file and then moved over
    ``CACHE_FILE`` with ``os.replace``, so a failure or interruption while
    writing leaves the previously saved cache intact instead of truncated.

    Args:
        cache: Dictionary to persist.

    Raises:
        Any exception raised while pickling or writing is re-raised after the
        temporary file has been removed.
    """
    tmp_path = f"{CACHE_FILE}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(cache, f)
        os.replace(tmp_path, CACHE_FILE)
    except BaseException:
        # Do not leave a half-written temp file behind (also on Ctrl-C).
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"[+] Cache saved with {len(cache)} entries")

# Load existing cache (load_cache returns an empty dict when no cache file exists).
# The geocoding cells below read from and add to this module-level dict.
cache = load_cache()

[*] No existing cache found, starting fresh


In [46]:
# Geocoding function with retry logic
def geocode_address(row, geolocator, cache, max_retries=3):
    """
    Geocode an address with caching and retry logic.

    Two queries are tried in order: "<hospital name>, <address>" (to match a
    named point of interest) and then the bare address as a fallback.

    Args:
        row: DataFrame row (or mapping) containing 'full_address' and 'hospital_name'
        geolocator: Geopy geolocator instance
        cache: Dictionary cache of previously geocoded queries, mapping
            query string -> (lat, lon). Updated in place. A (None, None)
            entry records a query the service returned no match for.
        max_retries: Maximum number of attempts per query when the service errors

    Returns:
        Tuple of (latitude, longitude) or (None, None) if geocoding fails
    """
    # Extract and clean components
    hospital_name = str(row['hospital_name']).strip()
    full_addr = row['full_address']
    if not isinstance(full_addr, str) or not full_addr.strip():
        # Nothing to geocode (e.g. NaN from the CSV); avoid crashing on .split().
        return (None, None)

    # Minor extra cleaning: uppercase short city names, which are likely
    # acronyms (e.g., 'Jber' -> 'JBER').
    # Assumes the format "street, city, state zip".
    addr_parts = full_addr.split(', ')
    if len(addr_parts) >= 2 and len(addr_parts[1]) <= 5:
        addr_parts[1] = addr_parts[1].upper()

    # Drop a leading unit designation that repeats the hospital name
    # (e.g., '673 Mdg 5955 Zeamer Avenue' for the '673rd Medical Group').
    # Only strip when what remains still starts with a house number; otherwise
    # the leading tokens ARE the street address (e.g., '1 Medical Center Drive')
    # and removing them would destroy it.
    street = addr_parts[0]
    if re.match(r'^\d+ [A-Z]+', street):
        tokens = street.split()
        remainder = ' '.join(tokens[2:])
        name_lower = hospital_name.lower()
        prefix_in_name = bool(hospital_name) and any(
            token.lower() in name_lower for token in tokens[:2]
        )
        if prefix_in_name and re.match(r'^\d+\s', remainder):
            addr_parts[0] = remainder
    full_addr = ', '.join(addr_parts)

    # Build queries: first with name (for POI), then without (address-only fallback)
    queries = []
    if hospital_name and hospital_name != 'nan':
        queries.append(f"{hospital_name}, {full_addr}")
    queries.append(full_addr)  # Always try pure address

    for query_addr in queries:
        # Check cache first. A cached miss must not short-circuit the whole
        # lookup: fall through to the next query instead of returning.
        if query_addr in cache:
            lat, lon = cache[query_addr]
            if lat is not None and lon is not None:
                return (lat, lon)
            continue

        no_match = False
        for attempt in range(max_retries):
            try:
                # Add delay to respect Nominatim rate limits (1 request per second)
                time.sleep(1.1)
                loc = geolocator.geocode(query_addr, timeout=15, country_codes='us')
            except (GeocoderTimedOut, GeocoderUnavailable):
                # Transient service problem: back off, then retry.
                if attempt < max_retries - 1:
                    time.sleep((attempt + 1) * 3)
                continue
            except Exception as e:
                # Stay best-effort, but surface the problem instead of hiding it.
                print(f"[!] Unexpected error geocoding {query_addr!r}: {e}")
                continue

            if loc:
                result = (loc.latitude, loc.longitude)
                cache[query_addr] = result  # Cache success
                return result

            # The service answered and found nothing; asking again would only
            # spend more rate-limited requests on the same answer.
            no_match = True
            break

        # Cache only definitive misses. If every attempt errored, leave the
        # query uncached so a later run can try it again.
        if no_match:
            cache[query_addr] = (None, None)

    return (None, None)

print("Geocoding function defined")

Geocoding function defined


In [47]:
# Utility functions for address cleaning
import re

def clean_address(address_str):
    """Clean a street address for geocoding.

    Removes PO Box references, strips a leading "Highway"/"Hwy" designation,
    tidies commas and whitespace, and expands common street-type
    abbreviations (e.g. "Ave" -> "Avenue").

    Args:
        address_str: Address text; non-string values are converted with str().

    Returns:
        str: The cleaned address.
    """
    if not isinstance(address_str, str):
        address_str = str(address_str)

    # Remove PO Box patterns (case insensitive), with or without a box number
    address_str = re.sub(r',?\s*P\.?\s*O\.?\s+Box\s+\d+', '', address_str, flags=re.IGNORECASE)
    address_str = re.sub(r',?\s*P\.?\s*O\.?\s+Box', '', address_str, flags=re.IGNORECASE)

    # Remove a leading "Highway" word (case insensitive). The negative
    # lookahead keeps the match to the whole word, so street names that merely
    # start with the same letters ("Highland Avenue", "Higgins Road") are left alone.
    address_str = re.sub(r'^\s*high?way(?![A-Za-z])\.*\s*', '', address_str, flags=re.IGNORECASE)
    # Also remove a leading "Hwy"/"Hwy." together with the route number that follows it
    address_str = re.sub(r'^\s*hwy\.?\s*\d*', '', address_str, flags=re.IGNORECASE)

    # Remove multiple commas and extra spaces
    address_str = re.sub(r',\s*,', ',', address_str)
    address_str = re.sub(r'\s+', ' ', address_str)

    # Remove leading/trailing commas and spaces
    address_str = address_str.strip(' ,')

    # Expand common abbreviations (case insensitive, whole word match to avoid partial replacements)
    abbrev_dict = {
        'Ave': 'Avenue',
        'St': 'Street',
        'Dr': 'Drive',
        'Rd': 'Road',
        'Blvd': 'Boulevard',
        'Ln': 'Lane',
        'Ct': 'Court',
        'Pl': 'Place',
        'Sq': 'Square',
        'Hwy': 'Highway',
        'Pkwy': 'Parkway',
        'Cir': 'Circle',
        'Ter': 'Terrace',
        'Plz': 'Plaza',
        # Add more as needed, e.g., for directions: 'N': 'North', but only if part of street name
    }
    for abbrev, full in abbrev_dict.items():
        # Replace whole words (e.g., "Ave" but not in "Avenue")
        address_str = re.sub(r'\b' + re.escape(abbrev) + r'\b', full, address_str, flags=re.IGNORECASE)

    return address_str

def format_zip_code(zip_code):
    """Format a ZIP code as text.

    Removes a trailing ".0" left over from float parsing and restores leading
    zeros that numeric parsing drops (e.g. 6102.0 -> "06102").

    Args:
        zip_code: ZIP code as str, int, float, or a missing value (None/NaN).

    Returns:
        str: The formatted ZIP code, or '' when the value is missing or blank.
    """
    if pd.isna(zip_code) or zip_code == '' or str(zip_code).lower() == 'nan':
        return ''

    zip_str = str(zip_code).strip()
    if zip_str.endswith('.0'):
        zip_str = zip_str[:-2]

    # A purely numeric ZIP shorter than five digits lost its leading zeros
    # when it was parsed as a number; pad it back to five digits.
    if zip_str.isdigit() and len(zip_str) < 5:
        zip_str = zip_str.zfill(5)
    return zip_str

print("Address cleaning functions defined")

Address cleaning functions defined


In [48]:
# Function to format full address properly
def format_full_address(row):
    """Create a formatted full address from a row's individual components.

    Args:
        row: DataFrame row (or mapping) with 'detail_address', 'detail_city',
            'detail_state' and 'detail_zip'.

    Returns:
        str: "Street, City, State ZIP". Missing components are left out.
    """
    def _component(value):
        # Missing CSV values arrive as NaN; str() would turn them into the text "nan".
        return '' if pd.isna(value) else str(value).strip()

    # Only the street line goes through clean_address. Running the abbreviation
    # expansion over the assembled string would also rewrite the city and state
    # (state "CT" -> "Court", city "St Paul" -> "Street Paul").
    address = clean_address(_component(row['detail_address']))
    city = _component(row['detail_city'])
    state = _component(row['detail_state'])
    zip_code = format_zip_code(row['detail_zip'])

    # Build address in standard US format: Street, City, State ZIP
    state_zip = ' '.join(part for part in (state, zip_code) if part)
    full = ', '.join(part for part in (address, city, state_zip) if part)

    # Collapse any repeated whitespace left inside the components
    return ' '.join(full.split())

print("Full address formatting function defined")

Full address formatting function defined


In [49]:
# Load the dataset
print(f"[*] Loading data from: {INPUT_FILE}")
# Read ZIP codes as text: numeric parsing drops leading zeros (06102 -> 6102),
# and the result is later written back over this same file.
df = pd.read_csv(INPUT_FILE, dtype={'detail_zip': str})
print(f"[+] Loaded {len(df)} records")
print(f"[*] Columns: {list(df.columns)}")

# Check if already geocoded (a previous run may have written lat/lon to this file)
if "lat" in df.columns and "lon" in df.columns:
    non_null_count = df[['lat', 'lon']].notna().all(axis=1).sum()
    print(f"[!] Dataset already has lat/lon columns ({non_null_count}/{len(df)} geocoded)")
else:
    print("[*] No lat/lon columns found, will add them")
    df['lat'] = None
    df['lon'] = None

# Create or recreate full address column with proper formatting.
# The column is rebuilt on every run so that changes to the cleaning
# functions take effect even if the CSV already has a full_address column.
if "full_address" not in df.columns:
    print("[*] Creating full addresses...")
else:
    print("[*] Full address column already exists")
    print("[*] Recreating addresses with proper formatting...")

df["full_address"] = df.apply(format_full_address, axis=1)
print("[+] Full addresses created/reformatted")

# Show sample addresses
print("\n[*] Sample addresses to geocode:")
for addr in df["full_address"].head(5):
    print(f"  - {addr}")

[*] Loading data from: us_er_transformed.csv
[+] Loaded 4088 records
[*] Columns: ['state_abbr', 'county_name', 'hospital_name', 'city', 'wait_text', 'wait_minutes', 'detail_url', 'detail_name', 'detail_address', 'detail_city', 'detail_state', 'detail_zip', 'detail_phone', 'detail_hospital_type', 'detail_emergency_services', 'detail_mortality_overall_text', 'detail_mortality_overall_percent', 'detail_mortality_overall_direction', 'detail_mortality_heart_attack_percent', 'detail_mortality_stroke_percent', 'detail_mortality_heart_failure_percent', 'detail_mortality_pneumonia_percent', 'detail_c_diff_cases', 'detail_mrsa_cases', 'detail_avg_time_in_ed_minutes', 'detail_overall_patient_rating', 'detail_positive_patient_ratings', 'detail_negative_patient_ratings', 'source_url', 'scrape_ts', 'ed_minutes_rating', 'detail_overall_patient_rating_points', 'mortality_overall_contribution', 'total_quality_points', 'adj_total_heartattack', 'adj_total_stroke', 'adj_total_heartfailure', 'adj_total_pn

In [50]:
# Determine which addresses need geocoding (any row missing lat or lon)
mask = df[['lat', 'lon']].isna().any(axis=1)
addresses_to_geocode = mask.sum()

# Estimate time.
# geocode_address keys the cache by the query strings it builds (optionally
# prefixed with the hospital name), so this membership test on the raw
# full_address only approximates the number of cache hits.
cached_count = sum(1 for addr in df.loc[mask, "full_address"] if addr in cache)
new_requests = addresses_to_geocode - cached_count
# One rate-limited request (1.1 s) per address is the best case: an address can
# also need a fallback query and retries, so treat this as a lower bound.
estimated_minutes = new_requests * 1.1 / 60

print(f"[*] Addresses to geocode: {addresses_to_geocode}")
print(f"    - {cached_count} will be loaded from cache")
print(f"    - {new_requests} will require API calls")
print(f"[*] Estimated time: at least {estimated_minutes:.1f} minutes (fallback queries and retries add more)")

if addresses_to_geocode == 0:
    print("\n[+] All addresses already geocoded!")
else:
    print(f"\n[!] Ready to geocode {addresses_to_geocode} addresses")

[*] Addresses to geocode: 586
    - 0 will be loaded from cache
    - 586 will require API calls
[*] Estimated time: 10.7 minutes

[!] Ready to geocode 586 addresses


In [51]:
# Geocode the addresses that need it
if addresses_to_geocode > 0:
    print("[*] Initializing geocoder (Nominatim/OpenStreetMap)...")
    geolocator = Nominatim(user_agent="hospital_geocoder_v1.0")
    print("[+] Geocoder initialized")

    print("\n" + "="*80)
    print("STARTING GEOCODING PROCESS")
    print("="*80)

    # Get rows that need geocoding
    rows_to_geocode = df[mask]

    # Counter for periodic cache saves
    save_interval = 50  # Save cache every 50 processed rows
    processed_count = 0

    # Progress bar
    progress_bar = tqdm(total=len(rows_to_geocode), desc="Geocoding")

    try:
        # geocode_address takes the whole row: it builds its own queries from
        # 'hospital_name' and the already formatted 'full_address'.
        for idx, row in rows_to_geocode.iterrows():
            lat, lon = geocode_address(row, geolocator, cache)
            df.at[idx, 'lat'] = lat
            df.at[idx, 'lon'] = lon

            progress_bar.update(1)
            processed_count += 1

            # Periodically save cache
            if processed_count % save_interval == 0:
                save_cache(cache)
    finally:
        # Runs on normal completion and on interruption/errors alike, so the
        # lookups since the last periodic save are not lost and the run can
        # be resumed from the cache.
        progress_bar.close()
        save_cache(cache)

    # Save results (reached only when the loop completed)
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"[+] Results saved to {OUTPUT_FILE}")

    print("\n" + "="*80)
    print("GEOCODING PROCESS COMPLETE")
    print("="*80)

[*] Initializing geocoder (Nominatim/OpenStreetMap)...
[+] Geocoder initialized

STARTING GEOCODING PROCESS


Geocoding:   9%|▊         | 50/586 [11:53<2:01:07, 13.56s/it]

[+] Cache saved with 100 entries


Geocoding:  17%|█▋        | 100/586 [22:56<2:08:51, 15.91s/it]

[+] Cache saved with 199 entries


Geocoding:  26%|██▌       | 150/586 [35:12<1:49:20, 15.05s/it]

[+] Cache saved with 299 entries


Geocoding:  34%|███▍      | 200/586 [46:43<1:37:12, 15.11s/it]

[+] Cache saved with 398 entries


Geocoding:  43%|████▎     | 250/586 [1:01:33<1:39:03, 17.69s/it]

[+] Cache saved with 498 entries


Geocoding:  51%|█████     | 300/586 [1:15:18<1:03:26, 13.31s/it]

[+] Cache saved with 598 entries


Geocoding:  60%|█████▉    | 350/586 [1:27:35<1:03:12, 16.07s/it]

[+] Cache saved with 697 entries


Geocoding:  68%|██████▊   | 400/586 [1:40:05<47:15, 15.24s/it]  

[+] Cache saved with 797 entries


Geocoding:  77%|███████▋  | 450/586 [1:51:16<30:15, 13.35s/it]

[+] Cache saved with 897 entries


Geocoding:  85%|████████▌ | 500/586 [2:00:46<16:48, 11.73s/it]

[+] Cache saved with 996 entries


Geocoding:  94%|█████████▍| 550/586 [2:12:09<07:56, 13.24s/it]

[+] Cache saved with 1094 entries


Geocoding: 100%|██████████| 586/586 [2:20:00<00:00, 14.33s/it]

[+] Cache saved with 1166 entries
[+] Results saved to us_er_transformed.csv

GEOCODING PROCESS COMPLETE





In [54]:
# Summarise the rows that still have no coordinates after geocoding
failed_mask = df['lat'].isna() | df['lon'].isna()
failed_count = failed_mask.sum()

print(f"[*] Addresses that failed geocoding: {failed_count}")
print(f"[*] Success rate: {((len(df) - failed_count) / len(df) * 100):.1f}%")

if failed_count == 0:
    print("\n[+] All addresses geocoded successfully!")
else:
    # Show the original components next to the formatted address for inspection
    print(f"\n[*] Sample of failed addresses:")
    failed_sample = df.loc[
        failed_mask,
        ['hospital_name', 'detail_address', 'detail_city', 'detail_state', 'full_address', 'lat'],
    ].head(10)
    display(failed_sample)

[*] Addresses that failed geocoding: 534
[*] Success rate: 86.9%

[*] Sample of failed addresses:


Unnamed: 0,hospital_name,detail_address,detail_city,detail_state,full_address,lat
0,673rd Medical Group (Joint Base Elmendorf-Rich...,673 Mdg 5955 Zeamer Ave,Jber,AK,"673 Mdg 5955 Zeamer Avenue, Jber, AK 99506",
8,Providence Seward Hospital,"417 First Avenue, Po Box 365",Seward,AK,"417 First Avenue, Seward, AK 99664",
16,South Baldwin Regional Medical Center,1613 North Mckenzie Street,Foley,AL,"1613 North Mckenzie Street, Foley, AL 36535",
27,Grove Hill Memorial Hospital,295 Jackson Hwy S,Grove Hill,AL,"295 Jackson Highway S, Grove Hill, AL 36451",
30,Clay County Hospital,83825 Highway 9 P O Box 1270,Ashland,AL,"83825 Highway 9, Ashland, AL 36251",
32,North Alabama Shoals Hospital,201 West Avalon Avenue,Muscle Shoals,AL,"201 West Avalon Avenue, Muscle Shoals, AL 35661",
34,Evergreen Medical Center,101 Crestview Avenue,Evergreen,AL,"101 Crestview Avenue, Evergreen, AL 36401",
38,Cullman Regional Medical Center,1912 Alabama Highway 157,Cullman,AL,"1912 Alabama Highway 157, Cullman, AL 35058",
46,Riverview Regional Medical Center,600 South Third Street,Gadsden,AL,"600 South Third Street, Gadsden, AL 35901",
48,Northwest Medical Center,1530 U S Highway 43,Winfield,AL,"1530 U S Highway 43, Winfield, AL 35594",


In [53]:
# Export every row that is still missing coordinates for manual review
df.loc[failed_mask].to_csv('failed_addresses.csv', index=False)