# Generate Predictor Features for Routing Disagreement Prediction

This notebook calculates the 6 predictor features required for the routing disagreement models:

1. **Straight_Line_Distance_m** - Straight-line (geodesic) distance between origin and destination, in meters
2. **Origin_Road_Length_Density_m_km2** - Road network density around origin
3. **Dest_Intersection_Density_n_km2** - Intersection density around destination
4. **Slope_Pct** - Slope percentage between origin and destination
5. **Elevation_Difference_m** - Elevation change between origin and destination
6. **Population** - Census block group population at origin

## Requirements
- CSV file with origin/destination coordinates (latitude, longitude)
- Google Earth Engine account (free): https://earthengine.google.com/
- US Census API key (free): https://api.census.gov/data/key_signup.html

## Input Format
Your CSV should have columns for origin and destination coordinates. Update the column names in the Configuration section below.

## Step 0: Install Dependencies

In [None]:
# Install required packages
# The leading `!` hands the line to the notebook's shell (IPython/Colab
# syntax, not plain Python); `-q` keeps pip's output quiet.
!pip install osmnx geopandas shapely pyproj tqdm pandas numpy geopy earthengine-api census us pygris -q

print("✓ Dependencies installed")

## Step 1: Configuration

**⚠️ UPDATE THESE SETTINGS FOR YOUR DATA**

In [None]:
#@title Configuration Settings { display-mode: "form" }
# Every user-editable setting lives in this cell; the cells below read these
# module-level names directly, so run this cell before any other step.

# ============================================================================
# FILE PATHS - Update these for your environment
# ============================================================================
INPUT_CSV = '/content/your_od_pairs.csv'  #@param {type:"string"}
OUTPUT_CSV = '/content/data_with_predictors.csv'  #@param {type:"string"}

# ============================================================================
# COLUMN NAMES - Update to match your CSV column names
# Coordinates are used as latitude/longitude in EPSG:4326 (WGS84) degrees.
# ============================================================================
COL_ORIGIN_LAT = 'origin_lat'  #@param {type:"string"}
COL_ORIGIN_LON = 'origin_lon'  #@param {type:"string"}
COL_DEST_LAT = 'dest_lat'  #@param {type:"string"}
COL_DEST_LON = 'dest_lon'  #@param {type:"string"}

# ============================================================================
# API KEYS - Get these for free from the links above
# ============================================================================
CENSUS_API_KEY = 'YOUR_CENSUS_API_KEY_HERE'  #@param {type:"string"}
GEE_PROJECT_ID = 'YOUR_GEE_PROJECT_ID'  #@param {type:"string"}

# ============================================================================
# CENSUS GEOGRAPHY - Update for your study area
# Find FIPS codes at: https://www.census.gov/library/reference/code-lists/ansi.html
# FIPS codes are kept as strings so leading zeros (e.g. '059') are preserved.
# ============================================================================
STATE_FIPS = '13'  #@param {type:"string"}
COUNTY_FIPS = '059'  #@param {type:"string"}
CENSUS_YEAR = 2020  #@param {type:"integer"}

# ============================================================================
# PROCESSING PARAMETERS (defaults should work for most cases)
# ============================================================================
BUFFER_M = 400  # Walking catchment buffer in meters
# NOTE(review): Web Mercator is not an equal-distance/equal-area projection;
# lengths measured in it grow with latitude, so BUFFER_M and the densities
# derived from it are not exact ground metres away from the equator. Confirm
# how the models' training features were generated before switching CRS.
CRS_METRIC = 3857  # EPSG:3857 Web Mercator for metric calculations
BATCH_SIZE = 50  # Rows per batch for network metrics
MAX_WORKERS = 4  # Parallel threads

# Echo the key settings so a wrong path or study area is spotted early.
print("Configuration loaded!")
print(f"  Input: {INPUT_CSV}")
print(f"  Output: {OUTPUT_CSV}")
print(f"  Study area: State {STATE_FIPS}, County {COUNTY_FIPS}")

## Step 2: Initialize Libraries and Authenticate

In [None]:
import os
import warnings
import pandas as pd
import numpy as np
import geopandas as gpd
import osmnx as ox
from shapely.geometry import Point
from geopy.distance import geodesic
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import Counter
import ee
from census import Census
import pygris

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the geospatial libraries.
warnings.filterwarnings('ignore')

# Configure OSMnx
# Console logging is switched off and OSMnx's cache is switched on
# (presumably so repeated queries for the same area are not re-downloaded).
ox.settings.log_console = False
ox.settings.use_cache = True

print("✓ Libraries imported")

In [None]:
# Authenticate with Google Earth Engine
# This will open a browser window for authentication
ee.Authenticate()
# Bind the session to the Cloud project set in the Configuration cell.
ee.Initialize(project=GEE_PROJECT_ID)

print("✓ Google Earth Engine initialized")

## Step 3: Load Your Data

In [None]:
# Load your O-D pairs
# The frame keeps pandas' default 0..n-1 index; later cells address rows by
# position, so avoid re-indexing or filtering `df` in place between steps.
df = pd.read_csv(INPUT_CSV)

print(f"✓ Loaded {len(df):,} O-D pairs")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
# Must stay the last expression of the cell so the notebook renders the preview.
df.head()

In [None]:
# Fail fast if any configured coordinate column is absent from the CSV
required_cols = [COL_ORIGIN_LAT, COL_ORIGIN_LON, COL_DEST_LAT, COL_DEST_LON]
missing = list(filter(lambda name: name not in df.columns, required_cols))

if missing:
    # Show both what is missing and what is available so the fix is obvious
    raise ValueError(
        f"Missing required columns: {missing}\n"
        f"Available columns: {list(df.columns)}\n"
        f"Please update the column name configuration above."
    )

print("✓ All required columns found")

## Step 4: Generate Network Predictors

This step calculates:
- **Straight_Line_Distance_m** - Straight-line (geodesic) distance between origin and destination
- **Origin_Road_Length_Density_m_km2** - Road density at origin
- **Dest_Intersection_Density_n_km2** - Intersection density at destination

⏱️ This may take a while for large datasets due to OSM API queries.

In [None]:
# Helper functions
def area_km2(geom3857):
    """Convert a geometry's ``.area`` from square metres to square kilometres.

    The geometry is expected to be in a metre-based projected CRS (the
    notebook uses EPSG:3857), so ``.area`` is read as m².
    """
    square_metres_per_km2 = 1_000_000
    return geom3857.area / square_metres_per_km2

def buffer_from_latlon(lat, lon):
    """Return a circular buffer of radius ``BUFFER_M`` around a lat/lon point.

    The point is built in EPSG:4326 (note the x=lon, y=lat order), projected
    to ``CRS_METRIC`` and buffered there, so the returned polygon is in
    ``CRS_METRIC`` coordinates.
    """
    point_wgs84 = gpd.GeoSeries([Point(lon, lat)], crs=4326)
    point_metric = point_wgs84.to_crs(CRS_METRIC)
    buffered = point_metric.buffer(BUFFER_M)
    return buffered[0]

def calculate_haversine(row):
    """Return the straight-line origin→destination distance of a row, in metres.

    Despite the function's name, the value is computed with geopy's
    ``geodesic`` rather than a haversine formula. ``row`` must expose the
    four configured coordinate columns.
    """
    start = (row[COL_ORIGIN_LAT], row[COL_ORIGIN_LON])
    end = (row[COL_DEST_LAT], row[COL_DEST_LON])
    separation = geodesic(start, end)
    return separation.meters

def net_metrics(buf_metric):
    """
    Compute street-network predictors inside a buffer polygon.

    Downloads the OSM ``drive_service`` network that falls inside the buffer
    (via OSMnx) and derives:
      - intersect_density: nodes where ≥3 streets meet, per km²
      - road_len_density: total graph edge length (m) per km²

    Parameters
    ----------
    buf_metric : shapely geometry
        Buffer polygon expressed in ``CRS_METRIC`` coordinates.

    Returns
    -------
    tuple of float
        ``(intersect_density, road_len_density)``. Both values are 0.0 when
        the buffer has no area, when no network is found, or when the query
        fails; failures are logged as warnings rather than raised so one bad
        row does not abort a long batch run.
    """
    import logging  # local import keeps this block self-contained

    intersect_density = road_len_density = 0.0
    area = area_km2(buf_metric)
    if not area > 0:
        # Degenerate buffer: densities are undefined, so report zeros
        # instead of querying OSM and dividing by zero.
        return intersect_density, road_len_density

    # Re-project buffer to WGS84 for OSMnx queries
    buf_wgs = gpd.GeoSeries([buf_metric], crs=CRS_METRIC).to_crs(4326).iloc[0]

    try:
        G = ox.graph_from_polygon(
            buf_wgs,
            network_type="drive_service",
            simplify=True,
            retain_all=False
        )

        if len(G.nodes) and len(G.edges):
            nodes, edges = ox.graph_to_gdfs(G, nodes=True, edges=True)

            # Road length density
            # NOTE(review): the graph is directed, so a two-way street may
            # contribute one edge per direction to this sum — confirm this
            # matches how the models' training features were computed.
            edges_m = edges.to_crs(CRS_METRIC)
            road_len_m = edges_m.geometry.length.sum()
            road_len_density = road_len_m / area

            # Intersection density (only nodes that lie inside the buffer)
            nodes_m = nodes.to_crs(CRS_METRIC)
            nodes_cl = gpd.clip(nodes_m, buf_metric)

            if "street_count" in nodes_cl.columns:
                inter_cnt = (nodes_cl["street_count"] >= 3).sum()
            else:
                # Fallback when OSMnx did not provide street_count: count the
                # distinct neighbouring nodes. Counting raw directed edge
                # endpoints would give an ordinary two-way through-node a
                # degree of 4 and misclassify it as an intersection.
                inter_cnt = sum(
                    1
                    for n in nodes_cl.index
                    if len((set(G.predecessors(n)) | set(G.successors(n))) - {n}) >= 3
                )

            intersect_density = inter_cnt / area

    except Exception as exc:
        # Best effort: keep whatever was computed (zeros by default), but
        # leave a trace so failed queries are distinguishable from areas
        # that genuinely have no roads.
        logging.getLogger(__name__).warning(
            "net_metrics: network query failed (%s: %s); returning (%s, %s)",
            type(exc).__name__, exc, intersect_density, road_len_density,
        )

    return intersect_density, road_len_density

print("✓ Network metric functions defined")

In [None]:
# Calculate straight-line distance
# tqdm.pandas() registers `progress_apply`, so the row-wise pass below shows a
# progress bar. NOTE(review): despite the helper's name, calculate_haversine
# returns geopy's geodesic distance in metres.
print("Calculating straight-line distances...")
tqdm.pandas(desc="Computing distances")
df['Straight_Line_Distance_m'] = df.progress_apply(calculate_haversine, axis=1)

print(f"\n✓ Distance range: [{df['Straight_Line_Distance_m'].min():.1f}, "
      f"{df['Straight_Line_Distance_m'].max():.1f}] meters")

In [None]:
# Build BUFFER_M-metre buffers around every origin and destination.
# The message reports the configured radius instead of a hard-coded "400m",
# so it stays correct when BUFFER_M is changed in the Configuration cell.
print(f"Building {BUFFER_M}m buffers...")

# Each buffer is a polygon in CRS_METRIC; the two helper columns are consumed
# by the network-metrics step and dropped again before saving.
tqdm.pandas(desc="Origin buffers")
df['buf_origin'] = df.progress_apply(
    lambda r: buffer_from_latlon(r[COL_ORIGIN_LAT], r[COL_ORIGIN_LON]),
    axis=1
)

tqdm.pandas(desc="Destination buffers")
df['buf_dest'] = df.progress_apply(
    lambda r: buffer_from_latlon(r[COL_DEST_LAT], r[COL_DEST_LON]),
    axis=1
)

print("\n✓ Buffers created")

In [None]:
# Calculate network metrics
print(f"Processing network metrics for {len(df):,} O-D pairs...")
print(f"This may take a while. Progress will be shown below.")

# Initialize columns
# All four densities start at 0.0; rows whose processing reports an error in
# the loop below are never overwritten and therefore keep this 0.0 default.
df['Origin_Road_Length_Density_m_km2'] = 0.0
df['Origin_Intersection_Density_n_km2'] = 0.0
df['Dest_Road_Length_Density_m_km2'] = 0.0
df['Dest_Intersection_Density_n_km2'] = 0.0

def process_row(i, row):
    """Compute origin and destination network metrics for one row.

    Intended to be submitted to a thread pool: it never raises. Returns
    ``(i, origin_intersection_density, origin_road_density,
    dest_intersection_density, dest_road_density, error)`` where ``error`` is
    ``None`` on success, or the exception text (with all four metrics set to
    0.0) on failure.
    """
    try:
        origin_inter, origin_road = net_metrics(row.buf_origin)
        dest_inter, dest_road = net_metrics(row.buf_dest)
    except Exception as exc:
        return i, 0.0, 0.0, 0.0, 0.0, str(exc)
    return i, origin_inter, origin_road, dest_inter, dest_road, None

# Process in batches with progress bar
total = len(df)
success_count = 0
error_count = 0
error_messages = []  # (row position, message) for every row that failed

# Column positions do not change during the loop, so look them up once.
# The order matches the tuple returned by process_row:
# origin intersections, origin road length, dest intersections, dest road length.
metric_col_positions = [
    df.columns.get_loc('Origin_Intersection_Density_n_km2'),
    df.columns.get_loc('Origin_Road_Length_Density_m_km2'),
    df.columns.get_loc('Dest_Intersection_Density_n_km2'),
    df.columns.get_loc('Dest_Road_Length_Density_m_km2'),
]

with tqdm(total=total, desc="Processing network metrics") as pbar:
    for batch_start in range(0, total, BATCH_SIZE):
        batch_end = min(batch_start + BATCH_SIZE, total)

        # One pool per batch bounds the number of queued OSM requests.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(process_row, i, df.iloc[i]): i
                for i in range(batch_start, batch_end)
            }

            # Results are written from this (main) thread only, so the
            # DataFrame is never mutated concurrently.
            for fut in as_completed(futures):
                i, id_o, rd_o, id_d, rd_d, err = fut.result()
                # Compare against None: an exception with an empty message
                # yields "" and must still count as an error.
                if err is not None:
                    error_count += 1
                    error_messages.append((i, err))
                else:
                    df.iloc[i, metric_col_positions] = [id_o, rd_o, id_d, rd_d]
                    success_count += 1
                pbar.update(1)

print(f"\n✓ Network metrics complete")
print(f"  Success: {success_count:,}")
print(f"  Errors: {error_count}")
# Show a sample of failures so they can be diagnosed; failed rows keep 0.0.
for row_pos, message in error_messages[:5]:
    print(f"    row {row_pos}: {message}")
if error_count > 5:
    print(f"    ... and {error_count - 5} more")

## Step 5: Generate Topographic Predictors

This step calculates:
- **Elevation_Difference_m** - Elevation change from origin to destination
- **Slope_Pct** - Slope as a percentage of horizontal distance

In [None]:
# Google Earth Engine setup
DEM = ee.Image('USGS/SRTMGL1_003')  # 30m SRTM DEM

def ee_sample_points(img, points_gdf):
    """
    Sample the first band of an Earth Engine image at point locations.

    Parameters
    ----------
    img : ee.Image
        Image to sample; only its first band is read.
    points_gdf : GeoDataFrame
        Points whose ``x``/``y`` are longitude/latitude.

    Returns
    -------
    numpy.ndarray
        Float array with exactly one value per input point, in input order.
        Points for which Earth Engine returns no sample are ``NaN``.
    """
    geoms = list(points_gdf.geometry)
    values = np.full(len(geoms), np.nan, dtype=float)
    if not geoms:
        return values

    # Tag every point with its position. The sampled collection is not
    # guaranteed to contain one feature per input point (points without a
    # valid pixel can be missing), so results are matched back by this tag
    # instead of by list position.
    fc = ee.FeatureCollection([
        ee.Feature(ee.Geometry.Point(pt.x, pt.y), {'pt_idx': pos})
        for pos, pt in enumerate(geoms)
    ])

    sampled = img.sampleRegions(
        collection=fc,
        properties=['pt_idx'],
        scale=30,
        tileScale=4
    )

    band_name = img.bandNames().get(0)
    # Fetch (index, value) pairs in a single round trip so each value stays
    # attached to the point it belongs to.
    pairs = sampled.reduceColumns(
        ee.Reducer.toList(2), ['pt_idx', band_name]
    ).get('list').getInfo()

    for pos, value in pairs:
        values[int(pos)] = value
    return values

print("✓ Topographic functions defined")

In [None]:
# Process topographic data in batches
print("Sampling elevation data from SRTM DEM...")

# Rows of a batch that fails keep these NaN defaults.
df['Elevation_Origin_m'] = np.nan
df['Elevation_Dest_m'] = np.nan
df['Elevation_Difference_m'] = np.nan
df['Slope_Pct'] = np.nan

GEE_BATCH_SIZE = 500  # GEE handles larger batches well
total = len(df)
failed_batches = 0

# Batches are defined by row position, so rows are read and written with
# .iloc. A positional slice passed to label-based .loc is end-inclusive (one
# extra row per batch) and only lines up with a default 0..n-1 index.
topo_col_pos = {
    name: df.columns.get_loc(name)
    for name in ('Elevation_Origin_m', 'Elevation_Dest_m',
                 'Elevation_Difference_m', 'Slope_Pct')
}

with tqdm(total=total, desc="Processing elevation data") as pbar:
    for batch_start in range(0, total, GEE_BATCH_SIZE):
        batch_end = min(batch_start + GEE_BATCH_SIZE, total)
        batch_idx = slice(batch_start, batch_end)
        batch = df.iloc[batch_idx]
        n_rows = len(batch)

        try:
            # Build point geometries (x=lon, y=lat)
            pts_origin = gpd.GeoSeries([
                Point(lon, lat) for lon, lat in zip(
                    batch[COL_ORIGIN_LON],
                    batch[COL_ORIGIN_LAT]
                )
            ], crs=4326)

            pts_dest = gpd.GeoSeries([
                Point(lon, lat) for lon, lat in zip(
                    batch[COL_DEST_LON],
                    batch[COL_DEST_LAT]
                )
            ], crs=4326)

            # Sample elevation
            elev_origin = ee_sample_points(DEM, gpd.GeoDataFrame(geometry=pts_origin))
            elev_dest = ee_sample_points(DEM, gpd.GeoDataFrame(geometry=pts_dest))

            # Refuse to write results that cannot be aligned row-for-row.
            if len(elev_origin) != n_rows or len(elev_dest) != n_rows:
                raise ValueError(
                    f"expected {n_rows} elevation samples, got "
                    f"{len(elev_origin)} (origin) and {len(elev_dest)} (destination)"
                )

            # Calculate derived metrics
            delta_elev = elev_dest - elev_origin
            euclidean_dist = batch['Straight_Line_Distance_m'].to_numpy(dtype=float)
            # Zero-length trips have no defined slope: use NaN, not a division by zero
            euclidean_dist = np.where(euclidean_dist == 0, np.nan, euclidean_dist)
            slope_pct = (delta_elev / euclidean_dist) * 100

            # Assign results
            df.iloc[batch_idx, topo_col_pos['Elevation_Origin_m']] = elev_origin
            df.iloc[batch_idx, topo_col_pos['Elevation_Dest_m']] = elev_dest
            df.iloc[batch_idx, topo_col_pos['Elevation_Difference_m']] = delta_elev
            df.iloc[batch_idx, topo_col_pos['Slope_Pct']] = slope_pct

        except Exception as e:
            failed_batches += 1
            print(f"\nWarning: Error in batch {batch_start}-{batch_end}: {e}")

        pbar.update(batch_end - batch_start)

if failed_batches:
    print(f"\n⚠ Topographic data complete with {failed_batches} failed batch(es); "
          f"their rows are left as NaN")
else:
    print("\n✓ Topographic data complete")

## Step 6: Generate Demographic Predictors

This step calculates:
- **Population** - Census block group population at origin

⚠️ This uses US Census data. If your study area is outside the US, you'll need to substitute an appropriate population data source.

In [None]:
# Download Census block group boundaries
print(f"Downloading Census block groups for State {STATE_FIPS}, County {COUNTY_FIPS}...")

try:
    # cache=True is passed so pygris can reuse a previously downloaded file.
    block_groups = pygris.block_groups(
        state=STATE_FIPS,
        county=COUNTY_FIPS,
        year=CENSUS_YEAR,
        cache=True
    )
    # Same CRS as the origin points used in the spatial join later on.
    block_groups = block_groups.to_crs(CRS_METRIC)
    print(f"✓ Downloaded {len(block_groups)} block groups")
except Exception as e:
    # Print a hint, then re-raise: the population step cannot run without
    # the boundaries.
    print(f"Error downloading block groups: {e}")
    print("Check your FIPS codes and internet connection.")
    raise

In [None]:
# Fetch population data from Census API
print("Fetching population data from Census API...")

try:
    c = Census(CENSUS_API_KEY)

    # NOTE(review): P1_001N is requested from the PL (redistricting) dataset;
    # this variable name presumably only exists for the 2020 vintage —
    # verify before changing CENSUS_YEAR.
    pop_data = c.pl.get(
        ('P1_001N',),  # Total population
        geo={
            'for': 'block group:*',
            'in': f'state:{STATE_FIPS} county:{COUNTY_FIPS}'
        },
        year=CENSUS_YEAR
    )

    # An empty response would otherwise surface below as a confusing
    # KeyError on the missing 'state' column.
    if not pop_data:
        raise ValueError(
            f"Census API returned no block groups for state {STATE_FIPS}, "
            f"county {COUNTY_FIPS}, year {CENSUS_YEAR}"
        )

    pop_df = pd.DataFrame(pop_data)
    pop_df = pop_df.rename(columns={'P1_001N': 'Population'})
    # Concatenate the geography parts into one identifier; it is used as the
    # join key against the block group boundaries' GEOID column.
    pop_df['GEOID'] = (
        pop_df['state'] +
        pop_df['county'] +
        pop_df['tract'] +
        pop_df['block group']
    )
    pop_df = pop_df[['GEOID', 'Population']].astype({'Population': float})

    print(f"✓ Population range: [{pop_df['Population'].min():.0f}, {pop_df['Population'].max():.0f}]")

except Exception as e:
    # Print a hint, then re-raise: the spatial join needs pop_df.
    print(f"Error fetching Census data: {e}")
    print("Check your Census API key, FIPS codes, and CENSUS_YEAR.")
    raise

In [None]:
# Spatial join to assign population to each origin
print("Performing spatial join...")

# Merge population with block group geometries. A Population column left over
# from an earlier run of this cell is dropped first; otherwise the merge would
# create Population_x / Population_y and the join below would fail.
block_groups = (
    block_groups
    .drop(columns=['Population'], errors='ignore')
    .merge(pop_df, on='GEOID', how='left')
)
block_groups['Population'] = block_groups['Population'].fillna(0)

# Create GeoDataFrame of origin points. It carries only the geometry and a
# positional 0..n-1 index, so `df` is never modified by GeoPandas and columns
# in the input CSV (e.g. its own GEOID or Population) cannot clash in the join.
geometry = [
    Point(lon, lat)
    for lon, lat in zip(df[COL_ORIGIN_LON], df[COL_ORIGIN_LAT])
]
gdf_origins = gpd.GeoDataFrame(
    geometry=gpd.GeoSeries(geometry, index=pd.RangeIndex(len(df)), crs=4326)
)
gdf_origins = gdf_origins.to_crs(CRS_METRIC)

# Spatial join
gdf_joined = gdf_origins.sjoin(
    block_groups[['GEOID', 'Population', 'geometry']],
    how='left',
    predicate='within'
)

# A left join returns one row per match, so an origin matching more than one
# polygon would appear twice. Keep the first match per origin and realign to
# the original row order before writing back by position.
gdf_joined = gdf_joined[~gdf_joined.index.duplicated(keep='first')]
gdf_joined = gdf_joined.reindex(gdf_origins.index)

# Origins outside every block group get a population of 0.
df['Population'] = gdf_joined['Population'].fillna(0).to_numpy()

# Count real matches via GEOID: a block group with zero residents is still
# a match, which a `Population > 0` test would miss.
n_matched = int(gdf_joined['GEOID'].notna().sum())
print(f"✓ Matched {n_matched:,} / {len(df):,} origins to block groups")

## Step 7: Save Results

In [None]:
# The six model predictors; used for the summary printed at the end
predictor_cols = [
    'Straight_Line_Distance_m',
    'Origin_Road_Length_Density_m_km2',
    'Dest_Intersection_Density_n_km2',
    'Slope_Pct',
    'Elevation_Difference_m',
    'Population'
]

# Intermediate columns that are not written out; columns that do not exist
# are skipped silently
cols_to_drop = ['buf_origin', 'buf_dest', 'Elevation_Origin_m', 'Elevation_Dest_m',
                'Origin_Intersection_Density_n_km2', 'Dest_Road_Length_Density_m_km2']
df_final = df.drop(columns=cols_to_drop, errors='ignore')

# Write the original data plus predictors, without the index column
df_final.to_csv(OUTPUT_CSV, index=False)

banner = "=" * 60
rule = "-" * 60

print("\n" + banner)
print("PREDICTOR GENERATION COMPLETE")
print(banner)
print(f"\nSaved to: {OUTPUT_CSV}")
print(f"Records: {len(df_final):,}")
print("\nPredictor Summary:")
print(rule)
for col in predictor_cols:
    if col not in df_final.columns:
        continue  # predictor was never generated; leave it out of the summary
    print(f"{col:40} Mean: {df_final[col].mean():10.2f}")
print(rule)
print("\n✓ Ready for model prediction!")

In [None]:
# Download the file (Colab)
# google.colab only exists inside Colab; elsewhere (local Jupyter, scripts)
# the import fails, so fall back to pointing at the file that was saved.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab - results are available at: {OUTPUT_CSV}")
else:
    files.download(OUTPUT_CSV)