# Train Routing Disagreement Prediction Models

This notebook trains Random Forest models to predict pedestrian routing disagreements among Google Maps, ArcGIS, and OpenRouteService.

**Use this notebook if you want to:**
- Train models on your own study area
- Modify classification thresholds
- Experiment with different features or algorithms

**If you just want to use the pre-trained models, use `03_apply_pretrained_models.ipynb` instead.**

## Requirements
- Dataset with routing distance and travel-time estimates from all 3 platforms (ORS, ArcGIS, Google Maps)
- The 6 predictor features (generated using `01_generate_predictors.ipynb`)

## Step 0: Install Dependencies

In [None]:
# Install into the environment of the running kernel. `%pip` (unlike `!pip`, which
# shells out to whatever `pip` is first on PATH) always targets the interpreter this
# notebook is executing in.
%pip install -q scikit-learn pandas numpy matplotlib seaborn joblib
print("✓ Dependencies installed")

## Step 1: Configuration

In [None]:
#@title Configuration Settings { display-mode: "form" }
import os

# ============================================================================
# FILE PATHS
# ============================================================================
INPUT_CSV = '/content/your_data_with_routing.csv'  #@param {type:"string"}
MODEL_OUTPUT_DIR = '/content/models/'  #@param {type:"string"}

# Later cells build output paths by plain concatenation
# (e.g. f'{MODEL_OUTPUT_DIR}roc_curves.png'), so make sure the directory always ends
# with a path separator even when it was typed without one in the form field.
MODEL_OUTPUT_DIR = os.path.join(MODEL_OUTPUT_DIR, '')

# ============================================================================
# FEATURE COLUMNS - The 6 predictors
# ============================================================================
FEATURE_COLS = [
    'Straight_Line_Distance_m',
    'Origin_Road_Length_Density_m_km2',
    'Dest_Intersection_Density_n_km2',
    'Slope_Pct',
    'Elevation_Difference_m',
    'Population'
]

# ============================================================================
# ROUTING COLUMNS - Update to match your column names
# ============================================================================
# Distance columns from each platform
DISTANCE_COLS = ['ORS_Dist_m', 'Arc_Dist_m', 'GMaps_Dist_m']

# Time columns from each platform
TIME_COLS = ['ORS_Time_min', 'Arc_Time_min', 'GMaps_Time_min']

# ============================================================================
# CLASSIFICATION THRESHOLDS
# ============================================================================
# Both thresholds are percentages: a record is labelled "disagreement" when the
# spread between platforms is at least this percent of the smallest estimate.
DISTANCE_THRESHOLD = 5   #@param {type:"integer"}
TIME_THRESHOLD = 20      #@param {type:"integer"}

# ============================================================================
# MODEL PARAMETERS
# ============================================================================
RANDOM_STATE = 42   # seed shared by the train/test split and both forests
TEST_SIZE = 0.20    # fraction of complete cases held out for evaluation

# Create the output directory up front so every later save succeeds
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

print("Configuration loaded!")
print(f"  Distance threshold: {DISTANCE_THRESHOLD}%")
print(f"  Time threshold: {TIME_THRESHOLD}%")

## Step 2: Load and Prepare Data

In [None]:
# Data handling
import pandas as pd
import numpy as np
# Modelling and evaluation (scikit-learn)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (roc_auc_score, accuracy_score, f1_score,
                              precision_score, recall_score, confusion_matrix,
                              roc_curve, classification_report)
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Model persistence
import joblib
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# ones that may point at real data or modelling problems; consider narrowing the filter.
warnings.filterwarnings('ignore')

print("✓ Libraries imported")

In [None]:
# Load data
df = pd.read_csv(INPUT_CSV)
print(f"✓ Loaded {len(df):,} records")

# Verify columns. Every later cell indexes the frame by these names, so stop here with
# a clear message instead of letting a downstream cell fail with an opaque KeyError.
all_required = FEATURE_COLS + DISTANCE_COLS + TIME_COLS
missing = [c for c in all_required if c not in df.columns]
if missing:
    print(f"\n⚠️ Missing columns: {missing}")
    print(f"Available columns: {list(df.columns)}")
    raise ValueError(
        f"Input data is missing required columns: {missing}. "
        "Update FEATURE_COLS / DISTANCE_COLS / TIME_COLS in the configuration cell "
        "or fix the input file."
    )
print("✓ All required columns found")

In [None]:
# Disagreement between platforms, expressed as a percentage
def calculate_disagreement(df, cols):
    """Return the row-wise spread across ``cols`` as a percent of the row minimum.

    For each row the smallest and largest values among ``cols`` are taken and
    ``(max - min) / min * 100`` is computed.

    Args:
        df: DataFrame that contains every column listed in ``cols``.
        cols: Column labels holding the per-platform estimates to compare.

    Returns:
        A Series aligned with ``df``'s index holding the percentage spread.
        There is no guard for a zero minimum: pandas division then yields
        ``inf`` (or ``NaN`` when the maximum is zero as well).
    """
    platform_values = df[cols]
    lowest = platform_values.min(axis=1)
    spread = platform_values.max(axis=1) - lowest
    return spread.div(lowest).mul(100)

# Distance model inputs: keep only rows where every predictor and every platform
# distance is present, then label each row by whether the spread reaches the threshold.
complete_dist = df[[*FEATURE_COLS, *DISTANCE_COLS]].dropna()
disagreement_dist = calculate_disagreement(complete_dist, DISTANCE_COLS)
X_dist = complete_dist[FEATURE_COLS]
y_dist = disagreement_dist.ge(DISTANCE_THRESHOLD).astype(int)  # 1 = disagreement, 0 = agreement
dist_agree = y_dist.eq(0)
dist_disagree = y_dist.eq(1)

print(f"\nDISTANCE MODEL (threshold: {DISTANCE_THRESHOLD}%)")
print(f"  Complete cases: {len(X_dist):,}")
print(f"  Agreement (0): {dist_agree.sum():,} ({dist_agree.mean():.1%})")
print(f"  Disagreement (1): {dist_disagree.sum():,} ({dist_disagree.mean():.1%})")

# Time model inputs: same recipe, using the platform travel-time columns
complete_time = df[[*FEATURE_COLS, *TIME_COLS]].dropna()
disagreement_time = calculate_disagreement(complete_time, TIME_COLS)
X_time = complete_time[FEATURE_COLS]
y_time = disagreement_time.ge(TIME_THRESHOLD).astype(int)  # 1 = disagreement, 0 = agreement
time_agree = y_time.eq(0)
time_disagree = y_time.eq(1)

print(f"\nTIME MODEL (threshold: {TIME_THRESHOLD}%)")
print(f"  Complete cases: {len(X_time):,}")
print(f"  Agreement (0): {time_agree.sum():,} ({time_agree.mean():.1%})")
print(f"  Disagreement (1): {time_disagree.sum():,} ({time_disagree.mean():.1%})")

## Step 3: Train/Test Split

In [None]:
# Stratified train/test split: both targets use the same hold-out fraction and seed,
# and stratifying on the label keeps the class proportions equal in train and test.
split_options = {'test_size': TEST_SIZE, 'random_state': RANDOM_STATE}

(X_train_dist, X_test_dist,
 y_train_dist, y_test_dist) = train_test_split(X_dist, y_dist, stratify=y_dist, **split_options)

(X_train_time, X_test_time,
 y_train_time, y_test_time) = train_test_split(X_time, y_time, stratify=y_time, **split_options)

print(f"Distance Model: Train={len(X_train_dist):,}, Test={len(X_test_dist):,}")
print(f"Time Model: Train={len(X_train_time):,}, Test={len(X_test_time):,}")

## Step 4: Train Models

In [None]:
# Hyperparameters common to both forests; only the number of trees differs per model
shared_rf_params = {
    'class_weight': 'balanced',
    'max_depth': 12,
    'max_features': 'sqrt',
    'min_samples_leaf': 3,
    'min_samples_split': 10,
    'random_state': RANDOM_STATE,
    'n_jobs': -1
}

# Distance model parameters
distance_params = {'n_estimators': 300, **shared_rf_params}

# Time model parameters
time_params = {'n_estimators': 450, **shared_rf_params}

print("Training distance model...")
distance_model = RandomForestClassifier(**distance_params).fit(X_train_dist, y_train_dist)
print(f"✓ Distance model trained ({distance_params['n_estimators']} trees)")

print("\nTraining time model...")
time_model = RandomForestClassifier(**time_params).fit(X_train_time, y_train_time)
print(f"✓ Time model trained ({time_params['n_estimators']} trees)")

## Step 5: Evaluate Models

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate a fitted binary classifier on held-out data and print a report.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            Assumed to have been trained on the labels 0 and 1, so that column 1
            of ``predict_proba`` is the probability of label 1.
        X_test: Feature matrix to score.
        y_test: True 0/1 labels for ``X_test``.
        model_name: Heading printed above the report.

    Returns:
        tuple: ``(metrics, y_proba)`` where ``metrics`` is a dict holding the
        confusion-matrix counts (``TP``, ``TN``, ``FP``, ``FN``, plain ints) and
        ``Sensitivity``, ``Specificity``, ``Precision``, ``Accuracy``,
        ``F1-Score`` and ``AUC`` as percentages, and ``y_proba`` is the second
        column of ``model.predict_proba(X_test)``. A rate whose denominator is
        zero, and the AUC when ``y_test`` holds a single class, is reported as NaN.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Confusion matrix. Passing labels explicitly guarantees a 2x2 matrix even when
    # the test labels or the predictions contain only one class; without it the
    # four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = (int(count) for count in cm.ravel())

    def _pct(numerator, denominator):
        """Return numerator/denominator as a percentage, or NaN when undefined."""
        return numerator / denominator * 100 if denominator else float('nan')

    # ROC AUC is undefined (scikit-learn raises) when only one class is present
    both_classes_present = len(set(y_test)) > 1
    auc = roc_auc_score(y_test, y_proba) * 100 if both_classes_present else float('nan')

    # Metrics
    metrics = {
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'Sensitivity': _pct(tp, tp + fn),
        'Specificity': _pct(tn, tn + fp),
        'Precision': _pct(tp, tp + fp),
        'Accuracy': _pct(tp + tn, len(y_test)),
        'F1-Score': f1_score(y_test, y_pred) * 100,
        'AUC': auc
    }

    print(f"\n{'='*50}")
    print(f"{model_name} RESULTS")
    print(f"{'='*50}")
    print(f"\nConfusion Matrix:")
    print(f"  TP: {tp:,}  FP: {fp:,}")
    print(f"  FN: {fn:,}  TN: {tn:,}")
    print(f"\nMetrics:")
    print(f"  AUC:         {metrics['AUC']:.1f}%")
    print(f"  Accuracy:    {metrics['Accuracy']:.1f}%")
    print(f"  Sensitivity: {metrics['Sensitivity']:.1f}%")
    print(f"  Specificity: {metrics['Specificity']:.1f}%")
    print(f"  Precision:   {metrics['Precision']:.1f}%")
    print(f"  F1-Score:    {metrics['F1-Score']:.1f}%")

    return metrics, y_proba

# Score each model on its own held-out test set; keep the metrics and the predicted
# probabilities for the plots and the summary table further down.
dist_metrics, dist_proba = evaluate_model(
    model=distance_model,
    X_test=X_test_dist,
    y_test=y_test_dist,
    model_name="DISTANCE MODEL",
)
time_metrics, time_proba = evaluate_model(
    model=time_model,
    X_test=X_test_time,
    y_test=y_test_time,
    model_name="TIME MODEL",
)

## Step 6: Visualize Results

In [None]:
# Plot ROC Curves: one panel per model, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# ROC points for each model from its held-out labels and predicted probabilities
fpr_dist, tpr_dist, _ = roc_curve(y_test_dist, dist_proba)
fpr_time, tpr_time, _ = roc_curve(y_test_time, time_proba)

# (axis, false-positive rates, true-positive rates, line format, metrics, title)
roc_panels = [
    (axes[0], fpr_dist, tpr_dist, 'b-', dist_metrics, 'Distance Model ROC Curve'),
    (axes[1], fpr_time, tpr_time, 'orange', time_metrics, 'Time Model ROC Curve'),
]

for panel, fpr, tpr, line_fmt, panel_metrics, panel_title in roc_panels:
    panel.plot(fpr, tpr, line_fmt, linewidth=2,
               label=f'Model (AUC = {panel_metrics["AUC"]:.1f}%)')
    # Diagonal = performance of a random classifier, for reference
    panel.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random')
    panel.set_xlabel('False Positive Rate')
    panel.set_ylabel('True Positive Rate')
    panel.set_title(panel_title)
    panel.legend(loc='lower right')
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f'{MODEL_OUTPUT_DIR}roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ ROC curves saved")

In [None]:
# Feature Importance
# Display labels keyed by predictor column. The tick labels are derived from
# FEATURE_COLS so they always match the number and order of the importances the
# models report; a column without an entry here falls back to its raw name.
FEATURE_DISPLAY_NAMES = {
    'Straight_Line_Distance_m': 'Straight-line Distance',
    'Origin_Road_Length_Density_m_km2': 'Road Length Density (Origin)',
    'Dest_Intersection_Density_n_km2': 'Intersection Density (Dest)',
    'Slope_Pct': 'Slope (%)',
    'Elevation_Difference_m': 'Elevation Difference',
    'Population': 'Population'
}
feature_names = [FEATURE_DISPLAY_NAMES.get(col, col) for col in FEATURE_COLS]

fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(feature_names))
width = 0.35  # bar width; the two models' bars sit either side of each tick

bars1 = ax.bar(x - width/2, distance_model.feature_importances_, width,
               label='Distance Model', color='steelblue', alpha=0.8)
bars2 = ax.bar(x + width/2, time_model.feature_importances_, width,
               label='Time Model', color='orange', alpha=0.8)

ax.set_ylabel('Feature Importance')
ax.set_title('Feature Importance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(feature_names, rotation=45, ha='right')
ax.legend()
ax.grid(alpha=0.3, axis='y')

# Add value labels just above each bar
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.3f}', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.savefig(f'{MODEL_OUTPUT_DIR}feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Feature importance plot saved")

## Step 7: Save Models

In [None]:
# Persist both fitted forests with joblib inside the configured output directory
distance_model_path = f'{MODEL_OUTPUT_DIR}distance_model.joblib'
time_model_path = f'{MODEL_OUTPUT_DIR}time_model.joblib'

models_to_save = (
    (distance_model, distance_model_path),
    (time_model, time_model_path),
)
for fitted_model, destination in models_to_save:
    joblib.dump(fitted_model, destination)

# Report where the files went, plus the headline test-set metrics
print(f"\n{'='*60}")
print("MODELS SAVED")
print(f"{'='*60}")
print(f"\nDistance model: {distance_model_path}")
print(f"Time model: {time_model_path}")
print(f"\nModel Summary:")
print(f"  Distance Model - AUC: {dist_metrics['AUC']:.1f}%, Accuracy: {dist_metrics['Accuracy']:.1f}%")
print(f"  Time Model - AUC: {time_metrics['AUC']:.1f}%, Accuracy: {time_metrics['Accuracy']:.1f}%")

In [None]:
# Download models (Colab)
# The google.colab package only exists inside Google Colab. Import it defensively so
# that running this notebook elsewhere skips the download instead of crashing.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print("Not running in Google Colab - skipping browser download.")
    print(f"Models are available at: {distance_model_path} and {time_model_path}")
else:
    print("Downloading models...")
    files.download(distance_model_path)
    files.download(time_model_path)
    print("✓ Models downloaded")

## Summary Table (for paper)

In [None]:
# Create summary table: one row per metric, one column per model
count_keys = ('TP', 'TN', 'FP', 'FN')
percent_keys = ('Sensitivity', 'Specificity', 'Precision', 'Accuracy', 'F1-Score', 'AUC')


def _summary_column(metrics):
    """Return one table column: raw counts, then percentages as one-decimal strings."""
    counts = [metrics[key] for key in count_keys]
    percentages = [f"{metrics[key]:.1f}" for key in percent_keys]
    return counts + percentages


summary_data = {
    'Metric': ['True Positives (TP)', 'True Negatives (TN)', 'False Positives (FP)',
               'False Negatives (FN)', 'Sensitivity (%)', 'Specificity (%)',
               'Precision (%)', 'Accuracy (%)', 'F1-Score (%)', 'AUC (%)'],
    'Distance Model': _summary_column(dist_metrics),
    'Time Model': _summary_column(time_metrics)
}

summary_df = pd.DataFrame(summary_data)
print("\nTable 2: Statistical Validation Indices")
print("="*60)
print(summary_df.to_string(index=False))

# Save table next to the models
summary_df.to_csv(f'{MODEL_OUTPUT_DIR}validation_metrics.csv', index=False)
print(f"\n✓ Table saved to {MODEL_OUTPUT_DIR}validation_metrics.csv")