# Train Routing Disagreement Models

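This notebook trains two Random Forest classifiers that predict whether pedestrian routing estimates disagree: one for route distance and one for travel time. A record counts as a disagreement when the spread between its smallest and largest estimate, relative to the smallest, reaches a configurable percentage threshold.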

In [None]:
# Install dependencies (uncomment if needed; %pip installs into the running kernel's environment)
# %pip install scikit-learn pandas numpy matplotlib seaborn joblib

In [None]:
# Configuration
# Every value a reader may need to tune lives in this cell; the cells below
# only read these names.
INPUT_CSV = 'your_data_with_routing.csv'  # read with pd.read_csv in the load cell
MODEL_OUTPUT_DIR = 'models/'  # directory the fitted models are written to

# Feature columns
# Predictor columns taken from INPUT_CSV; both classifiers are trained on
# exactly this set.
FEATURE_COLS = [
    'Straight_Line_Distance_m',
    'Origin_Road_Length_Density_m_km2',
    'Dest_Intersection_Density_n_km2',
    'Slope_Pct',
    'Elevation_Difference_m',
    'Population'
]

# Routing columns (update to match your data)
# The per-row spread across each group of columns defines that model's label.
# NOTE(review): presumably one column per routing service - confirm against the CSV.
DISTANCE_COLS = ['ORS_Dist_m', 'Arc_Dist_m', 'GMaps_Dist_m']
TIME_COLS = ['ORS_Time_min', 'Arc_Time_min', 'GMaps_Time_min']

# Thresholds
# Percent spread, (max - min) / min * 100, at or above which a row is labelled
# as a disagreement (class 1).
DISTANCE_THRESHOLD = 5
TIME_THRESHOLD = 20

# Seed shared by the train/test splits and both random forests.
RANDOM_STATE = 42
# Fraction of rows held out for testing.
TEST_SIZE = 0.20

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import joblib
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and data-quality warnings raised by pandas and
# scikit-learn; consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

# Make sure the output directory exists before the save cell writes to it.
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

In [None]:
# Load data
df = pd.read_csv(INPUT_CSV)
print(f"Loaded {len(df)} records")

# Fail fast, with a readable message, if the CSV lacks any configured column.
# Without this check the mismatch only surfaces later as a bare KeyError while
# the model inputs are being assembled.
_missing_cols = [
    col for col in dict.fromkeys(FEATURE_COLS + DISTANCE_COLS + TIME_COLS)
    if col not in df.columns
]
if _missing_cols:
    raise KeyError(f"{INPUT_CSV} is missing required columns: {_missing_cols}")

In [None]:
# Calculate disagreement
def calculate_disagreement(df, cols):
    """Return the per-row percentage spread across the given columns.

    For every row the result is ``(max - min) / min * 100`` computed over
    the values in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns named in ``cols``.
    cols : list of str
        Names of the columns to compare with each other.

    Returns
    -------
    pandas.Series
        Spread in percent, indexed like ``df``. Rows whose values are all
        identical yield 0.0, including rows that are all zero. A positive
        spread over a zero minimum yields ``inf``.
    """
    data = df[cols]
    min_vals = data.min(axis=1)
    max_vals = data.max(axis=1)
    spread = max_vals - min_vals
    pct = (spread / min_vals) * 100
    # Identical values mean perfect agreement. Without this step an all-zero
    # row evaluates 0 / 0 and comes back as NaN instead of 0.
    return pct.where(spread != 0, 0.0)

# Distance model inputs: keep only rows where every feature and every distance
# estimate is present, then label each row 1 when its percentage spread is at
# or above DISTANCE_THRESHOLD and 0 otherwise.
complete_dist = df.loc[:, FEATURE_COLS + DISTANCE_COLS].dropna(axis=0, how='any')
disagreement_dist = calculate_disagreement(complete_dist, DISTANCE_COLS)
X_dist = complete_dist[FEATURE_COLS]
y_dist = disagreement_dist.ge(DISTANCE_THRESHOLD).astype(int)

# Time model inputs: same recipe, using the time estimates and TIME_THRESHOLD.
complete_time = df.loc[:, FEATURE_COLS + TIME_COLS].dropna(axis=0, how='any')
disagreement_time = calculate_disagreement(complete_time, TIME_COLS)
X_time = complete_time[FEATURE_COLS]
y_time = disagreement_time.ge(TIME_THRESHOLD).astype(int)

# Class balance for each target.
print(f"Distance model - Agreement: {(y_dist==0).sum()}, Disagreement: {(y_dist==1).sum()}")
print(f"Time model - Agreement: {(y_time==0).sum()}, Disagreement: {(y_time==1).sum()}")

In [None]:
# Train/test split
# Both splits use the same hold-out fraction and seed; each one is stratified
# on its own label vector.
_split_opts = {'test_size': TEST_SIZE, 'random_state': RANDOM_STATE}

(X_train_dist, X_test_dist,
 y_train_dist, y_test_dist) = train_test_split(X_dist, y_dist, stratify=y_dist, **_split_opts)

(X_train_time, X_test_time,
 y_train_time, y_test_time) = train_test_split(X_time, y_time, stratify=y_time, **_split_opts)

In [None]:
# Hyperparameters common to both forests; only the number of trees differs.
_shared_rf_params = dict(
    max_depth=12,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Distance model (300 trees); fit() returns the fitted estimator itself.
distance_model = RandomForestClassifier(n_estimators=300, **_shared_rf_params).fit(
    X_train_dist, y_train_dist)

# Time model (450 trees)
time_model = RandomForestClassifier(n_estimators=450, **_shared_rf_params).fit(
    X_train_time, y_train_time)

print("Models trained")

In [None]:
# Evaluate
def evaluate(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test : array-like
        Feature rows to score.
    y_test : array-like
        True labels for ``X_test``, coded 0 / 1.

    Returns
    -------
    dict
        ``'AUC'``, ``'Accuracy'``, ``'Sensitivity'`` and ``'Specificity'``,
        each in percent. A metric that is undefined for the given data
        (for example sensitivity when ``y_test`` has no positives) is NaN.
    """
    y_pred = model.predict(X_test)
    # Column 1 is read as the score for class 1.
    # NOTE(review): assumes the model was fitted on both classes - confirm.
    y_proba = model.predict_proba(X_test)[:, 1]

    # Pin the label order so the matrix is always 2x2; otherwise a test set
    # containing a single class yields a 1x1 matrix and the unpack fails.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    def _pct(numerator, denominator):
        # Percentage, or NaN when the ratio is undefined.
        return numerator / denominator * 100 if denominator else float('nan')

    # ROC AUC needs both classes present in the true labels.
    both_classes_present = (tp + fn) > 0 and (tn + fp) > 0
    auc = roc_auc_score(y_test, y_proba) * 100 if both_classes_present else float('nan')

    return {
        'AUC': auc,
        'Accuracy': _pct(tp + tn, len(y_test)),
        'Sensitivity': _pct(tp, tp + fn),
        'Specificity': _pct(tn, tn + fp)
    }

# Score each model on its own held-out split.
dist_metrics = evaluate(model=distance_model, X_test=X_test_dist, y_test=y_test_dist)
time_metrics = evaluate(model=time_model, X_test=X_test_time, y_test=y_test_time)

# Headline numbers only; the full metric dicts stay available for inspection.
print(f"Distance Model - AUC: {dist_metrics['AUC']:.1f}%, Accuracy: {dist_metrics['Accuracy']:.1f}%")
print(f"Time Model - AUC: {time_metrics['AUC']:.1f}%, Accuracy: {time_metrics['Accuracy']:.1f}%")

In [None]:
# Save models
# Paths are built with os.path.join so MODEL_OUTPUT_DIR works with or without
# a trailing separator; plain string concatenation would write to e.g.
# 'modelsdistance_model.joblib' if the slash were omitted.
joblib.dump(distance_model, os.path.join(MODEL_OUTPUT_DIR, 'distance_model.joblib'))
joblib.dump(time_model, os.path.join(MODEL_OUTPUT_DIR, 'time_model.joblib'))
print(f"Models saved to {MODEL_OUTPUT_DIR}")