# NYC Curbside Congestion - Modeling & Evaluation

This notebook covers:
1. Loading the prepared modeling dataset
2. Training an XGBoost classifier
3. Model evaluation and comparison
4. Feature importance analysis

In [None]:
import sys
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
sys.path.insert(0, '..')

from xgboost import XGBClassifier
from src.config import ALL_MODEL_FEATURES, MODELING_DATASET_FILE, MODELS_DIR

# Use matplotlib's 'dark_background' style sheet for the figures drawn in this notebook.
plt.style.use('dark_background')

## 1. Load Data

In [None]:
# Read the prepared modeling table, then report its size and the configured feature list.
# NOTE(review): MODELING_DATASET_FILE is imported from src.config but this cell reads a
# hard-coded relative path -- confirm whether the config constant should be used instead.
dataset_csv = '../data/modeling_dataset.csv'
df = pd.read_csv(dataset_csv)
print(
    f"Dataset shape: {df.shape}",
    f"Features used: {ALL_MODEL_FEATURES}",
    sep="\n",
)
# Last expression of the cell: rendered as the preview of the first rows.
df.head()

In [None]:
# Inspect how the binary target is distributed before any modeling.
target = df['high_congestion']
print("Target distribution:")
print(target.value_counts())

# The mean of the target is printed as the share of positive rows.
positive_share = target.mean()
print(f"\nClass ratio: {positive_share:.2%} positive")

## 2. Prepare Features

In [None]:
# Build the feature matrix X and the target vector y.
#
# Any feature listed in ALL_MODEL_FEATURES that is absent from the loaded table is
# reported and zero-filled so the notebook still runs end to end. The fill is applied
# to the new X frame only: `df` itself is not modified, so the frame shown by earlier
# cells stays accurate and re-running this cell always reports the same gaps.
missing_features = [col for col in ALL_MODEL_FEATURES if col not in df.columns]
for col in missing_features:
    print(f"Missing: {col}")

# reindex selects the columns in ALL_MODEL_FEATURES order and creates absent ones
# filled with 0; values in columns that already exist are left as they are.
X = df.reindex(columns=ALL_MODEL_FEATURES, fill_value=0)
y = df['high_congestion']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class mix
# comparable between the two parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

for part_name, part in (("Train", X_train), ("Test", X_test)):
    print(f"{part_name} size: {len(part)}")

## 3. Train XGBoost Model

In [None]:
# Ratio of negative to positive training rows; passed to XGBoost below as
# scale_pos_weight to compensate for class imbalance.
negative_count = int((y_train == 0).sum())
positive_count = int((y_train == 1).sum())
scale_weight = negative_count / positive_count
print(f"Scale weight for imbalanced classes: {scale_weight:.2f}")

In [None]:
# Fit the XGBoost classifier on the training split.
# Hyperparameters are collected in one dict so they can be read (and tuned) in one place.
xgb_params = dict(
    n_estimators=150,
    max_depth=8,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_weight,  # class-imbalance ratio computed in the previous cell
    eval_metric='logloss',
    verbosity=0,
)
model = XGBClassifier(**xgb_params)

model.fit(X_train, y_train)
print("Model trained successfully!")

## 4. Evaluate Model

In [None]:
# Score the held-out split: hard class labels feed the metrics below, and the
# positive-class probabilities are kept in y_prob.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Each scorer is called as scorer(y_true, y_pred); the tuple order sets the print order.
scorers = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
)
metrics = {label: scorer(y_test, y_pred) for label, scorer in scorers}

banner = "=" * 40
print(banner)
print("MODEL PERFORMANCE")
print(banner)
for name, value in metrics.items():
    print(f"{name:>12}: {value:.4f}")

In [None]:
# Heatmap of actual (rows) versus predicted (columns) classes on the test split.
class_labels = ['Low Congestion', 'High Congestion']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell annotations are formatted as integers
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Per-class precision / recall / F1 table for the test-set predictions.
report_text = classification_report(y_test, y_pred, target_names=['Low', 'High'])
print(report_text)

## 5. Feature Importance

In [None]:
# Pair the fitted model's importance scores with the feature names and sort them
# ascending before drawing them as horizontal bars.
importance = pd.Series(
    model.feature_importances_,
    index=ALL_MODEL_FEATURES,
).sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(10, 8))
importance.plot.barh(ax=ax, color='#3b82f6')
ax.set_xlabel('Feature Importance')
ax.set_title('XGBoost Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# List the five features with the highest importance scores.
print("Top 5 Most Important Features:")
top_features = importance.nlargest(5)
for feature_name, score in top_features.items():
    print(f"  {feature_name}: {score:.4f}")

## 6. Save Model

In [None]:
# Persist the trained classifier with joblib.
#
# The target folder is created first if it does not exist: without it the dump
# fails with FileNotFoundError on a checkout that has no ../models directory,
# after training has already been done.
# NOTE(review): MODELS_DIR is imported from src.config but not used here --
# confirm whether the path should be derived from it.
model_path = '../models/xgboost_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

## Summary

**Model Performance (20% held-out test split):**
- Accuracy: ~84%
- F1 Score: ~73%

**Key Features:**
- Hour of day is the strongest predictor
- Spatial features (lat/lon) are important
- Rush hour and weekend flags help
- Holiday features provide marginal improvement