In [2]:
# ==========================================================
# Phase 3 ‚Äì Stacking Regressor + Residual Calibration ONLY
# + Over/Underfitting Diagnostics (Train vs Test)
# + REGULARIZATION to reduce overfitting
# ==========================================================

from math import sqrt
import numpy as np
import pandas as pd

from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold


# ---------------------------------------
# PRD-driven feature engineering
# ---------------------------------------
def add_prd_logic_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df["spend_per_day"] = df["cost_per_day"]
    df["miles_per_day_safe"] = df["miles_traveled"] / df["trip_duration_days"].clip(lower=1)
    df["log_miles_traveled"] = np.log1p(df["miles_traveled"])

    df["receipt_cents"] = np.round(df["total_receipts_amount"] - np.floor(df["total_receipts_amount"]), 2)
    df["receipt_is_point_49_or_99"] = df["receipt_cents"].isin([0.49, 0.99]).astype(int)

    df["spend_per_day_good_band"] = df["spend_per_day"].between(75, 120).astype(int)
    df["spend_per_day_low"] = (df["spend_per_day"] < 50).astype(int)
    df["spend_per_day_high"] = (df["spend_per_day"] > 120).astype(int)

    df["receipts_near_700"] = df["total_receipts_amount"].between(600, 800).astype(int)
    df["receipts_very_low"] = (df["total_receipts_amount"] < 50).astype(int)
    df["receipts_very_high"] = (df["total_receipts_amount"] > 1000).astype(int)

    duration_mask = df["trip_duration_days"].between(4, 6)
    miles_mask = df["miles_per_day_safe"].between(180, 220)
    df["is_efficiency_sweet_spot"] = (duration_mask & miles_mask).astype(int)
    df["efficiency_score"] = df["miles_per_day_safe"] * df["is_efficiency_sweet_spot"]

    return df


# ---------------------------------------
# Match-rate helper
# ---------------------------------------
def prediction_match_report(y_true, y_pred, label):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_diff = np.abs(y_pred - y_true)

    print(f"\n=====  {label} =====")
    print(f"Exact Match Rate (<= $0.01): {(abs_diff <= 0.01).mean():.4f}")
    print(f"Close Match Rate (<= $1.00): {(abs_diff <= 1.00).mean():.4f}")
    print(f"¬±$5 Accuracy: {(abs_diff <= 5.00).mean():.4f}")


# ---------------------------------------
# Over/Underfitting diagnostics helper
# ---------------------------------------
def fit_diagnostics(y_train_true, y_train_pred, y_test_true, y_test_pred, label="Model"):
    def _metrics(y_true, y_pred):
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        mae = mean_absolute_error(y_true, y_pred)
        rmse = sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)
        abs_diff = np.abs(y_pred - y_true)
        within1 = (abs_diff <= 1.0).mean()
        within5 = (abs_diff <= 5.0).mean()
        return mae, rmse, r2, within1, within5

    tr = _metrics(y_train_true, y_train_pred)
    te = _metrics(y_test_true, y_test_pred)

    print(f"\n=====  Fit Diagnostics: {label} =====")
    print("Train -> MAE: {:.2f} | RMSE: {:.2f} | R¬≤: {:.4f} | ‚â§$1: {:.3f} | ‚â§$5: {:.3f}".format(*tr))
    print("Test  -> MAE: {:.2f} | RMSE: {:.2f} | R¬≤: {:.4f} | ‚â§$1: {:.3f} | ‚â§$5: {:.3f}".format(*te))

    print("\n----- Generalization Gaps (Train - Test) -----")
    print(f"MAE gap:  {tr[0] - te[0]:.2f}")
    print(f"RMSE gap: {tr[1] - te[1]:.2f}")
    print(f"R¬≤ gap:   {tr[2] - te[2]:.4f}")
    print(f"‚â§$1 gap:  {tr[3] - te[3]:.3f}")
    print(f"‚â§$5 gap:  {tr[4] - te[4]:.3f}")

    if (tr[0] < te[0] - 20) or (tr[3] > te[3] + 0.10):
        print("\n Likely OVERFITTING: train is much better than test.")
    elif (tr[3] < 0.05 and te[3] < 0.05) and (tr[0] > 30 and te[0] > 30):
        print("\n Likely UNDERFITTING *for match-rate*: both train/test ‚â§$1 are low.")
    else:
        print("\n No strong overfitting signal from these thresholds.")


# ----------------------------------------------------------
# Load + feature engineer
# ----------------------------------------------------------
combined_df = pd.read_csv("phase2_features_baseline_models.csv")
print("Dataset loaded for Phase 3!")
print("Shape:", combined_df.shape)

combined_df = add_prd_logic_features(combined_df)
print("PRD-driven logic features added. New shape:", combined_df.shape)

features = [
    "trip_duration_days", "miles_traveled", "total_receipts_amount",
    "cost_per_day", "cost_per_mile", "miles_per_day", "cost_ratio",
    "spend_per_day", "miles_per_day_safe", "log_miles_traveled",
    "receipt_cents", "receipt_is_point_49_or_99",
    "spend_per_day_good_band", "spend_per_day_low", "spend_per_day_high",
    "receipts_near_700", "receipts_very_low", "receipts_very_high",
    "efficiency_score", "is_efficiency_sweet_spot",
]
target = "reimbursement"

missing = [c for c in (features + [target]) if c not in combined_df.columns]
if missing:
    raise KeyError(f"Missing required columns in dataset: {missing}")

X = combined_df[features].copy()
y = combined_df[target].copy()

X = X.replace([np.inf, -np.inf], np.nan)
if X.isna().any().any():
    X = X.fillna(X.median(numeric_only=True))

# ----------------------------------------------------------
# Split
# ----------------------------------------------------------
USE_SHUFFLE_SPLIT = True

if USE_SHUFFLE_SPLIT:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, shuffle=True
    )
    print("\n=====  Random Split Applied =====")
else:
    split = int(0.75 * len(X))
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]
    print("\n=====  Sequential Split Applied =====")

print(f"Training Samples: {len(X_train)}")
print(f"Testing Samples:  {len(X_test)}")

# ==========================================================
# REGULARIZED Stacking Regressor (reduce overfitting)
# ==========================================================
base_models = [
    # Regularize the tree (stop perfect memorization)
    ("decision_tree", DecisionTreeRegressor(
        random_state=42,
        max_depth=6,
        min_samples_leaf=20,
        min_samples_split=40,
    )),

    # Regularize RF
    ("random_forest", RandomForestRegressor(
        n_estimators=400,
        random_state=42,
        n_jobs=-1,
        max_depth=16,
        min_samples_leaf=10,
        min_samples_split=20,
        max_features=0.7,
    )),

    # Stochastic GB + early stopping
    ("gradient_boosting", GradientBoostingRegressor(
        random_state=42,
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=2,
        min_samples_leaf=20,
        subsample=0.7,              # <- reduces overfit
        validation_fraction=0.15,   # for early stopping
        n_iter_no_change=30,
        tol=1e-4,
    )),
]

# Ridge meta-learner shrinks combination weights (helps overfit)
meta_model = Ridge(alpha=1.0, random_state=42)

# CV inside stacker so meta-model learns from out-of-fold preds
cv = KFold(n_splits=5, shuffle=True, random_state=42)

stack_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,
    cv=cv,
    n_jobs=-1,
)

print("\n Training Ensemble Model (regularized)...")
stack_model.fit(X_train, y_train)

# Base predictions
stack_pred_test = stack_model.predict(X_test)
stack_pred_train = stack_model.predict(X_train)

print("\n Ensemble Model Performance (Stacking Regressor):")
print(f"MAE:  {mean_absolute_error(y_test, stack_pred_test):.4f}")
print(f"RMSE: {sqrt(mean_squared_error(y_test, stack_pred_test)):.4f}")
print(f"R¬≤:   {r2_score(y_test, stack_pred_test):.4f}")
prediction_match_report(y_test, stack_pred_test, "Base Ensemble Match Rates (Test)")

# ==========================================================
# REGULARIZED Residual Calibration (reduce overfitting)
# ==========================================================
train_residuals = y_train - stack_pred_train

residual_model = GradientBoostingRegressor(
    random_state=42,
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=1,                 # <- shallower = less overfit
    min_samples_leaf=40,         # <- higher = smoother residuals
    subsample=0.7,               # <- big overfit reducer
    validation_fraction=0.15,
    n_iter_no_change=30,
    tol=1e-4,
)

residual_model.fit(X_train, train_residuals)
print("\n Residual correction model trained (regularized)!")

# Calibrated predictions
test_final_pred = stack_pred_test + residual_model.predict(X_test)
train_final_pred = stack_pred_train + residual_model.predict(X_train)

print("\n  Calibrated Ensemble Performance (with Residual Model):")
print(f"MAE:  {mean_absolute_error(y_test, test_final_pred):.4f}")
print(f"RMSE: {sqrt(mean_squared_error(y_test, test_final_pred)):.4f}")
print(f"R¬≤:   {r2_score(y_test, test_final_pred):.4f}")

prediction_match_report(y_test, test_final_pred, "Calibrated Ensemble Match Rates (Test ‚Äì Raw)")
prediction_match_report(y_test, np.round(test_final_pred, 2), "Calibrated Ensemble Match Rates (Test ‚Äì Rounded to Cents)")
prediction_match_report(y_train, train_final_pred, "Calibrated Ensemble Match Rates (Train)")

fit_diagnostics(
    y_train_true=y_train,
    y_train_pred=train_final_pred,
    y_test_true=y_test,
    y_test_pred=test_final_pred,
    label="Calibrated Ensemble + Residual (Regularized)"
)


Dataset loaded for Phase 3!
Shape: (1000, 9)
PRD-driven logic features added. New shape: (1000, 22)

===== ‚úÖ Random Split Applied =====
Training Samples: 750
Testing Samples:  250

 Training Ensemble Model (regularized)...

 Ensemble Model Performance (Stacking Regressor):
MAE:  67.0958
RMSE: 92.5184
R¬≤:   0.9591

=====  Base Ensemble Match Rates (Test) =====
Exact Match Rate (<= $0.01): 0.0000
Close Match Rate (<= $1.00): 0.0200
¬±$5 Accuracy: 0.0480

 Residual correction model trained (regularized)!

  Calibrated Ensemble Performance (with Residual Model):
MAE:  66.4409
RMSE: 92.0679
R¬≤:   0.9595

=====  Calibrated Ensemble Match Rates (Test ‚Äì Raw) =====
Exact Match Rate (<= $0.01): 0.0000
Close Match Rate (<= $1.00): 0.0080
¬±$5 Accuracy: 0.0680

=====  Calibrated Ensemble Match Rates (Test ‚Äì Rounded to Cents) =====
Exact Match Rate (<= $0.01): 0.0000
Close Match Rate (<= $1.00): 0.0080
¬±$5 Accuracy: 0.0680

=====  Calibrated Ensemble Match Rates (Train) =====
Exact Match R