In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

import plotly.express as px
import plotly.io as pio
from prophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sqlalchemy import create_engine
import logging
import os
from dotenv import load_dotenv

load_dotenv()
db_url = os.getenv("DATABASE_URL")
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)
pd.set_option("display.float_format", "{:.2f}".format)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connect to your Postgres database
engine = create_engine(db_url)

# Load the table into a DataFrame
df = pd.read_sql("SELECT * FROM houston_311", engine)

pio.renderers.default = "notebook_connected"

In [3]:
df.describe

<bound method NDFrame.describe of                    CASE NUMBER                                 NEIGHBORHOOD  \
0        12091834-101002444724                 EAST LITTLE YORK / HOMESTEAD   
1        12091835-101002444725                          NORTHSIDE/NORTHLINE   
2                 101002444726                                     MID WEST   
3                 101002444727  WASHINGTON AVENUE COALITION / MEMORIAL PARK   
4        12091836-101002444730                               GREATER UPTOWN   
...                        ...                                          ...   
3800664      217348-2400490914                                     WESTBURY   
3800665    12670427-2400490912                    GREATER OST / SOUTH UNION   
3800666             2400490911                                        ALIEF   
3800667    20480340-2400490910                                   NORTHSHORE   
3800668    12670426-2400490909                          NORTHSIDE/NORTHLINE   

                 

In [None]:
df.dtypes

In [5]:
df[df["CREATED DATE"] > pd.Timestamp.today()]

Unnamed: 0,CASE NUMBER,NEIGHBORHOOD,DEPARTMENT,DIVISION,CASE TYPE,CREATED DATE,CLOSED DATE,LATITUDE,LONGITUDE,CATEGORY,RESOLUTION_TIME_DAYS


In [None]:
cols = ["NEIGHBORHOOD", "DEPARTMENT", "DIVISION", "CATEGORY", "CASE TYPE"]

for c in cols:
    df[c] = (
        df[c]
        .astype(str)
        .str.strip()
        .str.title()
        .str.replace("\s+", " ", regex=True)
    )

In [None]:
# Aggregate by month
overall_ts = (
    df.groupby(pd.Grouper(key='CREATED DATE', freq='ME'))
      .size()
      .reset_index(name='y')
      .rename(columns={'CREATED DATE': 'ds'})
)

# Remove months with too few rows
cleaned_ts = overall_ts[overall_ts["y"] > 1000] 

# Determine the last fully complete month
last_month = cleaned_ts['ds'].max()
cleaned_ts = cleaned_ts[cleaned_ts['ds'] < last_month]

# Fit Prophet
m = Prophet(interval_width=0.95, daily_seasonality=True)
m.fit(cleaned_ts)

# Forecast next 12 months
future = m.make_future_dataframe(periods=12, freq='ME')
forecast = m.predict(future)

In [None]:
plot1 = m.plot(forecast)

In [None]:
plot2 = m.plot_components(forecast)

In [None]:
# Merge forecasted values with actuals
results = cleaned_ts.merge(forecast, on='ds', how='left')

# Keep only training portion (historical)
historical = results[results['ds'] <= cleaned_ts['ds'].max()]

In [None]:
# Calculate error metrics
y_true = historical['y']
y_pred = historical['yhat']

mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print(f"MAE:  {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")

In [None]:
# Keep only periods that exist in your original data
actual = cleaned_ts.set_index("ds")
predicted = forecast.set_index("ds")[["yhat", "yhat_lower", "yhat_upper"]]

# Join actuals with predictions
compare = actual.join(predicted, how="left")

# Compute residuals (errors)
compare["error"] = compare["y"] - compare["yhat"]
compare.head()

In [None]:
bias = compare["error"].mean()
print("Mean Bias:", bias)

In [None]:
compare["month"] = compare.index.month
monthly_bias = compare.groupby("month")["error"].mean()
print(monthly_bias)

In [None]:
category_col = "NEIGHBORHOOD"
date_col = "CREATED DATE"  

results = []

for cat, group in df.groupby(category_col, observed=True):
    ts = (
        group.groupby(pd.Grouper(key=date_col, freq="ME"))
             .size()
             .reset_index(name="y")
             .rename(columns={date_col: "ds"})
             .sort_values("ds")
    )
    
    # Exclude last incomplete month
    if not ts.empty:
        last_month = ts['ds'].max()
        ts = ts[ts['ds'] < last_month]
    
    # Skip if too few data points
    if len(ts) < 24:
        continue

    # Train-test split
    train = ts[:-6]
    test = ts[-6:]
    
    # Fit Prophet
    m = Prophet()
    m.fit(train)
    
    # Make future and predict for test dates
    future = m.make_future_dataframe(periods=6, freq="ME")
    forecast = m.predict(future)
    forecast_test = forecast.tail(6).copy()
    forecast_test["y_true"] = test["y"].values
    
    # Compute metrics
    mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat"])
    rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat"]))
    mape = np.mean(np.abs((forecast_test["y_true"] - forecast_test["yhat"]) / forecast_test["y_true"])) * 100
    
    results.append({
        "Category": cat,
        "MAE": mae,
        "RMSE": rmse,
        "MAPE (%)": mape
    })

# Create summary DataFrame
results_df = pd.DataFrame(results).sort_values("MAPE (%)")
pd.set_option('display.max_rows', None)
print(results_df)

In [None]:
category_col = "DEPARTMENT" 
date_col = "CREATED DATE"     

results = []

for cat, group in df.groupby(category_col, observed=True):
    ts = (
        group.groupby(pd.Grouper(key=date_col, freq="ME"))
             .size()
             .reset_index(name="y")
             .rename(columns={date_col: "ds"})
             .sort_values("ds")
    )
    
    # Exclude last incomplete month
    if not ts.empty:
        last_month = ts['ds'].max()
        ts = ts[ts['ds'] < last_month]
    
    # Skip if too few data points
    if len(ts) < 24:
        continue

    # Train-test split
    train = ts[:-6]
    test = ts[-6:]
    
    # Fit Prophet
    m = Prophet()
    m.fit(train)
    
    # Make future and predict for test dates
    future = m.make_future_dataframe(periods=6, freq="ME")
    forecast = m.predict(future)
    forecast_test = forecast.tail(6).copy()
    forecast_test["y_true"] = test["y"].values
    
    # Compute metrics
    mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat"])
    rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat"]))
    mape = np.mean(np.abs((forecast_test["y_true"] - forecast_test["yhat"]) / forecast_test["y_true"])) * 100
    
    results.append({
        "Category": cat,
        "MAE": mae,
        "RMSE": rmse,
        "MAPE (%)": mape
    })

# Create summary DataFrame
results_df = pd.DataFrame(results).sort_values("MAPE (%)")
pd.set_option('display.max_rows', None)
print(results_df)

In [None]:
category_col = "DIVISION" 
date_col = "CREATED DATE"     

results = []

for cat, group in df.groupby(category_col, observed=True):
    ts = (
        group.groupby(pd.Grouper(key=date_col, freq="ME"))
             .size()
             .reset_index(name="y")
             .rename(columns={date_col: "ds"})
             .sort_values("ds")
    )
    
    # Exclude last incomplete month
    if not ts.empty:
        last_month = ts['ds'].max()
        ts = ts[ts['ds'] < last_month]
    
    # Skip if too few data points
    if len(ts) < 24:
        continue

    # Train-test split
    train = ts[:-6]
    test = ts[-6:]
    
    # Fit Prophet
    m = Prophet()
    m.fit(train)
    
    # Make future and predict for test dates
    future = m.make_future_dataframe(periods=6, freq="ME")
    forecast = m.predict(future)
    forecast_test = forecast.tail(6).copy()
    forecast_test["y_true"] = test["y"].values
    
    # Compute metrics
    mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat"])
    rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat"]))
    mape = np.mean(np.abs((forecast_test["y_true"] - forecast_test["yhat"]) / forecast_test["y_true"])) * 100
    
    results.append({
        "Category": cat,
        "MAE": mae,
        "RMSE": rmse,
        "MAPE (%)": mape
    })

# Create summary DataFrame
results_df = pd.DataFrame(results).sort_values("MAPE (%)")
pd.set_option('display.max_rows', None)  # Show all rows
print(results_df)

In [None]:
category_col = "CATEGORY" 
date_col = "CREATED DATE"     

results = []

for cat, group in df.groupby(category_col, observed=True):
    ts = (
        group.groupby(pd.Grouper(key=date_col, freq="ME"))
             .size()
             .reset_index(name="y")
             .rename(columns={date_col: "ds"})
             .sort_values("ds")
    )
    
    # Exclude last incomplete month
    if not ts.empty:
        last_month = ts['ds'].max()
        ts = ts[ts['ds'] < last_month]
    
    # Skip if too few data points
    if len(ts) < 24:
        continue

    # Train-test split
    train = ts[:-6]
    test = ts[-6:]
    
    # Fit Prophet
    m = Prophet()
    m.fit(train)
    
    # Make future and predict for test dates
    future = m.make_future_dataframe(periods=6, freq="ME")
    forecast = m.predict(future)
    forecast_test = forecast.tail(6).copy()
    forecast_test["y_true"] = test["y"].values
    
    # Compute metrics
    mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat"])
    rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat"]))
    mape = np.mean(np.abs((forecast_test["y_true"] - forecast_test["yhat"]) / forecast_test["y_true"])) * 100
    
    results.append({
        "Category": cat,
        "MAE": mae,
        "RMSE": rmse,
        "MAPE (%)": mape
    })

# Create summary DataFrame
results_df = pd.DataFrame(results).sort_values("MAPE (%)")
pd.set_option('display.max_rows', None)  # Show all rows
print(results_df)

In [None]:
cap = df["RESOLUTION_TIME_DAYS"].quantile(0.95)
df["RESOLUTION_TIME_DAYS"] = df["RESOLUTION_TIME_DAYS"].clip(upper=cap)

In [None]:
# Find the two most recent months in your data
latest_date = df["CREATED DATE"].max()
latest_month_start = latest_date.replace(day=1)
second_latest_month_start = (latest_month_start - pd.DateOffset(months=1))

# Keep only rows before these two months
df_model = df[df["CREATED DATE"] < second_latest_month_start].copy()


In [None]:
# Median per case type (can contain NaN)
medians_by_type = df_model.groupby('CASE TYPE')['RESOLUTION_TIME_DAYS'].median()

# Map case-type median to rows
df_model['MEDIAN_BY_TYPE'] = df_model['CASE TYPE'].map(medians_by_type)

# Fill missing durations ONLY where case-type median exists
df_model['RESOLUTION_TIME_DAYS'] = df_model.apply(
    lambda row: row['MEDIAN_BY_TYPE'] if pd.isna(row['RESOLUTION_TIME_DAYS']) and not pd.isna(row['MEDIAN_BY_TYPE'])
    else row['RESOLUTION_TIME_DAYS'],
    axis=1
)

# Drop rows where RESOLUTION_TIME_DAYS is still NaN AND the case type median was also NaN
df_model = df_model.dropna(subset=['RESOLUTION_TIME_DAYS'])

In [None]:
# Case volume per month
monthly_volume = (
    df_model
    .groupby(pd.Grouper(key="CREATED DATE", freq="ME"))
    .size()
    .reset_index(name="case_volume")
)

# Dominant case-type proportion
case_type_monthly = (
    df_model
    .groupby([pd.Grouper(key="CREATED DATE", freq="ME"), "CASE TYPE"])
    .size()
    .reset_index(name="count")
)

month_totals = case_type_monthly.groupby("CREATED DATE")["count"].sum()
case_type_monthly["prop"] = case_type_monthly["count"] / case_type_monthly["CREATED DATE"].map(month_totals)

dominant_prop = (
    case_type_monthly
    .sort_values(["CREATED DATE", "prop"], ascending=[True, False])
    .groupby("CREATED DATE")
    .first()
    .reset_index()[["CREATED DATE", "prop"]]
    .rename(columns={"prop": "dominant_case_prop"})
)


# BUILD FORECASTING TS (y + regressors)
ts = (
    df_model
    .groupby(pd.Grouper(key="CREATED DATE", freq="ME"))["RESOLUTION_TIME_DAYS"]
    .mean()
    .reset_index()
    .rename(columns={"CREATED DATE": "ds", "RESOLUTION_TIME_DAYS": "y"})
)

# Merge regressors
ts = (
    ts
    .merge(monthly_volume.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
    .merge(dominant_prop.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
)


# INTERPOLATE MISSING MONTHS (critical for regressors)
full_range = pd.date_range(ts["ds"].min(), ts["ds"].max(), freq="ME")

ts = ts.set_index("ds").reindex(full_range)

for col in ["y", "case_volume", "dominant_case_prop"]:
    ts[col] = ts[col].interpolate().bfill().ffill()

ts = ts.rename_axis("ds").reset_index()


# TRAIN/TEST SPLIT
ts = ts[ts["ds"] < pd.Timestamp("2025-11-01")]
train = ts[ts['ds'] >= '2022-01-01']
# train = ts.iloc[:-6]
test = ts.iloc[-6:]


# PROPHET WITH REGRESSORS

model = Prophet(
    yearly_seasonality=True,
    changepoint_prior_scale=0.8,
)

model.add_regressor("case_volume")
model.add_regressor("dominant_case_prop")

model.fit(train)

# Build future df with regressors\
future = model.make_future_dataframe(periods=12, freq="ME")
future = future.merge(ts[["ds", "case_volume", "dominant_case_prop"]],
                      on="ds", how="left")

# Forward-fill for future months
future["case_volume"] = future["case_volume"].ffill()
future["dominant_case_prop"] = future["dominant_case_prop"].ffill()

forecast = model.predict(future)

pred = forecast[["ds", "yhat"]].merge(test[["ds", "y"]], on="ds", how="inner")

mape = mean_absolute_percentage_error(pred["y"], pred["yhat"]) * 100
print(f"MAPE: {mape:.2f}%")

# Plot
fig = px.line(train, x='ds', y='y', title='Resolution Time Forecast', labels={'y':'Resolution Days'})
fig.add_scatter(x=test['ds'], y=test['y'], mode='lines+markers', name='Actual')

# Plot predictions on test set (if desired)
fig.add_scatter(x=pred['ds'], y=pred['yhat'], mode='lines', name='Forecast (Test)')

# Plot future predictions
future_forecast = forecast[forecast['ds'] > train['ds'].max()]
fig.add_scatter(x=future_forecast['ds'], y=future_forecast['yhat'], mode='lines+markers', 
                name='Forecast (Future)', line=dict(dash='dash', color='red'))

fig.show()

In [None]:
groups = df_model["NEIGHBORHOOD"].unique()
mape_results = {}

for g in groups:
    subset = df_model[df_model["NEIGHBORHOOD"] == g].copy()
    
    # Drop rows without resolution or created dates
    subset = subset.dropna(subset=["RESOLUTION_TIME_DAYS", "CREATED DATE"])
    if len(subset) < 12:  # skip tiny groups
        continue
    
    # Compute regressors per group    
    # Case volume
    monthly_volume = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))
        .size()
        .reset_index(name="case_volume")
    )
    
    # Dominant case-type proportion
    case_type_monthly = (
        subset.groupby([pd.Grouper(key="CREATED DATE", freq="ME"), "CASE TYPE"])
        .size()
        .reset_index(name="count")
    )
    month_totals = case_type_monthly.groupby("CREATED DATE")["count"].sum()
    case_type_monthly["prop"] = case_type_monthly["count"] / case_type_monthly["CREATED DATE"].map(month_totals)
    dominant_prop = (
        case_type_monthly.sort_values(["CREATED DATE", "prop"], ascending=[True, False])
        .groupby("CREATED DATE")
        .first()
        .reset_index()[["CREATED DATE", "prop"]]
        .rename(columns={"prop": "dominant_case_prop"})
    )
    
    # Build time series with regressors
    ts = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))["RESOLUTION_TIME_DAYS"]
        .mean()
        .reset_index()
        .rename(columns={"CREATED DATE": "ds", "RESOLUTION_TIME_DAYS": "y"})
    )
    
    ts = (
        ts
        .merge(monthly_volume.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
        .merge(dominant_prop.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
    )
    
    # Interpolate missing months
    full_range = pd.date_range(ts["ds"].min(), ts["ds"].max(), freq="ME")
    ts = ts.set_index("ds").reindex(full_range)
    for col in ["y", "case_volume", "dominant_case_prop"]:
        ts[col] = ts[col].interpolate().bfill().ffill()
    ts = ts.rename_axis("ds").reset_index()
    
    # Train/test split
    train = ts.iloc[:-6]
    if train['y'].dropna().shape[0] < 2:
        continue
    test = ts.iloc[-6:]
    
    # Prophet with regressors
    model = Prophet(yearly_seasonality=True, changepoint_prior_scale=0.8)
    model.add_regressor("case_volume")
    model.add_regressor("dominant_case_prop")
    
    model.fit(train)
    
    # Build future dataframe
    future = model.make_future_dataframe(periods=12, freq="ME")
    future = future.merge(ts[["ds", "case_volume", "dominant_case_prop"]], on="ds", how="left")
    future["case_volume"] = future["case_volume"].ffill()
    future["dominant_case_prop"] = future["dominant_case_prop"].ffill()
    
    forecast = model.predict(future)
    
    # Evaluate
    pred = forecast[["ds", "yhat"]].merge(test[["ds", "y"]], on="ds", how="inner")
    mape_results[g] = mean_absolute_percentage_error(pred["y"], pred["yhat"]) * 100

# Final MAPE table
mape_df = pd.DataFrame.from_dict(mape_results, orient="index", columns=["MAPE"])
mape_df["MAPE"] = mape_df["MAPE"].apply(lambda x: round(x, 2))
mape_df.sort_values("MAPE", inplace=True)
mape_df


In [None]:
groups = df_model["DEPARTMENT"].unique()
mape_results = {}

for g in groups:
    subset = df_model[df_model["DEPARTMENT"] == g].copy()
    
    # Drop rows without resolution or created dates
    subset = subset.dropna(subset=["RESOLUTION_TIME_DAYS", "CREATED DATE"])
    if len(subset) < 12:  # skip tiny groups
        continue
    
    # Compute regressors per group    
    # Case volume
    monthly_volume = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))
        .size()
        .reset_index(name="case_volume")
    )
    
    # Dominant case-type proportion
    case_type_monthly = (
        subset.groupby([pd.Grouper(key="CREATED DATE", freq="ME"), "CASE TYPE"])
        .size()
        .reset_index(name="count")
    )
    month_totals = case_type_monthly.groupby("CREATED DATE")["count"].sum()
    case_type_monthly["prop"] = case_type_monthly["count"] / case_type_monthly["CREATED DATE"].map(month_totals)
    dominant_prop = (
        case_type_monthly.sort_values(["CREATED DATE", "prop"], ascending=[True, False])
        .groupby("CREATED DATE")
        .first()
        .reset_index()[["CREATED DATE", "prop"]]
        .rename(columns={"prop": "dominant_case_prop"})
    )
    
    # Build time series with regressors
    ts = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))["RESOLUTION_TIME_DAYS"]
        .mean()
        .reset_index()
        .rename(columns={"CREATED DATE": "ds", "RESOLUTION_TIME_DAYS": "y"})
    )
    
    ts = (
        ts
        .merge(monthly_volume.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
        .merge(dominant_prop.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
    )
    
    # Interpolate missing months
    full_range = pd.date_range(ts["ds"].min(), ts["ds"].max(), freq="ME")
    ts = ts.set_index("ds").reindex(full_range)
    for col in ["y", "case_volume", "dominant_case_prop"]:
        ts[col] = ts[col].interpolate().bfill().ffill()
    ts = ts.rename_axis("ds").reset_index()
    
    # Train/test split
    train = ts.iloc[:-6]
    if train['y'].dropna().shape[0] < 2:
        continue
    test = ts.iloc[-6:]
    
    # Prophet with regressors
    model = Prophet(yearly_seasonality=True, changepoint_prior_scale=0.8)
    model.add_regressor("case_volume")
    model.add_regressor("dominant_case_prop")
    
    model.fit(train)
    
    # Build future dataframe
    future = model.make_future_dataframe(periods=12, freq="ME")
    future = future.merge(ts[["ds", "case_volume", "dominant_case_prop"]], on="ds", how="left")
    future["case_volume"] = future["case_volume"].ffill()
    future["dominant_case_prop"] = future["dominant_case_prop"].ffill()
    
    forecast = model.predict(future)
    
    # Evaluate
    pred = forecast[["ds", "yhat"]].merge(test[["ds", "y"]], on="ds", how="inner")
    mape_results[g] = mean_absolute_percentage_error(pred["y"], pred["yhat"]) * 100

# Final MAPE table
mape_df = pd.DataFrame.from_dict(mape_results, orient="index", columns=["MAPE"])
mape_df["MAPE"] = mape_df["MAPE"].apply(lambda x: round(x, 2))
mape_df.sort_values("MAPE", inplace=True)
print(mape_df)

In [None]:
groups = df_model["DIVISION"].unique()
mape_results = {}
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)

for g in groups:
    subset = df_model[df_model["DIVISION"] == g].copy()
    
    # Drop rows without resolution or created dates
    subset = subset.dropna(subset=["RESOLUTION_TIME_DAYS", "CREATED DATE"])
    if len(subset) < 12:  # skip tiny groups
        continue
    
    # Compute regressors per group    
    # Case volume
    monthly_volume = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))
        .size()
        .reset_index(name="case_volume")
    )
    
    # Dominant case-type proportion
    case_type_monthly = (
        subset.groupby([pd.Grouper(key="CREATED DATE", freq="ME"), "CASE TYPE"])
        .size()
        .reset_index(name="count")
    )
    month_totals = case_type_monthly.groupby("CREATED DATE")["count"].sum()
    case_type_monthly["prop"] = case_type_monthly["count"] / case_type_monthly["CREATED DATE"].map(month_totals)
    dominant_prop = (
        case_type_monthly.sort_values(["CREATED DATE", "prop"], ascending=[True, False])
        .groupby("CREATED DATE")
        .first()
        .reset_index()[["CREATED DATE", "prop"]]
        .rename(columns={"prop": "dominant_case_prop"})
    )
    
    # Build time series with regressors
    ts = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))["RESOLUTION_TIME_DAYS"]
        .mean()
        .reset_index()
        .rename(columns={"CREATED DATE": "ds", "RESOLUTION_TIME_DAYS": "y"})
    )
    
    ts = (
        ts
        .merge(monthly_volume.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
        .merge(dominant_prop.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
    )
    
    # Interpolate missing months
    full_range = pd.date_range(ts["ds"].min(), ts["ds"].max(), freq="ME")
    ts = ts.set_index("ds").reindex(full_range)
    for col in ["y", "case_volume", "dominant_case_prop"]:
        ts[col] = ts[col].interpolate().bfill().ffill()
    ts = ts.rename_axis("ds").reset_index()
    
    # Train/test split
    train = ts.iloc[:-6]
    if train['y'].dropna().shape[0] < 2:
        continue
    test = ts.iloc[-6:]
    
    # Prophet with regressors
    model = Prophet(yearly_seasonality=True, changepoint_prior_scale=0.8)
    model.add_regressor("case_volume")
    model.add_regressor("dominant_case_prop")
    
    model.fit(train)
    
    # Build future dataframe
    future = model.make_future_dataframe(periods=12, freq="ME")
    future = future.merge(ts[["ds", "case_volume", "dominant_case_prop"]], on="ds", how="left")
    future["case_volume"] = future["case_volume"].ffill()
    future["dominant_case_prop"] = future["dominant_case_prop"].ffill()
    
    forecast = model.predict(future)
    
    # Evaluate
    pred = forecast[["ds", "yhat"]].merge(test[["ds", "y"]], on="ds", how="inner")
    mape_results[g] = mean_absolute_percentage_error(pred["y"], pred["yhat"]) * 100

# Final MAPE table
mape_df = pd.DataFrame.from_dict(mape_results, orient="index", columns=["MAPE"])
mape_df["MAPE"] = mape_df["MAPE"].apply(lambda x: round(x, 2))
mape_df.sort_values("MAPE", inplace=True)
mape_df

In [None]:
groups = df_model["CATEGORY"].unique()
mape_results = {}

for g in groups:
    subset = df_model[df_model["CATEGORY"] == g].copy()
    
    # Drop rows without resolution or created dates
    subset = subset.dropna(subset=["RESOLUTION_TIME_DAYS", "CREATED DATE"])
    if len(subset) < 12:  # skip tiny groups
        continue
    
    # Compute regressors per group    
    # Case volume
    monthly_volume = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))
        .size()
        .reset_index(name="case_volume")
    )
    
    # Dominant case-type proportion
    case_type_monthly = (
        subset.groupby([pd.Grouper(key="CREATED DATE", freq="ME"), "CASE TYPE"])
        .size()
        .reset_index(name="count")
    )
    month_totals = case_type_monthly.groupby("CREATED DATE")["count"].sum()
    case_type_monthly["prop"] = case_type_monthly["count"] / case_type_monthly["CREATED DATE"].map(month_totals)
    dominant_prop = (
        case_type_monthly.sort_values(["CREATED DATE", "prop"], ascending=[True, False])
        .groupby("CREATED DATE")
        .first()
        .reset_index()[["CREATED DATE", "prop"]]
        .rename(columns={"prop": "dominant_case_prop"})
    )
    
    # Build time series with regressors
    ts = (
        subset.groupby(pd.Grouper(key="CREATED DATE", freq="ME"))["RESOLUTION_TIME_DAYS"]
        .mean()
        .reset_index()
        .rename(columns={"CREATED DATE": "ds", "RESOLUTION_TIME_DAYS": "y"})
    )
    
    ts = (
        ts
        .merge(monthly_volume.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
        .merge(dominant_prop.rename(columns={"CREATED DATE": "ds"}), on="ds", how="left")
    )
    
    # Interpolate missing months
    full_range = pd.date_range(ts["ds"].min(), ts["ds"].max(), freq="ME")
    ts = ts.set_index("ds").reindex(full_range)
    for col in ["y", "case_volume", "dominant_case_prop"]:
        ts[col] = ts[col].interpolate().bfill().ffill()
    ts = ts.rename_axis("ds").reset_index()
    
    # Train/test split
    train = ts.iloc[:-6]
    if train['y'].dropna().shape[0] < 2:
        continue
    test = ts.iloc[-6:]
    
    # Prophet with regressors
    model = Prophet(yearly_seasonality=True, changepoint_prior_scale=0.8)
    model.add_regressor("case_volume")
    model.add_regressor("dominant_case_prop")
    
    model.fit(train)
    
    # Build future dataframe
    future = model.make_future_dataframe(periods=12, freq="ME")
    future = future.merge(ts[["ds", "case_volume", "dominant_case_prop"]], on="ds", how="left")
    future["case_volume"] = future["case_volume"].ffill()
    future["dominant_case_prop"] = future["dominant_case_prop"].ffill()
    
    forecast = model.predict(future)
    
    # Evaluate
    pred = forecast[["ds", "yhat"]].merge(test[["ds", "y"]], on="ds", how="inner")
    mape_results[g] = mean_absolute_percentage_error(pred["y"], pred["yhat"]) * 100

# Final MAPE table
mape_df = pd.DataFrame.from_dict(mape_results, orient="index", columns=["MAPE"])
mape_df["MAPE"] = mape_df["MAPE"].apply(lambda x: round(x, 2))
mape_df.sort_values("MAPE", inplace=True)
mape_df