# Resource Demand Forecasting Model
Predict facility-level resource needs for treatment planning and allocation optimization.


## Import Libraries
Load required packages for modeling and evaluation

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")

## Load ML-Ready Data
Read the imputed dataset prepared for machine learning

In [24]:
df = pd.read_csv("1_datasets/processed/teds_ml_ready.csv")

  df = pd.read_csv("1_datasets/processed/teds_ml_ready.csv")


## Aggregate Data by State and Service Type
Calculate resource demand metrics at facility level (state × service type)

In [None]:
demand_by_state_service = (
    df.groupby(["state", "service_type"])
    .agg(
        {
            "patient_id": "count",
            "age_group": "first",
            "sex": "first",
            "race": "first",
            "ethnicity": "first",
            "marital_status": "first",
            "education_level": "first",
            "employment_status": "first",
            "living_arrangement": "first",
            "income_source": "first",
            "recent_arrests": "first",
            "prior_treatments": "first",
            "primary_substance": "first",
            "secondary_substance": "first",
            "tertiary_substance": "first",
            "route_primary": "first",
            "frequency_primary": "first",
            "age_first_use_primary": "first",
            "medication_assisted_therapy": "first",
            "has_cooccurring_mental_health": "first",
            "health_insurance": "first",
            "payment_source": "first",
            "self_help_attendance": "first",
            "injection_drug_use": "first",
            "pregnant": "first",
            "veteran_status": "first",
            "years_using": "mean",
            "number_of_substances": "mean",
            "is_polysubstance": "mean",
            "is_opioid_primary": "mean",
            "is_stimulant_primary": "mean",
            "is_injection_user": "mean",
            "is_criminal_justice_referral": "mean",
            "has_recent_arrest": "mean",
            "is_chronic_treatment": "mean",
            "is_first_treatment": "mean",
            "is_adolescent": "mean",
            "is_older_adult": "mean",
            "is_pregnant": "mean",
            "is_homeless": "mean",
            "has_no_income": "mean",
            "has_mental_health_disorder": "mean",
            "substance_category": "first",
            "region": "first",
        }
    )
    .reset_index()
)

demand_by_state_service.rename(columns={"patient_id": "total_admissions"}, inplace=True)

## Calculate Resource Intensity Metrics
Create complexity scores and utilization rates

In [6]:
feature_cols = [
    "state",
    "region",
    "service_type",
    "age_group",
    "sex",
    "race",
    "ethnicity",
    "marital_status",
    "education_level",
    "employment_status",
    "living_arrangement",
    "income_source",
    "recent_arrests",
    "prior_treatments",
    "primary_substance",
    "secondary_substance",
    "tertiary_substance",
    "route_primary",
    "frequency_primary",
    "age_first_use_primary",
    "medication_assisted_therapy",
    "has_cooccurring_mental_health",
    "health_insurance",
    "payment_source",
    "self_help_attendance",
    "injection_drug_use",
    "pregnant",
    "veteran_status",
    "years_using",
    "number_of_substances",
    "is_polysubstance",
    "is_opioid_primary",
    "is_stimulant_primary",
    "is_injection_user",
    "is_criminal_justice_referral",
    "has_recent_arrest",
    "is_chronic_treatment",
    "is_first_treatment",
    "is_adolescent",
    "is_older_adult",
    "is_pregnant",
    "is_homeless",
    "has_no_income",
    "has_mental_health_disorder",
    "substance_category",
]

## Aggregate Regional Demand Patterns
Calculate region-level statistics for regional features

In [7]:
categorical_cols = (
    demand_by_state_service[feature_cols].select_dtypes(include="object").columns
)
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    demand_by_state_service[col] = le.fit_transform(
        demand_by_state_service[col].astype(str)
    )
    label_encoders[col] = le


## Prepare Features for Modeling
Encode categorical variables and create feature matrix

In [8]:
X = demand_by_state_service[feature_cols]
y = demand_by_state_service["total_admissions"]

## Split Data into Train and Test Sets
80-20 train-test split for model evaluation

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model 1: Linear Regression (Baseline)
Simple linear model as performance baseline

In [10]:
lr_model = Ridge(alpha=10.0)
lr_model.fit(X_train, y_train)

0,1,2
,alpha,10.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


## Model 2: Random Forest Regressor
Ensemble model capturing non-linear relationships

In [11]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=3,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Model 3: Gradient Boosting Regressor
Advanced boosting model for best performance

In [12]:
X_train_gb, X_val_gb, y_train_gb, y_val_gb = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

gb_model = GradientBoostingRegressor(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.01,
    validation_fraction=0.2,
    n_iter_no_change=50,
    random_state=42,
)
gb_model.fit(X_train_gb, y_train_gb)

0,1,2
,loss,'squared_error'
,learning_rate,0.01
,n_estimators,1000
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


## Compare Model Performance
Create comparison dataframe of all models

In [13]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    return {
        "train_mae": mean_absolute_error(y_train, y_train_pred),
        "test_mae": mean_absolute_error(y_test, y_test_pred),
        "train_rmse": mean_squared_error(y_train, y_train_pred) ** 0.5,
        "test_rmse": mean_squared_error(y_test, y_test_pred) ** 0.5,
        "train_r2": r2_score(y_train, y_train_pred),
        "test_r2": r2_score(y_test, y_test_pred),
    }


results = {
    "Ridge Regression": evaluate_model(lr_model, X_train, X_test, y_train, y_test),
    "Random Forest": evaluate_model(rf_model, X_train, X_test, y_train, y_test),
    "Gradient Boosting": evaluate_model(gb_model, X_train, X_test, y_train, y_test),
}


## Select Best Model
Choose model with highest R² score

In [14]:
model_comparison = (
    pd.DataFrame(results).T.reset_index().rename(columns={"index": "model"})
)

## Feature Importance Analysis
Extract and rank feature importance from best model

In [15]:
best_model_idx = model_comparison["test_r2"].idxmax()
best_model_name = model_comparison.loc[best_model_idx, "model"]

if best_model_name == "Random Forest":
    best_model = rf_model
elif best_model_name == "Gradient Boosting":
    best_model = gb_model
else:
    best_model = lr_model


In [16]:
if best_model_name == "Random Forest":
    feature_importance_df = pd.DataFrame(
        {"feature": feature_cols, "importance": rf_model.feature_importances_}
    ).sort_values("importance", ascending=False)

## Generate Predictions for All Facilities
Predict demand for all state-service combinations

In [17]:
demand_by_state_service["predicted_admissions"] = best_model.predict(X)
demand_by_state_service["prediction_error"] = (
    demand_by_state_service["predicted_admissions"]
    - demand_by_state_service["total_admissions"]
)
demand_by_state_service["absolute_error"] = np.abs(
    demand_by_state_service["prediction_error"]
)
demand_by_state_service["percent_error"] = (
    demand_by_state_service["absolute_error"]
    / demand_by_state_service["total_admissions"]
    * 100
)

## Calculate Resource Allocation Recommendations
Estimate beds and staff needed based on predicted demand

In [18]:
demand_by_state_service["complexity_score"] = (
    demand_by_state_service["is_polysubstance"] * 1.5
    + demand_by_state_service["is_chronic_treatment"] * 2.0
    + demand_by_state_service["has_mental_health_disorder"] * 1.8
    + demand_by_state_service["is_homeless"] * 1.5
    + demand_by_state_service["is_injection_user"] * 2.0
)

demand_by_state_service["recommended_beds"] = np.ceil(
    demand_by_state_service["predicted_admissions"] / 12
)

demand_by_state_service["recommended_staff"] = np.ceil(
    demand_by_state_service["predicted_admissions"]
    / 50
    * (1 + demand_by_state_service["complexity_score"] / 10)
)

## Identify Under-Resourced Facilities
Flag facilities with high demand and long wait times

In [19]:
median_admissions = demand_by_state_service["total_admissions"].median()
demand_by_state_service["under_resourced"] = (
    demand_by_state_service["predicted_admissions"] > median_admissions
).astype(int)

## Aggregate State-Level Summary
Roll up facility predictions to state level


In [20]:
state_summary = (
    demand_by_state_service.groupby("state")
    .agg(
        {
            "total_admissions": "sum",
            "predicted_admissions": "sum",
            "recommended_beds": "sum",
            "recommended_staff": "sum",
            "under_resourced": "sum",
            "complexity_score": "mean",
        }
    )
    .reset_index()
)

state_summary["beds_per_1000_admissions"] = (
    state_summary["recommended_beds"] / state_summary["total_admissions"] * 1000
)

In [21]:
def predict_facility_resources(
    facility_features,
    model=best_model,
    encoders=label_encoders,
    median_admissions=median_admissions,
):
    feature_values = []
    for col in feature_cols:
        value = facility_features.get(col, 0)
        if col in encoders:
            value = encoders[col].transform([str(value)])[0]
        feature_values.append(value)

    X_new = np.array([feature_values])
    predicted_admissions = model.predict(X_new)[0]

    complexity_score = (
        facility_features.get("is_polysubstance", 0) * 1.5
        + facility_features.get("is_chronic_treatment", 0) * 2.0
        + facility_features.get("has_mental_health_disorder", 0) * 1.8
        + facility_features.get("is_homeless", 0) * 1.5
        + facility_features.get("is_injection_user", 0) * 2.0
    )

    total_patients = facility_features.get("total_admissions", predicted_admissions)
    if total_patients > 0:
        complexity_score /= total_patients
    else:
        complexity_score = 0

    recommended_beds = np.ceil(predicted_admissions / 12)
    recommended_staff = np.ceil(predicted_admissions / 50 * (1 + complexity_score / 10))
    under_resourced = int(predicted_admissions > median_admissions)

    return {
        "predicted_admissions": predicted_admissions,
        "recommended_beds": recommended_beds,
        "recommended_staff": recommended_staff,
        "complexity_score": complexity_score,
        "under_resourced": under_resourced,
    }


In [22]:
display(model_comparison)

Unnamed: 0,model,train_mae,test_mae,train_rmse,test_rmse,train_r2,test_r2
0,Ridge Regression,6524.759237,7548.957351,13026.976628,11531.424526,0.170899,-0.061884
1,Random Forest,4703.267921,6586.708549,10601.761131,10546.242191,0.450869,0.111808
2,Gradient Boosting,4292.664063,5854.033552,13031.56433,11032.381839,0.170315,0.028037


In [None]:
from sklearn.linear_model import Ridge
df = pd.read_csv("1_datasets/processed/teds_ml_ready.csv")

demand_by_state_service = (
    df.groupby(["state", "service_type"])
    .agg(
        {
            "patient_id": "count",  
            "age_group": "first",
            "sex": "first",
            "race": "first",
            "ethnicity": "first",
            "marital_status": "first",
            "education_level": "first",
            "employment_status": "first",
            "living_arrangement": "first",
            "income_source": "first",
            "recent_arrests": "first",
            "prior_treatments": "first",
            "primary_substance": "first",
            "secondary_substance": "first",
            "tertiary_substance": "first",
            "route_primary": "first",
            "frequency_primary": "first",
            "age_first_use_primary": "first",
            "medication_assisted_therapy": "first",
            "has_cooccurring_mental_health": "first",
            "health_insurance": "first",
            "payment_source": "first",
            "self_help_attendance": "first",
            "injection_drug_use": "first",
            "pregnant": "first",
            "veteran_status": "first",
            "years_using": "first",
            "number_of_substances": "first",
            "is_polysubstance": "mean",
            "is_opioid_primary": "mean",
            "is_stimulant_primary": "mean",
            "is_injection_user": "mean",
            "is_criminal_justice_referral": "mean",
            "has_recent_arrest": "mean",
            "is_chronic_treatment": "mean",
            "is_first_treatment": "mean",
            "is_adolescent": "mean",
            "is_older_adult": "mean",
            "is_pregnant": "mean",
            "is_homeless": "mean",
            "has_no_income": "mean",
            "has_mental_health_disorder": "mean",
            "substance_category": "first",
            "region": "first",
        }
    )
    .reset_index()
)

demand_by_state_service.rename(columns={"patient_id": "total_admissions"}, inplace=True)

feature_cols = [
    "state",
    "region",
    "service_type",
    "age_group",
    "sex",
    "race",
    "ethnicity",
    "marital_status",
    "education_level",
    "employment_status",
    "living_arrangement",
    "income_source",
    "recent_arrests",
    "prior_treatments",
    "primary_substance",
    "secondary_substance",
    "tertiary_substance",
    "route_primary",
    "frequency_primary",
    "age_first_use_primary",
    "medication_assisted_therapy",
    "has_cooccurring_mental_health",
    "health_insurance",
    "payment_source",
    "self_help_attendance",
    "injection_drug_use",
    "pregnant",
    "veteran_status",
    "years_using",
    "number_of_substances",
    "is_polysubstance",
    "is_opioid_primary",
    "is_stimulant_primary",
    "is_injection_user",
    "is_criminal_justice_referral",
    "has_recent_arrest",
    "is_chronic_treatment",
    "is_first_treatment",
    "is_adolescent",
    "is_older_adult",
    "is_pregnant",
    "is_homeless",
    "has_no_income",
    "has_mental_health_disorder",
    "substance_category",
]

categorical_cols = (
    demand_by_state_service[feature_cols].select_dtypes(include="object").columns
)
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    demand_by_state_service[col] = le.fit_transform(
        demand_by_state_service[col].astype(str)
    )
    label_encoders[col] = le

X = demand_by_state_service[feature_cols]
y = demand_by_state_service["total_admissions"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lr_model = Ridge(alpha=10.0).fit(X_train, y_train)

rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=3,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

X_train_gb, X_val_gb, y_train_gb, y_val_gb = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

gb_model = GradientBoostingRegressor(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.01,
    validation_fraction=0.2,
    n_iter_no_change=50,
    random_state=42,
)
gb_model.fit(X_train_gb, y_train_gb)


def evaluate_model(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    return {
        "train_mae": mean_absolute_error(y_train, y_train_pred),
        "test_mae": mean_absolute_error(y_test, y_test_pred),
        "train_rmse": mean_squared_error(y_train, y_train_pred) ** 0.5,
        "test_rmse": mean_squared_error(y_test, y_test_pred) ** 0.5,
        "train_r2": r2_score(y_train, y_train_pred),
        "test_r2": r2_score(y_test, y_test_pred),
    }


results = {
    "Ridge Regression": evaluate_model(lr_model, X_train, X_test, y_train, y_test),
    "Random Forest": evaluate_model(rf_model, X_train, X_test, y_train, y_test),
    "Gradient Boosting": evaluate_model(gb_model, X_train, X_test, y_train, y_test),
}

model_comparison = (
    pd.DataFrame(results).T.reset_index().rename(columns={"index": "model"})
)
display(model_comparison)
def predict_facility_resources(
    facility_features, model=rf_model, encoders=label_encoders, median_admissions=None
):
    feature_values = []
    for col in feature_cols:
        value = facility_features.get(col, 0)  
        if col in encoders:
            value = encoders[col].transform([str(value)])[0]
        feature_values.append(value)

    X_new = np.array([feature_values])

    predicted_admissions = model.predict(X_new)[0]

    complexity_score = (
        facility_features.get("is_polysubstance", 0) * 1.5
        + facility_features.get("is_chronic_treatment", 0) * 2.0
        + facility_features.get("has_mental_health_disorder", 0) * 1.8
        + facility_features.get("is_homeless", 0) * 1.5
        + facility_features.get("is_injection_user", 0) * 2.0
    )

    total_patients = facility_features.get("total_admissions", predicted_admissions)
    if total_patients > 0:
        complexity_score /= total_patients
    else:
        complexity_score = 0

    recommended_beds = np.ceil(predicted_admissions / 12)
    recommended_staff = np.ceil(predicted_admissions / 50 * (1 + complexity_score / 10))

    if median_admissions is None:
        median_admissions = predicted_admissions
    under_resourced = int(predicted_admissions > median_admissions)

    return {
        "predicted_admissions": predicted_admissions,
        "recommended_beds": recommended_beds,
        "recommended_staff": recommended_staff,
        "complexity_score": complexity_score,
        "under_resourced": under_resourced,
    }


  df = pd.read_csv("1_datasets/processed/teds_ml_ready.csv")


Unnamed: 0,model,train_mae,test_mae,train_rmse,test_rmse,train_r2,test_r2
0,Ridge Regression,6524.759237,7548.957351,13026.976628,11531.424526,0.170899,-0.061884
1,Random Forest,4703.267921,6586.708549,10601.761131,10546.242191,0.450869,0.111808
2,Gradient Boosting,4292.664063,5854.033552,13031.56433,11032.381839,0.170315,0.028037


In [60]:
def predict_facility_resources(
    facility_features, model=rf_model, encoders=label_encoders, median_admissions=None
):
    feature_values = []
    for col in feature_cols:
        value = facility_features.get(col, 0)  
        if col in encoders:
            value = encoders[col].transform([str(value)])[0]
        feature_values.append(value)

    X_new = np.array([feature_values])

    predicted_admissions = model.predict(X_new)[0]

    complexity_score = (
        facility_features.get("is_polysubstance", 0) * 1.5
        + facility_features.get("is_chronic_treatment", 0) * 2.0
        + facility_features.get("has_mental_health_disorder", 0) * 1.8
        + facility_features.get("is_homeless", 0) * 1.5
        + facility_features.get("is_injection_user", 0) * 2.0
    )

    total_patients = facility_features.get("total_admissions", predicted_admissions)
    if total_patients > 0:
        complexity_score /= total_patients
    else:
        complexity_score = 0

    recommended_beds = np.ceil(predicted_admissions / 12)
    recommended_staff = np.ceil(predicted_admissions / 50 * (1 + complexity_score / 10))

    if median_admissions is None:
        median_admissions = predicted_admissions
    under_resourced = int(predicted_admissions > median_admissions)

    return {
        "predicted_admissions": predicted_admissions,
        "recommended_beds": recommended_beds,
        "recommended_staff": recommended_staff,
        "complexity_score": complexity_score,
        "under_resourced": under_resourced,
    }
