In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Read the data (path is relative to the notebook's working directory)
DATA_PATH = "cleaned_hospital.csv"
df = pd.read_csv(DATA_PATH)

In [2]:
# 2. Features / target: the listed columns are the predictors and
#    'Mean Cost' is the value being regressed on them.
feature_cols = [
    'Facility Id',
    'APR DRG Code',
    'APR Severity of Illness Code',
    'Year',
    'APR Medical Surgical Code',
]
y = df['Mean Cost']
X = df[feature_cols]

# 3. Encoder: every predictor is treated as categorical; categories that were
#    not seen during fit are ignored at transform time instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore")

# 4. Preprocessing: one-hot encode all feature columns, drop anything else.
preprocess = ColumnTransformer([("cat", ohe, feature_cols)], remainder="drop")

# 5. Ridge regression whose penalty is picked by seeded, shuffled 5-fold CV
#    (lowest MSE) from 13 log-spaced candidates between 1e-3 and 1e3.
alphas = np.logspace(start=-3, stop=3, num=13)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
ridge = RidgeCV(alphas=alphas, cv=inner_cv, scoring="neg_mean_squared_error")


In [3]:
# 6. Complete pipeline: preprocessing followed by the cross-validated ridge
model = Pipeline([("prep", preprocess), ("ridge", ridge)])

# 7. Hold out 10% of the rows as a test set (seeded, no stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    stratify=None,
)

# 8. Fit on the training rows, then predict the held-out rows
y_pred = model.fit(X_train, y_train).predict(X_test)

# 9. Hold-out metrics; RMSE is derived from MSE so both stay consistent
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Penalty strength selected by the inner cross-validation during fit
best_alpha = model.named_steps["ridge"].alpha_

# 10. Report (one line per figure)
print(
    f"Best α from CV : {best_alpha:.4g}",
    f"Test RMSE      : {rmse:,.2f}",
    f"Test MAE       : {mae:,.2f}",
    f"Test R²        : {r2:.4f}",
    sep="\n",
)

Best α from CV : 3.162
Test RMSE      : 26,482.43
Test MAE       : 10,104.34
Test R²        : 0.4048
