In [1]:
import pandas as pd
import numpy as np
import os

# from azure.ai.ml import MLClient
# from azure.identity import DefaultAzureCredential
# from azure.ai.ml.entities import Data

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
df = pd.read_csv(
    '/content/Hospital_Inpatient_Discharges_(SPARCS_De-Identified)__2024_20251129.csv',
    engine='python',
    on_bad_lines='warn'  # This helps skip/warn about broken lines rather than crashing
)

In [None]:
df.tail()

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Permanent Facility Id,Facility Name,Age Group,Zip Code,Gender,Race,Ethnicity,...,APR Severity of Illness Description,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Emergency Department Indicator,Total Charges,Total Costs
2196732,New York City,New York,7002024.0,1456.0,MOUNT SINAI HOSPITAL,70 or Older,111,F,Multi-racial,Not Span/Hispanic,...,Moderate,Moderate,Surgical,Medicare,Medicare,Medicaid,,N,59727.81,17084.6
2196733,Central NY,Jefferson,2238700.0,379.0,CARTHAGE AREA HOSPITAL INC,18-29,136,F,White,Not Span/Hispanic,...,Minor,Minor,Surgical,Federal/State/Local/VA,,,,N,25474.57,43365.68
2196734,New York City,Kings,7001016.0,1301.0,KINGS COUNTY HOSPITAL CENTER,30-49,112,F,Black/African American,Unknown,...,Moderate,Minor,Surgical,Medicaid,,,,N,51617.21,28211.54
2196735,Long Island,Nassau,7002053.0,511.0,NYU LANGONE HOSPITAL-LONG ISLAND,30-49,117,F,White,Not Span/Hispanic,...,Major,Moderate,Medical,Blue Cross/Blue Shield,,,,N,60404.55,19568.72
2196736,New York City,Kings,7001035.0,1318.0,WYCKOFF HEIGHTS MEDICAL CENTER,30-49,113,F,Other Race,Spanish/Hispanic,...,Minor,Minor,Medical,Private Health Insurance,,,,Y,6179.07,2238.09


In [None]:
df.columns

Index(['Health Service Area', 'Hospital County',
       'Operating Certificate Number', 'Permanent Facility Id',
       'Facility Name', 'Age Group', 'Zip Code', 'Gender', 'Race', 'Ethnicity',
       'Length of Stay', 'Type of Admission', 'Patient Disposition',
       'Discharge Year', 'CCSR Diagnosis Code', 'CCSR Diagnosis Description',
       'CCSR Procedure Code', 'CCSR Procedure Description', 'APR DRG Code',
       'APR DRG Description', 'APR MDC Code', 'APR MDC Description',
       'APR Severity of Illness Code', 'APR Severity of Illness Description',
       'APR Risk of Mortality', 'APR Medical Surgical Description',
       'Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3',
       'Birth Weight', 'Emergency Department Indicator', 'Total Charges',
       'Total Costs'],
      dtype='object')

In [3]:
len(df)

2196737

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2196737 entries, 0 to 2196736
Data columns (total 33 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   Health Service Area                  object 
 1   Hospital County                      object 
 2   Operating Certificate Number         float64
 3   Permanent Facility Id                float64
 4   Facility Name                        object 
 5   Age Group                            object 
 6   Zip Code                             object 
 7   Gender                               object 
 8   Race                                 object 
 9   Ethnicity                            object 
 10  Length of Stay                       object 
 11  Type of Admission                    object 
 12  Patient Disposition                  object 
 13  Discharge Year                       int64  
 14  CCSR Diagnosis Code                  object 
 15  CCSR Diagnosis Description      

In [4]:
df_processed = df.copy()

cols_not_to_keep = ['Total Charges','Permanent Facility Id', 'Zip Code', 'Discharge Year', 'CCSR Diagnosis Code', 'CCSR Procedure Code', 'APR DRG Code', 'APR MDC Code', 'APR Severity of Illness Code', 'Birth Weight']
df_processed = df_processed.drop(columns=cols_not_to_keep)

fill_nulls = ['Health Service Area', 'Hospital County', 'Operating Certificate Number', 'CCSR Procedure Description']
df_processed[fill_nulls] = df_processed[fill_nulls].fillna("Unknown")

def select_payment(row):
    if pd.notna(row['Payment Typology 1']):
        return row['Payment Typology 1']
    elif pd.notna(row['Payment Typology 2']):
        return row['Payment Typology 2']
    elif pd.notna(row['Payment Typology 3']):
        return row['Payment Typology 3']
    else:
        return "Unknown"

df_processed['Payment_Typology'] = df_processed.apply(select_payment, axis=1)
df_processed = df_processed.drop(columns=['Payment Typology 1', 'Payment Typology 2', 'Payment Typology 3'])

df_processed['Length of Stay'] = df_processed['Length of Stay'].replace('120+', 120).astype(int)

df_processed['Total Costs'] = np.log1p(df_processed['Total Costs'])

In [5]:
len(df_processed)

2196737

In [5]:
categorical_cols = df_processed.select_dtypes(include=['object']).columns

# Convert all categorical columns to string
for col in categorical_cols:
    df_processed[col] = df_processed[col].astype(str)

In [6]:
target = "Total Costs"

# Drop columns you don’t want or that leak information (keep what makes sense)
X = df_processed.drop(columns=['Total Costs'])
y = df_processed[target]

In [7]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

# Random Forest

In [None]:
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=None,
    min_samples_split=5,
    random_state=42,
    n_jobs=16,
)

In [None]:
model_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', rf)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_rf.fit(X_train, y_train)

In [13]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)
import numpy as np

# ---- Inverse transform (back to original scale) ----
y_train_true = np.expm1(y_train)
y_test_true  = np.expm1(y_test)

y_train_pred = np.expm1(model_rf.predict(X_train))
y_test_pred  = np.expm1(model_rf.predict(X_test))

# ---- Evaluation function ----
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, mape, r2

# ---- Compute metrics ----
train_mae, train_rmse, train_mape, train_r2 = evaluate(y_train_true, y_train_pred)
test_mae, test_rmse, test_mape, test_r2 = evaluate(y_test_true, y_test_pred)

# ---- Print ----
print("=== TRAINING SET METRICS ===")
print(f"MAE:  {train_mae:.2f}")
print(f"RMSE: {train_rmse:.2f}")
print(f"MAPE: {train_mape*100:.2f}%")
print(f"R²:   {train_r2:.4f}")

print("\n=== TEST SET METRICS ===")
print(f"MAE:  {test_mae:.2f}")
print(f"RMSE: {test_rmse:.2f}")
print(f"MAPE: {test_mape*100:.2f}%")
print(f"R²:   {test_r2:.4f}")

=== TRAINING SET METRICS ===
MAE:  6795.07
RMSE: 27167.16
MAPE: 886393263856850.00%
R²:   0.7576

=== TEST SET METRICS ===
MAE:  6890.21
RMSE: 28223.34
MAPE: 31.09%
R²:   0.7471


# XGBoost

In [9]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    tree_method='auto',
    predictor='gpu_predictor',
    n_estimators=100,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [10]:
model_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', xgb)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_xgb.fit(X_train, y_train)

Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [22]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)
import numpy as np

# ---- Inverse transform (back to original scale) ----
y_train_true = np.expm1(y_train)
y_test_true  = np.expm1(y_test)

y_train_pred = np.expm1(model_xgb.predict(X_train))
y_test_pred  = np.expm1(model_xgb.predict(X_test))

# ---- Evaluation function ----
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, mape, r2

# ---- Compute metrics ----
train_mae, train_rmse, train_mape, train_r2 = evaluate(y_train_true, y_train_pred)
test_mae, test_rmse, test_mape, test_r2 = evaluate(y_test_true, y_test_pred)

# ---- Print ----
print("=== TRAINING SET METRICS ===")
print(f"MAE:  {train_mae:.2f}")
print(f"RMSE: {train_rmse:.2f}")
print(f"MAPE: {train_mape*100:.2f}%")
print(f"R²:   {train_r2:.4f}")

print("\n=== TEST SET METRICS ===")
print(f"MAE:  {test_mae:.2f}")
print(f"RMSE: {test_rmse:.2f}")
print(f"MAPE: {test_mape*100:.2f}%")
print(f"R²:   {test_r2:.4f}")

=== TRAINING SET METRICS ===
MAE:  5779.41
RMSE: 23786.20
MAPE: 198939860325214.09%
R²:   0.8142

=== TEST SET METRICS ===
MAE:  5907.67
RMSE: 24872.56
MAPE: 26.54%
R²:   0.8036


# Light GBM

In [23]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    device="gpu",           # Enable GPU
    gpu_platform_id=0,
    gpu_device_id=0
)

In [24]:
model_lgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgbm', lgbm)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model_lgb.fit(X_train, y_train)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3321
[LightGBM] [Info] Number of data points in the train set: 1757389, number of used features: 1601
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA L4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 19 dense feature groups (33.52 MB) transferred to GPU in 0.035669 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 9.579427


In [25]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)
import numpy as np

# ---- Inverse transform (back to original scale) ----
y_train_true = np.expm1(y_train)
y_test_true  = np.expm1(y_test)

y_train_pred = np.expm1(model_lgb.predict(X_train))
y_test_pred  = np.expm1(model_lgb.predict(X_test))

# ---- Compute metrics ----
train_mae, train_rmse, train_mape, train_r2 = evaluate(y_train_true, y_train_pred)
test_mae, test_rmse, test_mape, test_r2 = evaluate(y_test_true, y_test_pred)

# ---- Print ----
print("=== TRAINING SET METRICS ===")
print(f"MAE:  {train_mae:.2f}")
print(f"RMSE: {train_rmse:.2f}")
print(f"MAPE: {train_mape*100:.2f}%")
print(f"R²:   {train_r2:.4f}")

print("\n=== TEST SET METRICS ===")
print(f"MAE:  {test_mae:.2f}")
print(f"RMSE: {test_rmse:.2f}")
print(f"MAPE: {test_mape*100:.2f}%")
print(f"R²:   {test_r2:.4f}")



=== TRAINING SET METRICS ===
MAE:  6252.39
RMSE: 26343.61
MAPE: 1266885859383144.25%
R²:   0.7721

=== TEST SET METRICS ===
MAE:  6337.79
RMSE: 26956.90
MAPE: 27.99%
R²:   0.7693
