In [None]:
# Path of the CSV export that the loading cell reads with pd.read_csv.
filename = "../data/Hospital_Inpatient_Discharges_(SPARCS_De-Identified)__2024_20260123.csv"
# Fraction of rows assigned to the training split (passed as train_size to train_test_split).
TRAIN_SIZE = 0.8
# Single seed reused for NumPy's global RNG, the train/test split and the LightGBM model.
SEED = 42

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# Seed NumPy's legacy global RNG so code relying on it is repeatable across runs.
np.random.seed(SEED)

In [None]:
# Read every column as text; the numeric columns are converted explicitly below.
df = pd.read_csv(filename, dtype=str)

# Normalise headers to CamelCase with no spaces or punctuation
# (e.g. "Length of Stay" -> "LengthOfStay").
df.columns = (
    df.columns
    .str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
    .str.title()
    .str.replace(" ", "", regex=False)
)

# Remove the "120+" stays for now (capping them would introduce bias).
# Filtering into a fresh copy instead of dropping in place keeps this cell
# free of chained-assignment warnings in the assignments that follow.
df = df.loc[df["LengthOfStay"].ne("120+")].copy()

# Columns that hold numbers; anything unparseable becomes NaN.
numeric_cols = ["LengthOfStay", "BirthWeight", "TotalCharges", "TotalCosts"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# A target that failed to parse is now NaN and cannot be used as a regression
# label, so drop those rows rather than passing NaN labels to the model.
df = df.dropna(subset=["LengthOfStay"])

# Everything that was not converted to numeric is still text (the file was read
# with dtype=str), so derive the categorical columns from that fact. This avoids
# select_dtypes(include=str), which older pandas versions reject with a TypeError.
cat_cols = pd.Index([c for c in df.columns if c not in numeric_cols])
df = df.astype({c: "category" for c in cat_cols})

# Target and feature matrix.
# NOTE(review): TotalCharges and TotalCosts remain in X. If they are only known
# after discharge they may leak information about the length of stay -- confirm
# whether they should be excluded from the features.
y = df["LengthOfStay"]
X = df.drop(columns=["LengthOfStay"])

# Hold out (1 - TRAIN_SIZE) of the rows for evaluation; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=TRAIN_SIZE,
    random_state=SEED,
)


In [None]:
# Gradient-boosted regressor for LengthOfStay; seeded so repeated runs match.
model = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    random_state=SEED
)
model.fit(X_train, y_train)

# Score on the held-out split only.
y_pred = model.predict(X_test)

# "\u00b2" is the superscript-two character; written as an escape so the label
# prints as "R²" even if the notebook file is re-encoded (it was previously
# garbled into "RÂ²").
print("R\u00b2:", r2_score(y_test, y_pred))
print("RMSE:", root_mean_squared_error(y_test, y_pred))