In [None]:
# Notebook configuration: values a reader may want to change before running.
filename = "../data/SPARCS_2024_simplified.csv"  # relative path of the CSV read by pd.read_csv below
TRAIN_SIZE = 0.8  # passed to train_test_split as train_size
SEED = 42  # reused for np.random.seed, the train/test split and the LightGBM model

In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.pipeline import Pipeline


pd.set_option('display.max_columns', None)  # None removes the column limit when a DataFrame is displayed
np.random.seed(SEED)  # seed NumPy's global random state so the run is repeatable

def preprocessing(df):
    """Clean the raw frame in place and return it.

    Steps, in order:

    1. Rename every column to CamelCase without spaces (any run of
       non-alphanumeric characters acts as a word separator).
    2. Drop the rows whose ``LengthOfStay`` is the literal string ``"120+"``.
    3. Convert ``LengthOfStay``, ``BirthWeight``, ``TotalCharges`` and
       ``TotalCosts`` to numeric; values that cannot be parsed become NaN.
    4. Convert every remaining text column to the ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold text (e.g. read with ``dtype=str``). After
        renaming it must contain the four numeric columns listed above.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in; it is mutated, not copied.

    Raises
    ------
    KeyError
        If one of the required columns is missing after renaming.
    """
    # Columns to CamelCase with no spaces
    df.columns = (
        df.columns.str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
        .str.title()
        .str.replace(" ", "", regex=False)
    )

    # Remove for now (capping would introduce bias)
    df.drop(df[df["LengthOfStay"] == "120+"].index, inplace=True)

    # Convert to numeric, setting errors to NaN
    for col in ["LengthOfStay", "BirthWeight", "TotalCharges", "TotalCosts"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Transform the rest to categories. Text columns are `object` on older
    # pandas and a string dtype on newer ones, and `select_dtypes(include=str)`
    # is not accepted by every pandas release, so inspect each dtype instead.
    text_cols = [
        name
        for name, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    for name in text_cols:
        df[name] = df[name].astype("category")
    return df
def list_of_repeated_cols(df):
    """Find pairs of categorical columns that map one-to-one onto each other.

    Two columns form a pair when every value of the first co-occurs with
    exactly one value of the second, and vice versa. Only columns of dtype
    ``category`` are examined.

    Missing values: rows whose *grouping* column is NaN are ignored, while a
    NaN in the *counted* column is treated as a distinct value of its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list[tuple[str, str]]
        ``(col1, col2)`` tuples with ``col1 < col2`` (string comparison), so
        each unordered pair appears once. Empty when nothing matches.
    """
    def maps_to_single(df, col1, col2, dropna=False):
        # observed=True restricts the groups to categories that actually occur.
        # Unobserved categories would only add zero counts, which cannot change
        # the max() == 1 verdict, and being explicit keeps the result
        # independent of the pandas-version-specific default for `observed`.
        s = df.groupby(col1, dropna=not dropna, observed=True)[col2].nunique(dropna=dropna)
        return s.max() == 1

    cats = df.select_dtypes(include="category").columns

    return [
        (col1, col2)
        for col1 in cats
        for col2 in cats
        # strict "<" skips self-pairs and keeps each unordered pair once
        if col1 < col2
        and maps_to_single(df, col1, col2)
        and maps_to_single(df, col2, col1)
    ]

In [None]:
# Read every column as text; preprocessing() then assigns numeric / categorical dtypes
df = preprocessing(pd.read_csv(filename, dtype=str))

# Dropping columns
equivalent_pairs = list_of_repeated_cols(df)
print("Columns with one-to-one mappings: ", equivalent_pairs)

# Drop diagnosis, procedure, APR DRG, APR MDC, and AprSeverityOfIllness descriptions, and keep its codes.
# Each pair is ordered alphabetically, so the second member is the one removed.
redundant_cols = [second for _first, second in equivalent_pairs]
df = df.drop(columns=redundant_cols)

# FacilityName is also represented in PermanentFacilityId
df = df.drop(columns=["FacilityName"])

# NOTE(review): no reason was recorded for removing the charge/cost columns —
# presumably they are only known after discharge; confirm.
df = df.drop(columns=["TotalCharges", "TotalCosts"])

# Ordinal Categorical Variables
age_order = ['0-17', '18-29', '30-49', '50-69', '70 or Older']
age_cat_type = CategoricalDtype(categories=age_order, ordered=True)

# Casting to a fixed category list silently turns every label that is not in
# the list into NaN, so check the spelling of the labels before converting.
unexpected_age_labels = set(df['AgeGroup'].dropna().unique()) - set(age_order)
if unexpected_age_labels:
    raise ValueError(
        f"AgeGroup contains labels missing from age_order (they would become NaN): "
        f"{sorted(unexpected_age_labels)}"
    )
df['AgeGroup'] = df['AgeGroup'].astype(age_cat_type)

# Additional Supporting Variables
# Each boolean mask below is stored as a 0/1 integer column of the same name.
indicator_masks = {
    "is_surgical": df["AprMedicalSurgicalDescription"] == "Surgical",
    "high_severity": df["AprSeverityOfIllnessCode"].isin(["3", "4"]),
    "is_emergency": df["EmergencyDepartmentIndicator"] == "Y",
    "is_elective": df["TypeOfAdmission"] == "Elective",
    "discharged_home": df["PatientDisposition"].str.contains("Home", na=False),
    "is_elderly": df["AgeGroup"].isin(["70 or Older"]),
}
for flag_name, mask in indicator_masks.items():
    df[flag_name] = mask.astype(int)

# Log Transform LOS
# NOTE: PREDICTION HAS TO BE TRANSFORMED BACK (np.expm1 is the inverse of np.log1p)
log_length_of_stay = np.log1p(df["LengthOfStay"])
df["LengthOfStay"] = log_length_of_stay

df.sample(5)

In [None]:
# High Cardinality Categorical Variables
categoric_cols = df.select_dtypes(include=['object', 'category']).columns

# Number of distinct non-null values in each categorical column
d = {col: df[col].nunique(dropna=True) for col in categoric_cols}

num_uniquecats_df = pd.DataFrame.from_dict(
    d,
    orient='index',
    columns=['Unique Values'],
)
# Highest-cardinality columns first
num_uniquecats_df.sort_values(by='Unique Values', ascending=False)


In [None]:
# Row count for each PatientDisposition value
df["PatientDisposition"].value_counts()

In [None]:
# Distribution of Length of Stay
import matplotlib.pyplot as plt

# By this point df['LengthOfStay'] holds log1p(days), not raw days (see the
# "Log Transform LOS" step), so the axis and title name that scale.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df['LengthOfStay'], edgecolor='black', alpha=0.7)
ax.set_xlabel('log(1 + Length of Stay in days)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Distribution of log-transformed Length of Stay', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Mixed Effects of Hospital
# Mean (log-transformed) length of stay and number of rows per facility,
# facilities with the longest mean stay first.
per_facility_LOS_df = (
    df[['PermanentFacilityId', 'LengthOfStay']]
    .groupby('PermanentFacilityId')
    .agg(['mean', 'count'])
    .sort_values(by=('LengthOfStay', 'mean'), ascending=False)
)

# Plot the spread of the per-facility means. The previous version re-plotted
# the overall LengthOfStay histogram and never used the table computed above.
facility_mean_los = per_facility_LOS_df[('LengthOfStay', 'mean')].dropna()  # a facility with no rows has no mean
plt.hist(facility_mean_los, edgecolor='black', alpha=0.7)
plt.xlabel('Mean log(1 + Length of Stay in days) per facility')
plt.ylabel('Number of facilities')
plt.title('Distribution of per-facility mean Length of Stay')
plt.show()


In [None]:
# preprocessing() coerces unparseable LengthOfStay values to NaN. A row without
# a target cannot be used for fitting or scoring, so leave such rows out here.
# When there are none, the split is exactly the same as before.
labelled_df = df[df["LengthOfStay"].notna()]

y = labelled_df["LengthOfStay"]
X = labelled_df.drop(columns=["LengthOfStay"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=TRAIN_SIZE,
    random_state=SEED
)


In [None]:
model = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    random_state=SEED
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Scores on the scale the model was trained on, i.e. log1p(days)
print("R²:", r2_score(y_test, y_pred))
print("RMSE:", root_mean_squared_error(y_test, y_pred))

# The target was log1p-transformed, so undo that with expm1 to report the
# error in days as well.
y_test_days = np.expm1(y_test)
y_pred_days = np.expm1(y_pred)
print("R² (days):", r2_score(y_test_days, y_pred_days))
print("RMSE (days):", root_mean_squared_error(y_test_days, y_pred_days))

In [None]:
# Scatter of held-out targets against predictions; both axes are on the log scale
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.3, s=10)

# Diagonal reference line: points on it are predicted exactly
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction')

ax.set_xlabel('Actual log(Length of Stay)', fontsize=12)
ax.set_ylabel('Predicted log(Length of Stay)', fontsize=12)
ax.set_title('Actual vs Predicted Length of Stay', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
R²: 0.4810730988247057
RMSE: 5.824811248622107