# Training a model to predict patient length of stay

## Imports

In [19]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

## Reading Dataset

In [21]:
# Load the raw records from the CSV in the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer="LengthOfStay.csv")
df.head(n=5)

Unnamed: 0,eid,vdate,rcount,gender,dialysisrenalendstage,asthma,irondef,pneum,substancedependence,psychologicaldisordermajor,...,glucose,bloodureanitro,creatinine,bmi,pulse,respiration,secondarydiagnosisnonicd9,discharged,facid,lengthofstay
0,1,8/29/2012,0,F,0,0,0,0,0,0,...,192.476918,12.0,1.390722,30.432418,96,6.5,4,9/1/2012,B,3
1,2,5/26/2012,5+,F,0,0,0,0,0,0,...,94.078507,8.0,0.943164,28.460516,61,6.5,1,6/2/2012,A,7
2,3,9/22/2012,1,F,0,0,0,0,0,0,...,130.530524,12.0,1.06575,28.843812,64,6.5,2,9/25/2012,B,3
3,4,8/9/2012,0,F,0,0,0,0,0,0,...,163.377028,12.0,0.906862,27.959007,76,6.5,1,8/10/2012,A,1
4,5,12/20/2012,0,F,0,0,0,1,0,1,...,94.886654,11.5,1.242854,30.258927,67,5.6,2,12/24/2012,E,4


In [22]:
# Summarise column dtypes and non-null counts of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   eid                         100000 non-null  int64  
 1   vdate                       100000 non-null  object 
 2   rcount                      100000 non-null  object 
 3   gender                      100000 non-null  object 
 4   dialysisrenalendstage       100000 non-null  int64  
 5   asthma                      100000 non-null  int64  
 6   irondef                     100000 non-null  int64  
 7   pneum                       100000 non-null  int64  
 8   substancedependence         100000 non-null  int64  
 9   psychologicaldisordermajor  100000 non-null  int64  
 10  depress                     100000 non-null  int64  
 11  psychother                  100000 non-null  int64  
 12  fibrosisandother            100000 non-null  int64  
 13  malnutrition   

## Type conversions 

In [24]:
# Column groups. The date and categorical groups drive the dtype conversions below;
# cat_cols, bin_cols and num_cols are reused by the preprocessing step further down.
datetime_cols = ["vdate", "discharged"]
cat_cols = ["gender", "rcount"]
bin_cols = ["dialysisrenalendstage",
            "asthma",
            "irondef",
            "pneum",
            "substancedependence",
            "psychologicaldisordermajor",
            "depress",
            "psychother",
            "fibrosisandother",
            "malnutrition",
            "hemo"]
num_cols = ["hematocrit",
            "neutrophils",
            "sodium",
            "glucose",
            "bloodureanitro",
            "creatinine",
            "bmi",
            "respiration"]

# Convert each group in its own pass. The two lists are unrelated, so pairing them
# with zip() would stop at the shorter one and silently skip columns as soon as
# the lists differ in length.
for date_col in datetime_cols:
    # Dates in the CSV are month/day/year strings (e.g. 8/29/2012)
    df[date_col] = pd.to_datetime(df[date_col], format="%m/%d/%Y")

for cat_col in cat_cols:
    df[cat_col] = df[cat_col].astype("category")

df.head()

Unnamed: 0,eid,vdate,rcount,gender,dialysisrenalendstage,asthma,irondef,pneum,substancedependence,psychologicaldisordermajor,...,glucose,bloodureanitro,creatinine,bmi,pulse,respiration,secondarydiagnosisnonicd9,discharged,facid,lengthofstay
0,1,2012-08-29,0,F,0,0,0,0,0,0,...,192.476918,12.0,1.390722,30.432418,96,6.5,4,2012-09-01,B,3
1,2,2012-05-26,5+,F,0,0,0,0,0,0,...,94.078507,8.0,0.943164,28.460516,61,6.5,1,2012-06-02,A,7
2,3,2012-09-22,1,F,0,0,0,0,0,0,...,130.530524,12.0,1.06575,28.843812,64,6.5,2,2012-09-25,B,3
3,4,2012-08-09,0,F,0,0,0,0,0,0,...,163.377028,12.0,0.906862,27.959007,76,6.5,1,2012-08-10,A,1
4,5,2012-12-20,0,F,0,0,0,1,0,1,...,94.886654,11.5,1.242854,30.258927,67,5.6,2,2012-12-24,E,4


In [25]:
# Re-check dtypes after the conversions above: the date and categorical columns should no longer be object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   eid                         100000 non-null  int64         
 1   vdate                       100000 non-null  datetime64[ns]
 2   rcount                      100000 non-null  category      
 3   gender                      100000 non-null  category      
 4   dialysisrenalendstage       100000 non-null  int64         
 5   asthma                      100000 non-null  int64         
 6   irondef                     100000 non-null  int64         
 7   pneum                       100000 non-null  int64         
 8   substancedependence         100000 non-null  int64         
 9   psychologicaldisordermajor  100000 non-null  int64         
 10  depress                     100000 non-null  int64         
 11  psychother                  100000 non-n

## New Feature

In [27]:
# Feature engineering: per-row sum of the binary indicator columns
def calculate_number_of_issues(df, bin_cols):
    """Return a copy of ``df`` with a ``numberofissues`` column added.

    The new column is the row-wise sum of the columns named in ``bin_cols``.
    The input frame is left untouched, so re-running the calling cell (or
    calling this on a frame that is still displayed elsewhere) cannot
    silently change earlier state.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``bin_cols``.
    bin_cols : list of str
        Names of the columns to add up for each row.

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus ``numberofissues``.

    Raises
    ------
    KeyError
        If any name in ``bin_cols`` is not a column of ``df``.
    """
    return df.assign(numberofissues=df[bin_cols].sum(axis=1))

# Attach the engineered numberofissues column to the working frame
df = calculate_number_of_issues(df=df, bin_cols=bin_cols)

In [28]:
# Separate the target column from the inputs; eid, facid and the two date
# columns are excluded from the features as well.
labels = df[["lengthofstay"]]
features = df.drop(columns=["lengthofstay", "discharged", "vdate", "eid", "facid"])
labels.value_counts()

lengthofstay
1               17979
3               16068
4               14822
2               12825
5               12116
6               10362
7                7263
8                4652
9                2184
10               1000
11                460
12                137
13                 75
14                 31
15                 16
16                  6
17                  4
Name: count, dtype: int64

In [29]:
# Confirm the feature frame no longer contains the target, eid, facid, or date columns
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   rcount                      100000 non-null  category
 1   gender                      100000 non-null  category
 2   dialysisrenalendstage       100000 non-null  int64   
 3   asthma                      100000 non-null  int64   
 4   irondef                     100000 non-null  int64   
 5   pneum                       100000 non-null  int64   
 6   substancedependence         100000 non-null  int64   
 7   psychologicaldisordermajor  100000 non-null  int64   
 8   depress                     100000 non-null  int64   
 9   psychother                  100000 non-null  int64   
 10  fibrosisandother            100000 non-null  int64   
 11  malnutrition                100000 non-null  int64   
 12  hemo                        100000 non-null  int64   
 13  

## Preprocessing

In [31]:
# Numeric features: fill gaps with the column median, then standardise
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Binary columns and the engineered numberofissues count go through unchanged
passthrough_cols = bin_cols + ["numberofissues"]

# NOTE(review): feature columns not listed in any group below (pulse and
# secondarydiagnosisnonicd9 appear in the df.head() output but in none of the
# lists) look like they are dropped by ColumnTransformer's default
# remainder="drop" — confirm that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols),
        ("bin", "passthrough", passthrough_cols),
    ]
)

In [32]:
# Hold out 30% of the rows, keeping 70% for training
X_train, X_temp, y_train, y_temp = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

# Halve the held-out rows into validation and test sets (15% of the data each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [33]:
# Gradient-boosted regressor with a fixed seed
xgb_regressor = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

# Bundle preprocessing and the estimator so they are fitted and applied together
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", xgb_regressor),
])

## Training model

In [35]:
# Fit the preprocessing steps and the regressor on the training split only
model.fit(X=X_train, y=y_train)

## Evaluation

In [37]:
# Score the baseline pipeline on the validation split.
# (All metric functions and numpy are imported in the imports cell at the top.)
y_val_pred = model.predict(X_val)

# Calculate metrics
mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units
r2 = r2_score(y_val, y_val_pred)

# Print results
print(f"Validation Mean Absolute Error (MAE): {mae}")
print(f"Validation Mean Squared Error (MSE): {mse}")
print(f"Validation Root Mean Squared Error (RMSE): {rmse}")
print(f"Validation R-squared (R²): {r2}")

Validation Mean Absolute Error (MAE): 0.37675270438194275
Validation Mean Squared Error (MSE): 0.2869267463684082
Validation Root Mean Squared Error (RMSE): 0.5356554362352801
Validation R-squared (R²): 0.9486444592475891


In [38]:
# Hyperparameter values to try; the "regressor__" prefix routes each setting
# to the "regressor" step of the pipeline
param_grid = {
    "regressor__n_estimators": [50, 100, 200],
    "regressor__learning_rate": [0.01, 0.1, 0.2],
    "regressor__max_depth": [3, 5, 7]
}

# Exhaustive search over the grid using 3-fold cross-validation on the training
# split; candidates are ranked by (negated) mean absolute error
grid_search = GridSearchCV(model, param_grid, cv=3, scoring="neg_mean_absolute_error", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate best model on validation set
best_model = grid_search.best_estimator_
y_val_pred_best = best_model.predict(X_val)

# Every tuned-model metric carries the "_best" suffix so the baseline values
# (mae, mse, rmse, r2) computed earlier are not overwritten and the two models
# can still be compared after this cell has run
mae_best = mean_absolute_error(y_val, y_val_pred_best)
mse_best = mean_squared_error(y_val, y_val_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_val, y_val_pred_best)

# Print results
print(f"Validation Mean Absolute Error (MAE): {mae_best}")
print(f"Validation Mean Squared Error (MSE): {mse_best}")
print(f"Validation Root Mean Squared Error (RMSE): {rmse_best}")
print(f"Validation R-squared (R²): {r2_best}")

Best Parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 7, 'regressor__n_estimators': 200}
Validation Mean Absolute Error (MAE): 0.3733857274055481
Validation Mean Squared Error (MSE): 0.29183104634284973
Validation Root Mean Squared Error (RMSE): 0.5402138894390348
Validation R-squared (R²): 0.9477666616439819


In [39]:
# Final hold-out check: score the tuned pipeline on the test split, which was
# not used for fitting or for the grid search
y_test_pred = best_model.predict(X_test)

# Mean absolute error on the test split
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
print(f"Test Mean Absolute Error: {test_mae}")

Test Mean Absolute Error: 0.38201063871383667


In [40]:
# Persist the tuned pipeline (preprocessing + regressor) to the working directory.
# joblib is imported in the imports cell at the top of the notebook.
# NOTE: the file is a pickle — only load it back (joblib.load) from a trusted source.
joblib.dump(best_model, "length_of_stay_model.pkl")

['length_of_stay_model.pkl']