In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
import xgboost as xgb

In [3]:
df = pd.read_csv('../data/processed_data.csv')

In [4]:
df.head()

Unnamed: 0,anchor_age,los,drg_severity,drg_mortality,ed_los,is_weekend,seq_num_2.0,seq_num_3.0,seq_num_4.0,seq_num_6.0,...,in_month_May,in_month_November,in_month_October,in_month_September,in_day_Monday,in_day_Saturday,in_day_Sunday,in_day_Thursday,in_day_Tuesday,in_day_Wednesday
0,71,2.301308,2.0,2.0,658.0,0,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False
1,58,1.89375,2.0,1.0,372.0,0,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,48,17.376539,4.0,3.0,384.0,0,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,55,13.884815,4.0,3.0,183.0,0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
4,85,29.218449,3.0,3.0,212.0,1,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [5]:
df.dtypes

anchor_age            int64
los                 float64
drg_severity        float64
drg_mortality       float64
ed_los              float64
                     ...   
in_day_Saturday        bool
in_day_Sunday          bool
in_day_Thursday        bool
in_day_Tuesday         bool
in_day_Wednesday       bool
Length: 135, dtype: object

In [6]:
# change drg_severity and drg_mortality to categorical
df['drg_severity'] = df['drg_severity'].astype('category')
df['drg_mortality'] = df['drg_mortality'].astype('category')

In [7]:
# Avoid log(0) issues
df['log_age'] = np.log(df['anchor_age'] + 1)
df['log_ed_los'] = np.log(df['ed_los'] + 1)

#df['age_squared'] = df['anchor_age'] ** 2
#df['ed_los_squared'] = df['ed_los'] ** 2

df.drop(columns=['anchor_age', 'ed_los'], inplace=True)

In [8]:
#scale numerical features
from sklearn.preprocessing import StandardScaler

numerical_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_features.remove('los')
numerical_features.remove('is_weekend')

In [9]:
numerical_features

['log_age', 'log_ed_los']

In [10]:
Scaler = StandardScaler()
df[numerical_features] = Scaler.fit_transform(df[numerical_features])

In [11]:
X = df.drop(columns=['los'])  # Features
y = df['los']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train.shape

(161, 134)

In [232]:
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5, enable_categorical=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 73.23429566540919
R^2 Score: 0.16557116923439785


In [233]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [234]:
def test_models(X_train, X_test, y_train, y_test):
    models = {
        'Linear Regression (Multi)': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor()
    }
    
    results = {}
    
    for name, model in models.items():
        # Fit the model
        model.fit(X_train, y_train)
        
        # Make predictions on the test set
        y_pred = model.predict(X_test)
        
        # Calculate RMSE and R^2
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r_squared = r2_score(y_test, y_pred)
        
        # Store the results
        results[name] = {'RMSE': rmse, 'R^2': r_squared}
        
        # Print the results
        print(f"{name}: RMSE = {rmse:.2f}, R^2 = {r_squared:.2f}")
        
    return results

# Run the test function with your dataset
results = test_models(X_train, X_test, y_train, y_test)

Linear Regression (Multi): RMSE = 13.18, R^2 = -0.98
Ridge Regression: RMSE = 9.54, R^2 = -0.04
Lasso Regression: RMSE = 8.97, R^2 = 0.08
Random Forest: RMSE = 9.02, R^2 = 0.07
Gradient Boosting: RMSE = 8.56, R^2 = 0.17


In [235]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# 1. Define pipeline
pipeline = Pipeline([
    ('select', SelectKBest(score_func=f_regression)),
    ('model', model)
])
# Define parameter grid
param_grid = {
    'select__k': [40, 55, 75, 100],
    'model__n_estimators': [100,150, 200],
    'model__max_depth': [ 3,4, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.8, 1.0]
}

# Define scoring function (neg_root_mean_squared_error preferred for regression)
#scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

# Set up GridSearch
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='r2',  # Use 'neg_root_mean_squared_error' or R²
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Score (R^2):", grid_search.best_score_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8, 'select__k': 100}
Best Score (R^2): 0.2215035202460506


In [236]:
# 3. Evaluate best model on the true test set
r2_test = best_model.score(X_test, y_test)

print("Final R² on test set:", r2_test)

Final R² on test set: 0.16977612542924259


In [237]:
# Try SVM models
from sklearn.svm import SVR
# 3. Pipeline: standardization + SVR
pipeline = Pipeline([
    ('select',SelectKBest(score_func=f_regression)),
    ('svr', SVR())
])

# 4. Define parameter grid for SVR
param_grid = {
    'select__k': [40, 55, 75, 100, 'all'],
    'svr__C': [0.1, 1, 10],
    'svr__epsilon': [0.1, 0.2, 0.5],
    'svr__kernel': ['rbf', 'linear', 'poly'],
    'svr__gamma': ['scale', 'auto'],
    'svr__degree': [2, 3]  # Only used for poly kernel
}

# 5. GridSearchCV with 5-fold CV and R² scoring
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# 6. Fit the model
grid.fit(X_train, y_train)

# 7. Evaluate
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid.best_params_)
print("Test R² Score:", r2)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Best Parameters: {'select__k': 'all', 'svr__C': 1, 'svr__degree': 2, 'svr__epsilon': 0.1, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}
Test R² Score: 0.24080891877271104


In [238]:
df.head()

Unnamed: 0,los,drg_severity,drg_mortality,is_weekend,seq_num_2.0,seq_num_3.0,seq_num_4.0,seq_num_6.0,seq_num_7.0,seq_num_9.0,...,in_month_October,in_month_September,in_day_Monday,in_day_Saturday,in_day_Sunday,in_day_Thursday,in_day_Tuesday,in_day_Wednesday,log_age,log_ed_los
0,2.301308,2.0,2.0,0,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,0.698252,1.064276
1,1.89375,2.0,1.0,0,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,0.190845,0.569386
2,17.376539,4.0,3.0,0,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,-0.282387,0.59692
3,13.884815,4.0,3.0,0,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,0.057869,-0.045061
4,29.218449,3.0,3.0,1,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,1.151007,0.0822


In [239]:
df.columns

Index(['los', 'drg_severity', 'drg_mortality', 'is_weekend', 'seq_num_2.0',
       'seq_num_3.0', 'seq_num_4.0', 'seq_num_6.0', 'seq_num_7.0',
       'seq_num_9.0',
       ...
       'in_month_October', 'in_month_September', 'in_day_Monday',
       'in_day_Saturday', 'in_day_Sunday', 'in_day_Thursday', 'in_day_Tuesday',
       'in_day_Wednesday', 'log_age', 'log_ed_los'],
      dtype='object', length=135)

In [241]:
import joblib
joblib.dump(best_model, '../models/svr_model.pkl')

['../models/svr_model.pkl']