## Multiple Chronic Conditions Machine Learning Modeling

### Import relevant libraries

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

##### Exclude XGBOOST and LGBM from Lazypredict assessment

In [2]:
from lazypredict import Supervised
Supervised.REGRESSORS.pop(40)
Supervised.REGRESSORS.pop(40)
LazyRegressor = Supervised.LazyRegressor

### Load MCC Datasets

In [3]:
# Load MCC specific data
mcc_df = pd.read_csv("ml_processed_mcc_data_2017_2018.csv")

### Data Exploration

In [4]:
mcc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5858 entries, 0 to 5857
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Bene_Geo_Lvl              5858 non-null   object 
 1   Bene_Geo_Desc             5858 non-null   object 
 2   Bene_Geo_Cd               5786 non-null   float64
 3   Bene_Age_Lvl              5858 non-null   object 
 4   Bene_Demo_Lvl             5858 non-null   object 
 5   Bene_Demo_Desc            5858 non-null   object 
 6   Bene_MCC                  5858 non-null   object 
 7   Prvlnc                    5858 non-null   float64
 8   Tot_Mdcr_Stdzd_Pymt_PC    5858 non-null   float64
 9   Tot_Mdcr_Pymt_PC          5858 non-null   float64
 10  Hosp_Readmsn_Rate         5858 non-null   float64
 11  ER_Visits_Per_1000_Benes  5710 non-null   float64
 12  year                      5858 non-null   int64  
 13  Age_Group                 5858 non-null   int64  
 14  Sex     

In [5]:
mcc_df.head()

Unnamed: 0,Bene_Geo_Lvl,Bene_Geo_Desc,Bene_Geo_Cd,Bene_Age_Lvl,Bene_Demo_Lvl,Bene_Demo_Desc,Bene_MCC,Prvlnc,Tot_Mdcr_Stdzd_Pymt_PC,Tot_Mdcr_Pymt_PC,...,year,Age_Group,Sex,Medicare_Type,Non-Hispanic_White,Non-Hispanic_Black,Hispanic,Asian_Pacific_Islander,Native_American,MCC_Group
0,State,Alabama,1.0,65+,Dual Status,Medicare Only,0 to 1,0.25,1982.66,1790.6,...,2017,1,-1,0,-1,-1,-1,-1,-1,0
1,State,Alabama,1.0,65+,Dual Status,Medicare Only,2 to 3,0.3,5113.12,4665.47,...,2017,1,-1,0,-1,-1,-1,-1,-1,1
2,State,Alabama,1.0,65+,Dual Status,Medicare Only,4 to 5,0.26,9650.47,8806.65,...,2017,1,-1,0,-1,-1,-1,-1,-1,2
3,State,Alabama,1.0,65+,Dual Status,Medicare Only,6+,0.19,26061.12,23776.42,...,2017,1,-1,0,-1,-1,-1,-1,-1,3
4,State,Alabama,1.0,65+,Dual Status,Medicare and Medicaid,2 to 3,0.22,5864.61,5306.28,...,2017,1,-1,1,-1,-1,-1,-1,-1,1


### ML Modeling

#### Create data splits

In [6]:
X = mcc_df.drop('Hosp_Readmsn_Rate', axis=1).copy()
y = mcc_df['Hosp_Readmsn_Rate'].copy()

In [7]:
# Create main split of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [8]:
# Create second split using initial training data
XX_train, XX_test, yy_train, yy_test = train_test_split(X_train, y_train, test_size=0.15, random_state=50)

In [9]:
X_train.shape

(4686, 21)

In [10]:
XX_train.shape

(3983, 21)

#### Define input features

In [11]:
ml_input_feats = ['Age_Group', 'Sex', 'Medicare_Type', 'Non-Hispanic_White', 'Non-Hispanic_Black', 'Hispanic', 'Native_American', 'MCC_Group']

#### Algorithm selection

In [12]:
reg = LazyRegressor(verbose=-1, ignore_warnings=False, custom_metric=None, random_state=50)

In [13]:
%%time
results, _ = reg.fit(XX_train[ml_input_feats], XX_test[ml_input_feats], yy_train, yy_test)

 18%|█▊        | 7/40 [00:00<00:00, 66.28it/s]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfGammaLoss'.


 85%|████████▌ | 34/40 [00:03<00:00, 12.47it/s]

QuantileRegressor model failed to execute
Solver interior-point is not anymore available in SciPy >= 1.11.0.


100%|██████████| 40/40 [00:03<00:00, 13.01it/s]

CPU times: user 20.3 s, sys: 11.2 s, total: 31.5 s
Wall time: 3.1 s





In [14]:
results

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,0.9,0.9,0.03,0.5
ExtraTreesRegressor,0.9,0.9,0.03,0.15
DecisionTreeRegressor,0.9,0.9,0.03,0.0
ExtraTreeRegressor,0.9,0.9,0.03,0.01
GaussianProcessRegressor,0.9,0.9,0.03,0.65
RandomForestRegressor,0.9,0.9,0.03,0.15
BaggingRegressor,0.89,0.9,0.03,0.02
NuSVR,0.89,0.89,0.03,0.54
GradientBoostingRegressor,0.89,0.89,0.03,0.11
MLPRegressor,0.88,0.89,0.03,0.11


###### Selecting the fastest of the top performing algorithms gives Random Forest as the optimal algorithm from the considered list

###### Also, we cannot cast this problem as a classification task because we do not have enough representation for cases greater than 0.5 threshold of readadmission rate

#### Random Forest Model

##### Build base model

In [15]:
rf_model = RandomForestRegressor(random_state=50)

In [16]:
rf_model.fit(XX_train[ml_input_feats],yy_train)

In [17]:
yy_pred_rf = rf_model.predict(XX_test[ml_input_feats])

##### Assess base model

In [99]:
r2_rf = np.round(r2_score(yy_test,yy_pred_rf),4)
r2_rf

0.8965

In [98]:
mae_rf = np.round(mean_absolute_error(yy_test, yy_pred_rf),4)
mae_rf

0.0174

In [97]:
np.round(np.corrcoef(yy_test, yy_pred_rf)[0,1],4)

0.9469

In [24]:
rf_model.feature_importances_

array([0.1155408 , 0.00251644, 0.00879959, 0.00551114, 0.00847762,
       0.00377142, 0.01259826, 0.84278473])

In [26]:
np.array(ml_input_feats)

array(['Age_Group', 'Sex', 'Medicare_Type', 'Non-Hispanic_White',
       'Non-Hispanic_Black', 'Hispanic', 'Native_American', 'MCC_Group'],
      dtype='<U18')

#### Optimize hyper-parameters

In [139]:
estimator_rf = RandomForestRegressor(random_state=50)

In [140]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 50)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3, 30, num = 10)] + [None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 7]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [141]:
# Create the random parameter grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [142]:
rf_rnd = RandomizedSearchCV(estimator = estimator_rf, param_distributions = param_grid, n_iter = 100000, cv = 3, verbose=0,
                            random_state=50, n_jobs = -1, scoring='neg_mean_absolute_error')

In [143]:
%%time
rf_rnd.fit(XX_train[ml_input_feats], yy_train)

CPU times: user 26.3 s, sys: 6.6 s, total: 32.9 s
Wall time: 17h 4min 3s


In [144]:
rf_rnd.best_score_

-0.017830826169762757

In [145]:
rf_rnd.best_params_

{'n_estimators': 485,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 12,
 'bootstrap': False}

In [146]:
rf_rnd.best_params_

{'n_estimators': 485,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 12,
 'bootstrap': False}

##### Build new model using optimized hyper-parameters

In [147]:
rf_model2 = RandomForestRegressor(**rf_rnd.best_params_,random_state=50)

In [148]:
%%time
rf_model2.fit(XX_train[ml_input_feats], yy_train)

CPU times: user 371 ms, sys: 5.34 ms, total: 377 ms
Wall time: 379 ms


In [149]:
yy_pred_rf2 = rf_model2.predict(XX_test[ml_input_feats])

##### Assess optimized model

In [150]:
r2_rf2 = np.round(r2_score(yy_test, yy_pred_rf2),4)
r2_rf2

0.8965

In [151]:
mae_rf2 = np.round(mean_absolute_error(yy_test, yy_pred_rf2),4)
mae_rf2

0.0174

In [152]:
np.round(np.corrcoef(yy_test, yy_pred_rf2)[0,1],4)

0.9469

In [153]:
rf_model2.feature_importances_

array([0.11344928, 0.00265056, 0.0088521 , 0.00537114, 0.00597519,
       0.00279064, 0.00621391, 0.85469718])

In [154]:
np.array(ml_input_feats)

array(['Age_Group', 'Sex', 'Medicare_Type', 'Non-Hispanic_White',
       'Non-Hispanic_Black', 'Hispanic', 'Native_American', 'MCC_Group'],
      dtype='<U18')