## Multiple Chronic Conditions Machine Learning Modeling

### Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
import lightgbm

In [3]:
print(lightgbm.__version__)

4.3.0


### Load MCC Datasets

In [4]:
# Load MCC specific data
mcc_df = pd.read_csv("ml_processed_mcc_data_2017_2018.csv")

### Data Exploration

In [5]:
mcc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5858 entries, 0 to 5857
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Bene_Geo_Lvl              5858 non-null   object 
 1   Bene_Geo_Desc             5858 non-null   object 
 2   Bene_Geo_Cd               5786 non-null   float64
 3   Bene_Age_Lvl              5858 non-null   object 
 4   Bene_Demo_Lvl             5858 non-null   object 
 5   Bene_Demo_Desc            5858 non-null   object 
 6   Bene_MCC                  5858 non-null   object 
 7   Prvlnc                    5858 non-null   float64
 8   Tot_Mdcr_Stdzd_Pymt_PC    5858 non-null   float64
 9   Tot_Mdcr_Pymt_PC          5858 non-null   float64
 10  Hosp_Readmsn_Rate         5858 non-null   float64
 11  ER_Visits_Per_1000_Benes  5710 non-null   float64
 12  year                      5858 non-null   int64  
 13  Age_Group                 5858 non-null   int64  
 14  Sex     

In [6]:
mcc_df.head()

Unnamed: 0,Bene_Geo_Lvl,Bene_Geo_Desc,Bene_Geo_Cd,Bene_Age_Lvl,Bene_Demo_Lvl,Bene_Demo_Desc,Bene_MCC,Prvlnc,Tot_Mdcr_Stdzd_Pymt_PC,Tot_Mdcr_Pymt_PC,...,year,Age_Group,Sex,Medicare_Type,Non-Hispanic_White,Non-Hispanic_Black,Hispanic,Asian_Pacific_Islander,Native_American,MCC_Group
0,State,Alabama,1.0,65+,Dual Status,Medicare Only,0 to 1,0.2473,1982.6565,1790.6022,...,2017,1,-1,0,-1,-1,-1,-1,-1,0
1,State,Alabama,1.0,65+,Dual Status,Medicare Only,2 to 3,0.3028,5113.1152,4665.4711,...,2017,1,-1,0,-1,-1,-1,-1,-1,1
2,State,Alabama,1.0,65+,Dual Status,Medicare Only,4 to 5,0.2578,9650.4733,8806.6521,...,2017,1,-1,0,-1,-1,-1,-1,-1,2
3,State,Alabama,1.0,65+,Dual Status,Medicare Only,6+,0.1921,26061.123,23776.419,...,2017,1,-1,0,-1,-1,-1,-1,-1,3
4,State,Alabama,1.0,65+,Dual Status,Medicare and Medicaid,2 to 3,0.2236,5864.6106,5306.2806,...,2017,1,-1,1,-1,-1,-1,-1,-1,1


### ML Modeling

#### Create data splits

In [7]:
X = mcc_df.drop('Hosp_Readmsn_Rate', axis=1).copy()
y = mcc_df['Hosp_Readmsn_Rate'].copy()

In [8]:
# Create main split of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [9]:
# Create second split using initial training data
XX_train, XX_test, yy_train, yy_test = train_test_split(X_train, y_train, test_size=0.15, random_state=50)

In [10]:
X_train.shape

(4686, 21)

In [11]:
XX_train.shape

(3983, 21)

#### Define input features

In [12]:
ml_input_feats = ['Age_Group', 'Sex', 'Medicare_Type', 'Non-Hispanic_White', 'Non-Hispanic_Black', 'Hispanic', 'Native_American', 'MCC_Group']

#### LightGBM Model

##### Build base model

In [15]:
lgbm_model = LGBMRegressor(random_state=50, force_row_wise=True, verbose=-1)

In [16]:
%%time
lgbm_model.fit(XX_train[ml_input_feats].fillna(-1),yy_train)

CPU times: user 706 ms, sys: 230 ms, total: 936 ms
Wall time: 97.2 ms


In [20]:
yy_pred_lgbm = lgbm_model.predict(XX_test[ml_input_feats])

##### Assess base model

In [21]:
r2_lgbm = np.round(r2_score(yy_test,yy_pred_lgbm),4)
r2_lgbm

0.8966

In [24]:
mae_lgbm = np.round(mean_absolute_error(yy_test, yy_pred_lgbm),4)
mae_lgbm

0.0174

In [25]:
np.round(np.corrcoef(yy_test, yy_pred_lgbm)[0,1],4)

0.9469

In [30]:
lgbm_model.feature_importances_/np.sum(lgbm_model.feature_importances_)

array([0.124     , 0.12666667, 0.16133333, 0.15466667, 0.09066667,
       0.05566667, 0.092     , 0.195     ])

In [27]:
np.array(ml_input_feats)

array(['Age_Group', 'Sex', 'Medicare_Type', 'Non-Hispanic_White',
       'Non-Hispanic_Black', 'Hispanic', 'Native_American', 'MCC_Group'],
      dtype='<U18')

#### Optimize hyper-parameters

In [101]:
estimator_lgbm = LGBMRegressor(random_state=50, force_row_wise=True, verbose=-1)

In [115]:
from scipy.stats import randint
from scipy.stats import uniform
param_grid ={'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 1000, num = 50)],
             'max_depth': [int(x) for x in np.linspace(3, 30, num = 10)] + [None],
             'num_leaves': [int(x) for x in np.linspace(start = 500, stop = 2000, num = 250)], 
             'min_child_samples': randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'feature_fraction': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
             'bagging_fraction': [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]}

In [116]:
lgbm_rnd = RandomizedSearchCV(estimator = estimator_lgbm, param_distributions = param_grid, n_iter = 1000,
                              cv = 3, verbose=0, random_state=50, n_jobs = -1, scoring='neg_mean_absolute_error')

In [117]:
%%time
lgbm_rnd.fit(XX_train[ml_input_feats], yy_train)

CPU times: user 6.03 s, sys: 2.69 s, total: 8.72 s
Wall time: 19min 26s


In [118]:
lgbm_rnd.best_score_

-0.01791887448403547

In [119]:
lgbm_rnd.best_params_

{'bagging_fraction': 0.7,
 'colsample_bytree': 0.8300182737501955,
 'feature_fraction': 0.85,
 'max_depth': 9,
 'min_child_samples': 107,
 'min_child_weight': 1,
 'n_estimators': 908,
 'num_leaves': 1626,
 'reg_alpha': 0,
 'reg_lambda': 20,
 'subsample': 0.5152617947907717}

##### Build new model using optimized hyper-parameters

In [120]:
lgbm_model2 = LGBMRegressor(**lgbm_rnd.best_params_,random_state=50, force_row_wise=True, verbose=-1)

In [121]:
%%time
lgbm_model2.fit(XX_train[ml_input_feats], yy_train)

CPU times: user 4.16 s, sys: 1.22 s, total: 5.37 s
Wall time: 470 ms


In [122]:
yy_pred_lgbm2 = lgbm_model2.predict(XX_test[ml_input_feats])

##### Assess optimized model

In [123]:
r2_lgbm2 = np.round(r2_score(yy_test, yy_pred_lgbm2),4)
r2_lgbm2

0.8966

In [124]:
mae_lgbm2 = np.round(mean_absolute_error(yy_test, yy_pred_lgbm2),4)
mae_lgbm2

0.0174

In [125]:
np.round(np.corrcoef(yy_test, yy_pred_lgbm2)[0,1],4)

0.947

In [126]:
lgbm_model2.feature_importances_/np.sum(lgbm_model2.feature_importances_)

array([0.11985578, 0.19356168, 0.08452228, 0.21560649, 0.07571465,
       0.04331702, 0.03095545, 0.23646665])

In [127]:
np.array(ml_input_feats)

array(['Age_Group', 'Sex', 'Medicare_Type', 'Non-Hispanic_White',
       'Non-Hispanic_Black', 'Hispanic', 'Native_American', 'MCC_Group'],
      dtype='<U18')