In [1]:
# Importing the necessary libraries
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV

In [2]:
hf_data = pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

In [3]:
# Instantiating the models
cbc = CatBoostClassifier()
clf = RandomForestClassifier()

In [4]:
# Splitting the datasets into x and y
x = hf_data.drop(['DEATH_EVENT'], axis = 1)
y = hf_data['DEATH_EVENT']

In [5]:
# Feature Engineering for the RandomForestClassifier model
# Obtaining the best hyper-parameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

# Parameter grid
param_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features': max_features}

# Random Search for hyper-parameter optimization
rscv1 = RandomizedSearchCV(clf, param_grid, cv = 5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)

# Fitting the model
rscv1.fit(x, y)

# Obtaining the best parameters
print(rscv1.best_params_)

# Obtaining the best features in decreasing order
feat_imp = rscv1.best_estimator_.feature_importances_
lis = list(x)
sorted(zip(feat_imp, lis), reverse = True)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s


{'n_estimators': 50, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': 'log2'}


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   19.7s finished


[(0.4601272551200512, 'time'),
 (0.15532510860805732, 'serum_creatinine'),
 (0.11615540388479823, 'ejection_fraction'),
 (0.09136788282395537, 'age'),
 (0.054817368981045084, 'serum_sodium'),
 (0.04834831448015749, 'platelets'),
 (0.03969204514566431, 'creatinine_phosphokinase'),
 (0.012393166398357134, 'high_blood_pressure'),
 (0.00869927999718021, 'anaemia'),
 (0.0051201330091735575, 'sex'),
 (0.004227759024862129, 'diabetes'),
 (0.003726282526698087, 'smoking')]

In [6]:
# Feature Engineering for the CatBoostClassifier model
# Obtaining the best hyper-parameters
n_estimators = [50, 100, 300, 500, 1000]
learning_rate = [0.1, 0.01, 0.001]
max_depth = [1, 2, 4, 6, 8]
reg_lambda = [1, 2, 3, 4, 5]
od_wait = [50, 100, 150]

# Parameter grid
param_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth, 'reg_lambda': reg_lambda}

# Random Search for hyper-parameter optimization
rscv = RandomizedSearchCV(cbc, param_grid, cv = 5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)

# Fitting the model
rscv.fit(x, y)

# Obtaining the best estimators
print(rscv.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   25.4s finished


0:	learn: 0.6926002	total: 48.9ms	remaining: 24.4s
1:	learn: 0.6917862	total: 50ms	remaining: 12.4s
2:	learn: 0.6911684	total: 50.8ms	remaining: 8.41s
3:	learn: 0.6902732	total: 51.5ms	remaining: 6.39s
4:	learn: 0.6896367	total: 52.3ms	remaining: 5.17s
5:	learn: 0.6891341	total: 53ms	remaining: 4.36s
6:	learn: 0.6884925	total: 53.7ms	remaining: 3.78s
7:	learn: 0.6879174	total: 54.4ms	remaining: 3.35s
8:	learn: 0.6872427	total: 55.2ms	remaining: 3.01s
9:	learn: 0.6864232	total: 55.9ms	remaining: 2.74s
10:	learn: 0.6857411	total: 56.7ms	remaining: 2.52s
11:	learn: 0.6849600	total: 57.4ms	remaining: 2.33s
12:	learn: 0.6843591	total: 58.1ms	remaining: 2.18s
13:	learn: 0.6837603	total: 58.8ms	remaining: 2.04s
14:	learn: 0.6831318	total: 59.5ms	remaining: 1.92s
15:	learn: 0.6823929	total: 60.3ms	remaining: 1.82s
16:	learn: 0.6816147	total: 61ms	remaining: 1.73s
17:	learn: 0.6811756	total: 61.7ms	remaining: 1.65s
18:	learn: 0.6805009	total: 62.4ms	remaining: 1.58s
19:	learn: 0.6798385	total: 

233:	learn: 0.5600286	total: 227ms	remaining: 258ms
234:	learn: 0.5596663	total: 228ms	remaining: 257ms
235:	learn: 0.5591974	total: 229ms	remaining: 256ms
236:	learn: 0.5586360	total: 230ms	remaining: 255ms
237:	learn: 0.5581286	total: 230ms	remaining: 254ms
238:	learn: 0.5576698	total: 231ms	remaining: 252ms
239:	learn: 0.5572105	total: 232ms	remaining: 251ms
240:	learn: 0.5567262	total: 233ms	remaining: 250ms
241:	learn: 0.5563584	total: 233ms	remaining: 249ms
242:	learn: 0.5558753	total: 234ms	remaining: 248ms
243:	learn: 0.5553962	total: 235ms	remaining: 246ms
244:	learn: 0.5549498	total: 236ms	remaining: 245ms
245:	learn: 0.5544530	total: 236ms	remaining: 244ms
246:	learn: 0.5540628	total: 237ms	remaining: 243ms
247:	learn: 0.5536349	total: 238ms	remaining: 242ms
248:	learn: 0.5531533	total: 238ms	remaining: 240ms
249:	learn: 0.5527447	total: 239ms	remaining: 239ms
250:	learn: 0.5524214	total: 240ms	remaining: 238ms
251:	learn: 0.5518998	total: 241ms	remaining: 237ms
252:	learn: 

473:	learn: 0.4718215	total: 405ms	remaining: 22.2ms
474:	learn: 0.4715994	total: 406ms	remaining: 21.4ms
475:	learn: 0.4713828	total: 407ms	remaining: 20.5ms
476:	learn: 0.4710687	total: 408ms	remaining: 19.7ms
477:	learn: 0.4707654	total: 409ms	remaining: 18.8ms
478:	learn: 0.4703852	total: 409ms	remaining: 17.9ms
479:	learn: 0.4701459	total: 410ms	remaining: 17.1ms
480:	learn: 0.4698140	total: 411ms	remaining: 16.2ms
481:	learn: 0.4695256	total: 412ms	remaining: 15.4ms
482:	learn: 0.4692002	total: 412ms	remaining: 14.5ms
483:	learn: 0.4689177	total: 413ms	remaining: 13.7ms
484:	learn: 0.4686204	total: 414ms	remaining: 12.8ms
485:	learn: 0.4682949	total: 415ms	remaining: 11.9ms
486:	learn: 0.4680674	total: 415ms	remaining: 11.1ms
487:	learn: 0.4677469	total: 416ms	remaining: 10.2ms
488:	learn: 0.4675031	total: 417ms	remaining: 9.38ms
489:	learn: 0.4671111	total: 418ms	remaining: 8.53ms
490:	learn: 0.4668368	total: 419ms	remaining: 7.67ms
491:	learn: 0.4665298	total: 419ms	remaining: 

In [7]:
# Re-instantiating the models with the best hyper-parameters obtained
cbc = CatBoostClassifier(reg_lambda = 4, n_estimators = 500, max_depth = 4, learning_rate = 0.001)
clf = RandomForestClassifier(n_estimators = 50, min_samples_split = 7, min_samples_leaf = 6, max_features = 'log2')

In [8]:
# Instantiating the KFold split  
split = KFold(n_splits=10, shuffle = True, random_state=42)

# Split data into train and test using stratified split to avoid bias
for train_index, test_index in split.split(x, y):
    train = hf_data.loc[train_index]
    test = hf_data.loc[test_index]

In [9]:
# Taking only the best features obtained
xtrain = train[['time', 'ejection_fraction', 'serum_creatinine', 'age', 'serum_sodium', 'high_blood_pressure', 'creatinine_phosphokinase']]
ytrain = train['DEATH_EVENT']
xtest = test[['time', 'ejection_fraction', 'serum_creatinine', 'age', 'serum_sodium', 'high_blood_pressure', 'creatinine_phosphokinase']]
ytest = test['DEATH_EVENT']

In [10]:
# Fitting each model
cbc.fit(xtrain, ytrain) # CatBoostClassifier
clf.fit(xtrain, ytrain) # RandomForestClassifier

0:	learn: 0.6925227	total: 848us	remaining: 424ms
1:	learn: 0.6919526	total: 1.96ms	remaining: 487ms
2:	learn: 0.6913832	total: 2.81ms	remaining: 466ms
3:	learn: 0.6907210	total: 3.55ms	remaining: 440ms
4:	learn: 0.6899161	total: 4.3ms	remaining: 426ms
5:	learn: 0.6892100	total: 5.06ms	remaining: 417ms
6:	learn: 0.6884943	total: 6.03ms	remaining: 425ms
7:	learn: 0.6879882	total: 6.75ms	remaining: 415ms
8:	learn: 0.6873955	total: 7.5ms	remaining: 409ms
9:	learn: 0.6868130	total: 8.25ms	remaining: 404ms
10:	learn: 0.6860281	total: 9.01ms	remaining: 401ms
11:	learn: 0.6853275	total: 9.76ms	remaining: 397ms
12:	learn: 0.6847338	total: 10.5ms	remaining: 393ms
13:	learn: 0.6840158	total: 11.3ms	remaining: 393ms
14:	learn: 0.6832304	total: 12.1ms	remaining: 391ms
15:	learn: 0.6825819	total: 12.9ms	remaining: 390ms
16:	learn: 0.6820937	total: 13.6ms	remaining: 387ms
17:	learn: 0.6813629	total: 14.5ms	remaining: 388ms
18:	learn: 0.6807947	total: 15.2ms	remaining: 385ms
19:	learn: 0.6801556	tota

236:	learn: 0.5639749	total: 178ms	remaining: 197ms
237:	learn: 0.5634606	total: 178ms	remaining: 196ms
238:	learn: 0.5629603	total: 179ms	remaining: 196ms
239:	learn: 0.5624298	total: 180ms	remaining: 195ms
240:	learn: 0.5619139	total: 181ms	remaining: 194ms
241:	learn: 0.5615849	total: 182ms	remaining: 194ms
242:	learn: 0.5610252	total: 182ms	remaining: 193ms
243:	learn: 0.5607145	total: 183ms	remaining: 192ms
244:	learn: 0.5605620	total: 184ms	remaining: 191ms
245:	learn: 0.5600947	total: 184ms	remaining: 190ms
246:	learn: 0.5597853	total: 185ms	remaining: 190ms
247:	learn: 0.5593951	total: 186ms	remaining: 189ms
248:	learn: 0.5589399	total: 186ms	remaining: 188ms
249:	learn: 0.5586056	total: 187ms	remaining: 187ms
250:	learn: 0.5582489	total: 188ms	remaining: 186ms
251:	learn: 0.5577941	total: 189ms	remaining: 186ms
252:	learn: 0.5572592	total: 189ms	remaining: 185ms
253:	learn: 0.5568856	total: 190ms	remaining: 184ms
254:	learn: 0.5564561	total: 191ms	remaining: 183ms
255:	learn: 

481:	learn: 0.4788314	total: 355ms	remaining: 13.3ms
482:	learn: 0.4785257	total: 356ms	remaining: 12.5ms
483:	learn: 0.4781609	total: 357ms	remaining: 11.8ms
484:	learn: 0.4778794	total: 358ms	remaining: 11.1ms
485:	learn: 0.4775800	total: 358ms	remaining: 10.3ms
486:	learn: 0.4773290	total: 359ms	remaining: 9.59ms
487:	learn: 0.4769774	total: 360ms	remaining: 8.85ms
488:	learn: 0.4767076	total: 361ms	remaining: 8.11ms
489:	learn: 0.4764367	total: 361ms	remaining: 7.37ms
490:	learn: 0.4762201	total: 362ms	remaining: 6.64ms
491:	learn: 0.4758617	total: 363ms	remaining: 5.9ms
492:	learn: 0.4755480	total: 364ms	remaining: 5.16ms
493:	learn: 0.4753710	total: 364ms	remaining: 4.42ms
494:	learn: 0.4750380	total: 365ms	remaining: 3.69ms
495:	learn: 0.4747791	total: 366ms	remaining: 2.95ms
496:	learn: 0.4745337	total: 366ms	remaining: 2.21ms
497:	learn: 0.4742562	total: 367ms	remaining: 1.47ms
498:	learn: 0.4739253	total: 368ms	remaining: 736us
499:	learn: 0.4736775	total: 368ms	remaining: 0u

RandomForestClassifier(max_features='log2', min_samples_leaf=6,
                       min_samples_split=7, n_estimators=50)

In [11]:
# Obtaining the predictions for both models
cbc_pred = cbc.predict(xtest) # CatBoostClassifier Predictions
clf_pred = clf.predict(xtest) # RandomForestClassifier Predictions

In [12]:
# Obtaining the Accuracy Score of both models
print('Accuracy score for CatBoostClassifier = {}'.format(accuracy_score(ytest, cbc_pred)))
print('Accuracy score for RandomForestClassifier = {}'.format(accuracy_score(ytest, clf_pred)))

Accuracy score for CatBoostClassifier = 0.9310344827586207
Accuracy score for RandomForestClassifier = 0.9655172413793104


In [13]:
# Obtaining the F1 Score of both models
print('F1 Score for CatBoostClassifier = {}'.format(f1_score(ytest, cbc_pred)))
print('F1 Score for RandomForestClassifier = {}'.format(f1_score(ytest, clf_pred)))

F1 Score for CatBoostClassifier = 0.8000000000000002
F1 Score for RandomForestClassifier = 0.9090909090909091


In [14]:
#Obtaining the Confusion Matrix of both models
print('Confusion Matrix for CatBoostClassifier = \n{}'.format(confusion_matrix(ytest, cbc_pred)))
print('Confusion Matrix for RandomForestClassifier = \n{}'.format(confusion_matrix(ytest, clf_pred)))

Confusion Matrix for CatBoostClassifier = 
[[23  1]
 [ 1  4]]
Confusion Matrix for RandomForestClassifier = 
[[23  1]
 [ 0  5]]
