In [1]:
# Importing the necessary libraries
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV

In [2]:
hf_data = pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

In [3]:
# Instantiating the models
cbc = CatBoostClassifier()
clf = RandomForestClassifier()

In [4]:
# Splitting the datasets into x and y
x = hf_data.drop(['DEATH_EVENT'], axis = 1)
y = hf_data['DEATH_EVENT']

In [5]:
# Feature Engineering for the RandomForestClassifier model
# Obtaining the best hyper-parameters
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

# Parameter grid
param_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split, 'max_features': max_features}

# Random Search for hyper-parameter optimization
rscv1 = RandomizedSearchCV(clf, param_grid, cv = 5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)

# Fitting the model
rscv1.fit(x, y)

# Obtaining the best parameters
print(rscv1.best_params_)

# Obtaining the best features in decreasing order
feat_imp = rscv1.best_estimator_.feature_importances_
lis = list(x)
sorted(zip(feat_imp, lis), reverse = True)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s


{'n_estimators': 50, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': 'log2'}


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   19.7s finished


[(0.4601272551200512, 'time'),
 (0.15532510860805732, 'serum_creatinine'),
 (0.11615540388479823, 'ejection_fraction'),
 (0.09136788282395537, 'age'),
 (0.054817368981045084, 'serum_sodium'),
 (0.04834831448015749, 'platelets'),
 (0.03969204514566431, 'creatinine_phosphokinase'),
 (0.012393166398357134, 'high_blood_pressure'),
 (0.00869927999718021, 'anaemia'),
 (0.0051201330091735575, 'sex'),
 (0.004227759024862129, 'diabetes'),
 (0.003726282526698087, 'smoking')]

In [6]:
# Feature Engineering for the CatBoostClassifier model
# Obtaining the best hyper-parameters
n_estimators = [50, 100, 300, 500, 1000]
learning_rate = [0.1, 0.01, 0.001]
max_depth = [1, 2, 4, 6, 8]
reg_lambda = [1, 2, 3, 4, 5]
od_wait = [50, 100, 150]

# Parameter grid
param_grid = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth, 'reg_lambda': reg_lambda}

# Random Search for hyper-parameter optimization
rscv = RandomizedSearchCV(cbc, param_grid, cv = 5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)

# Fitting the model
rscv.fit(x, y)

# Obtaining the best estimators
print(rscv.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   25.4s finished


0:	learn: 0.6926002	total: 48.9ms	remaining: 24.4s
1:	learn: 0.6917862	total: 50ms	remaining: 12.4s
2:	learn: 0.6911684	total: 50.8ms	remaining: 8.41s
3:	learn: 0.6902732	total: 51.5ms	remaining: 6.39s
4:	learn: 0.6896367	total: 52.3ms	remaining: 5.17s
5:	learn: 0.6891341	total: 53ms	remaining: 4.36s
6:	learn: 0.6884925	total: 53.7ms	remaining: 3.78s
7:	learn: 0.6879174	total: 54.4ms	remaining: 3.35s
8:	learn: 0.6872427	total: 55.2ms	remaining: 3.01s
9:	learn: 0.6864232	total: 55.9ms	remaining: 2.74s
10:	learn: 0.6857411	total: 56.7ms	remaining: 2.52s
11:	learn: 0.6849600	total: 57.4ms	remaining: 2.33s
12:	learn: 0.6843591	total: 58.1ms	remaining: 2.18s
13:	learn: 0.6837603	total: 58.8ms	remaining: 2.04s
14:	learn: 0.6831318	total: 59.5ms	remaining: 1.92s
15:	learn: 0.6823929	total: 60.3ms	remaining: 1.82s
16:	learn: 0.6816147	total: 61ms	remaining: 1.73s
17:	learn: 0.6811756	total: 61.7ms	remaining: 1.65s
18:	learn: 0.6805009	total: 62.4ms	remaining: 1.58s
19:	learn: 0.6798385	total: 

In [7]:
# Re-instantiating the models with the best hyper-parameters obtained
cbc = CatBoostClassifier(reg_lambda = 4, n_estimators = 500, max_depth = 4, learning_rate = 0.001)
clf = RandomForestClassifier(n_estimators = 50, min_samples_split = 7, min_samples_leaf = 6, max_features = 'log2')

In [8]:
# Instantiating the KFold split  
split = KFold(n_splits=10, shuffle = True, random_state=42)

# Split data into train and test using stratified split to avoid bias
for train_index, test_index in split.split(x, y):
    train = hf_data.loc[train_index]
    test = hf_data.loc[test_index]

In [9]:
# Taking only the best features obtained
xtrain = train[['time', 'ejection_fraction', 'serum_creatinine', 'age', 'serum_sodium', 'high_blood_pressure', 'creatinine_phosphokinase']]
ytrain = train['DEATH_EVENT']
xtest = test[['time', 'ejection_fraction', 'serum_creatinine', 'age', 'serum_sodium', 'high_blood_pressure', 'creatinine_phosphokinase']]
ytest = test['DEATH_EVENT']

In [10]:
# Fitting each model
cbc.fit(xtrain, ytrain) # CatBoostClassifier
clf.fit(xtrain, ytrain) # RandomForestClassifier

0:	learn: 0.6925227	total: 848us	remaining: 424ms
1:	learn: 0.6919526	total: 1.96ms	remaining: 487ms
2:	learn: 0.6913832	total: 2.81ms	remaining: 466ms
3:	learn: 0.6907210	total: 3.55ms	remaining: 440ms
4:	learn: 0.6899161	total: 4.3ms	remaining: 426ms
5:	learn: 0.6892100	total: 5.06ms	remaining: 417ms
6:	learn: 0.6884943	total: 6.03ms	remaining: 425ms
7:	learn: 0.6879882	total: 6.75ms	remaining: 415ms
8:	learn: 0.6873955	total: 7.5ms	remaining: 409ms
9:	learn: 0.6868130	total: 8.25ms	remaining: 404ms
10:	learn: 0.6860281	total: 9.01ms	remaining: 401ms
11:	learn: 0.6853275	total: 9.76ms	remaining: 397ms
12:	learn: 0.6847338	total: 10.5ms	remaining: 393ms
13:	learn: 0.6840158	total: 11.3ms	remaining: 393ms
14:	learn: 0.6832304	total: 12.1ms	remaining: 391ms
15:	learn: 0.6825819	total: 12.9ms	remaining: 390ms
16:	learn: 0.6820937	total: 13.6ms	remaining: 387ms
17:	learn: 0.6813629	total: 14.5ms	remaining: 388ms
18:	learn: 0.6807947	total: 15.2ms	remaining: 385ms
19:	learn: 0.6801556	tota

RandomForestClassifier(max_features='log2', min_samples_leaf=6,
                       min_samples_split=7, n_estimators=50)

In [11]:
# Obtaining the predictions for both models
cbc_pred = cbc.predict(xtest) # CatBoostClassifier Predictions
clf_pred = clf.predict(xtest) # RandomForestClassifier Predictions

In [12]:
# Obtaining the Accuracy Score of both models
print('Accuracy score for CatBoostClassifier = {}'.format(accuracy_score(ytest, cbc_pred)))
print('Accuracy score for RandomForestClassifier = {}'.format(accuracy_score(ytest, clf_pred)))

Accuracy score for CatBoostClassifier = 0.9310344827586207
Accuracy score for RandomForestClassifier = 0.9655172413793104


In [13]:
# Obtaining the F1 Score of both models
print('F1 Score for CatBoostClassifier = {}'.format(f1_score(ytest, cbc_pred)))
print('F1 Score for RandomForestClassifier = {}'.format(f1_score(ytest, clf_pred)))

F1 Score for CatBoostClassifier = 0.8000000000000002
F1 Score for RandomForestClassifier = 0.9090909090909091


In [14]:
#Obtaining the Confusion Matrix of both models
print('Confusion Matrix for CatBoostClassifier = \n{}'.format(confusion_matrix(ytest, cbc_pred)))
print('Confusion Matrix for RandomForestClassifier = \n{}'.format(confusion_matrix(ytest, clf_pred)))

Confusion Matrix for CatBoostClassifier = 
[[23  1]
 [ 1  4]]
Confusion Matrix for RandomForestClassifier = 
[[23  1]
 [ 0  5]]
