In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [80]:
data.columns.to_list()

['age',
 'anaemia',
 'creatinine_phosphokinase',
 'diabetes',
 'ejection_fraction',
 'high_blood_pressure',
 'platelets',
 'serum_creatinine',
 'serum_sodium',
 'sex',
 'smoking',
 'time',
 'DEATH_EVENT']

In [4]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [5]:
X = data.loc[:,'age':'time']
y = data.loc[:,["DEATH_EVENT"]]

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Random Forrest

In [62]:
#Defining the estimator
est = RandomForestClassifier()
#Establishing which hyperparameters to use in the grid search
param_grid = {
    'max_depth':range(2,7),
    'n_estimators':range(10,30)
}
#Creating grid search:
grid_search_rf = GridSearchCV(
    estimator=est,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    return_train_score=True
)
grid_search_rf.fit(
    X_train,
    y_train.values.ravel()
)

results = pd.DataFrame(grid_search_rf.cv_results_)
print('Best Parameters were: {}'.format(grid_search_rf.best_params_))
print('Best CrossVal Score was: {}'.format(grid_search_rf.best_score_))

Best Parameters were: {'max_depth': 6, 'n_estimators': 11}
Best CrossVal Score was: 0.875725900116144


In [69]:
rf_clf = RandomForestClassifier(
    max_depth = 6,
    n_estimators = 11
)
rf_clf.fit(
    X_train,
    y_train.values.ravel()
)
print('Random Forrest Accuracy: {}'.format(rf_clf.score(X_test, y_test)))

Random Forrest Accuracy: 0.8555555555555555


### GradientBoost

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

In [65]:
est = GradientBoostingClassifier()
param_grid = {
    'loss':['deviance','exponential'],
    'max_depth':range(2,7),
    'max_features':['sqrt','log2']
}
grid_search_gb = GridSearchCV(
    estimator = est,
    param_grid = param_grid,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5,
    return_train_score = True
)
grid_search_gb.fit(
    X_train, 
    y_train.values.ravel()
)
results = pd.DataFrame(grid_search_gb.cv_results_)
print('Best Parameters were: {}'.format(grid_search_gb.best_params_))
print('Best CrossVal Score was: {}'.format(grid_search_gb.best_score_))

Best Parameters were: {'loss': 'deviance', 'max_depth': 4, 'max_features': 'log2'}
Best CrossVal Score was: 0.8468060394889664


In [72]:
gb_clf = GradientBoostingClassifier(
    loss = 'deviance',
    max_depth=4,
    max_features='log2'
)
gb_clf.fit(X_train, y_train.values.ravel())
print('Gradient Boost Accuracy: {}'.format(gb_clf.score(X_test, y_test)))

Gradient Boost Accuracy: 0.8333333333333334


In [73]:
data.columns.to_list()

['age',
 'anaemia',
 'creatinine_phosphokinase',
 'diabetes',
 'ejection_fraction',
 'high_blood_pressure',
 'platelets',
 'serum_creatinine',
 'serum_sodium',
 'sex',
 'smoking',
 'time',
 'DEATH_EVENT']

In [75]:
data['time']

0        4
1        6
2        7
3        7
4        8
      ... 
294    270
295    271
296    278
297    280
298    285
Name: time, Length: 299, dtype: int64