In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [4]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [5]:
X = data.loc[:,'age':'time']
y = data.loc[:,["DEATH_EVENT"]]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Vanilla Random Forrest

In [7]:
rf_vanil = RandomForestClassifier()
rf_vanil.fit(X_train, y_train.values.ravel())
print('Accuracy Score: {}'.format(rf_vanil.score(X_train, y_train))
print('CrossVal Score: {}'.format(rf_vanil.))

1.0

### Model has overfit, run hyperparameter tuning using gridsearch

In [58]:
#Defining the estimator
est = RandomForestClassifier(
    n_jobs=-1
)
#Establishing which hyperparameters to use in the grid search
param_grid = {
    'max_depth':range(2,7),
    'min_samples_leaf':range(5,30,5),
    'max_features':['sqrt','log2']
}
#Creating grid search:
grid_search = GridSearchCV(
    estimator=est,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,cv=5,
    return_train_score=True
)

In [56]:
grid_search.fit(
    X_train,
    y_train.values.ravel()
)

results = pd.DataFrame(grid_search.cv_results_)
print('Best Parameters were: {}'.format(grid_search.best_params_))
print('Best CrossVal Score was: {}'.format(grid_search.best_score_))

Best Parameters are: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 5}
Best CrossVal Score was: 0.8784574468085106


### Vanilla GradientBoost

In [54]:
from sklearn.ensemble import GradientBoostingClassifier

In [55]:
gb_vanil = GradientBoostingClassifier()
gb_vanil.fit(X_train, y_train.values.ravel())
print('Accuracy Score {}'.format(gb_vanil.score(X_train, y_train.values.ravel())))
print('CrossVal Score: {}'.format(cross_val_score(gb_vanil, X_train, y_train.values.ravel())))

Accuracy Score 1.0
CrossVal Score: [0.85416667 0.79166667 0.91666667 0.83333333 0.80851064]


### Model has overfit, run hyperparameter tuning (again) using GridSearch

In [60]:
est = GradientBoostingClassifier()
param_grid = {
    'loss':['deviance','exponential'],
    'n_estimators':range(100,500,100),
    'min_samples_leaf':range(1,10,2),
    'max_depth':range(3,10),
    'max_features':['sqrt','log2']
}
grid_search = GridSearchCV(
    estimator = est,
    param_grid = param_grid,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5,
    return_train_score = True
)

In [63]:
# grid_search.fit(
#     X_train, 
#     y_train.values.ravel()
# )
# results = pd.DataFrame(grid_search.cv_results_)
print('Best Parameters were: {}'.format(grid_search.best_params_))
print('Best CrossVal Scores was: {}'.format(grid_search.best_score_))

Best Parameters were: {'loss': 'deviance', 'max_depth': 7, 'max_features': 'log2', 'min_samples_leaf': 3, 'n_estimators': 200}
Best CrossVal Scores was: 0.8826241134751773
