In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#loading dataset
dataset = pd.read_csv('https://raw.githubusercontent.com/gogzicole/stage-f-07-heart-failure/master/data/heart_failure_clinical_records_dataset.csv')
dataset.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [3]:
dataset.shape

(299, 13)

In [4]:
dataset.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [5]:
# selecting feature matrix and target variable
X = dataset.drop(columns = 'DEATH_EVENT')
y = dataset['DEATH_EVENT']
y.value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import  ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
dtypes: float64(3), int64(9)
memory usage: 28.2 KB


In [8]:
X.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8


In [9]:
# function to engineer features for the serum_creatinine feature
def serum_creatinine(x):
    if (x >= 0.6) & (x<= 1.3):
        return 'normal'
    else:
        return 'abnormal'

In [10]:
# function to engineer features for the serum_sodium feature
def serum_sodium(x):
    if ((x>= 135) & (x <= 145)):
        return 'normal'
    else:
        return 'abnormal'

In [11]:
# function to engineer features for the creatinine_phosphokinase feature
def cpk(x):
    if ((x >= 10) & (x <=120)):
        return 'normal'
    elif ((x > 120) & (x <=250)):
        return 'high'
    else:
        return 'very_high'

In [12]:
#Creating new features by applying our custom functions on appropriate features

X['serum_creatinine_result'] = X['serum_creatinine'].apply(serum_creatinine)

X['serum_sodium_result'] = X['serum_sodium'].apply(serum_sodium)

X['creatinine_phosphokinase_result'] = X['creatinine_phosphokinase'].apply(cpk)

In [13]:
# creating dummy variables for our newly created categorical features

X = pd.get_dummies(X, columns = ['serum_creatinine_result', 'serum_sodium_result', 'creatinine_phosphokinase_result'], drop_first=True)

In [21]:
# X = pd.get_dummies(X, columns = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'], drop_first=True)

In [14]:
X.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,serum_creatinine_result_normal,serum_sodium_result_normal,creatinine_phosphokinase_result_normal,creatinine_phosphokinase_result_very_high
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,0,0,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1,1,0,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1,0,0,0
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,0,1,1,0
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,0,0,0,0


In [17]:
#perform Smote on X and y and reassign the new variables to x_balanced and ya_balanced
import imblearn
from imblearn.over_sampling import SMOTENC
smote_nc = SMOTENC(categorical_features=[1,3,5,9,10,12,13,14,15], random_state=1)
X_balanced, y_balanced = smote_nc.fit_resample(X, y)
X_balanced = pd.DataFrame(X_balanced, columns = X.columns)


In [29]:
X_balanced.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,serum_creatinine_result_normal,serum_sodium_result_normal,creatinine_phosphokinase_result_normal,creatinine_phosphokinase_result_very_high
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,0,0,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1,1,0,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1,0,0,0
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,0,1,1,0
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,0,0,0,0


In [30]:
X_balanced.tail()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,serum_creatinine_result_normal,serum_sodium_result_normal,creatinine_phosphokinase_result_normal,creatinine_phosphokinase_result_very_high
401,73.704778,1,77,0,41,0,363596.244448,3.222867,131,1,0,18,0,1,0,0
402,63.429035,0,126,0,26,0,239090.761695,1.179208,137,0,0,68,0,0,0,0
403,45.678446,1,230,0,17,0,212386.925616,1.381743,135,1,0,68,0,1,0,0
404,85.762836,0,582,0,37,1,263358.03,1.83,134,1,0,96,0,0,0,1
405,64.465106,0,168,0,62,1,262463.898082,1.574885,137,1,1,13,1,1,0,0


In [18]:
# we can see that the numbr of observations increased from 299 to 406
# and also both instances of 1 and 0 in y are now equal
print('the shape of X is: {}'.format(X.shape))
print('the shape of y is: {}'.format(y.shape))
print('the shape of X_balanced is: {}'.format(X_balanced.shape))
print('the shape of y_balanced is: {}'.format(pd.Series(y_balanced).shape))
print('the instances of 1 and 0 in DEATH_EVENT are:')
print(pd.Series(y_balanced).value_counts())

the shape of X is: (299, 16)
the shape of y is: (299,)
the shape of X_balanced is: (406, 16)
the shape of y_balanced is: (406,)
the instances of 1 and 0 in DEATH_EVENT are:
1    203
0    203
Name: DEATH_EVENT, dtype: int64


In [19]:
# X

In [18]:
X.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'serum_creatinine_result_normal', 'serum_sodium_result_normal',
       'creatinine_phosphokinase_result_normal',
       'creatinine_phosphokinase_result_very_high'],
      dtype='object')

In [20]:
from sklearn.metrics import make_scorer, recall_score

In [21]:
make_column_transformer = [0,2,4,6,7,8,11] # This variable specifies the columns that would be scaled, while the rest are passthrough
# make_column_transformer = [1,2,3,4,5,6]

scorers = {
#     'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score)
}

pipe5 = Pipeline([('col_transform', ColumnTransformer(remainder = 'passthrough',
                transformers = [('scaler',MinMaxScaler(),make_column_transformer)])),
                ('grid_search',GridSearchCV(RandomForestClassifier(),
                {'max_depth':range(1,15), 'n_estimators': [50, 100,300,500],
                'max_features': ['auto', 'sqrt', 'log'], 
                'min_samples_split': [2,3,5,7,9],'min_samples_leaf': [1,2,4,6,8]},
                cv = 5, n_jobs = -1, scoring = scorers, refit='recall_score', verbose = 1))])

Xt5 = pipe5.named_steps['col_transform'].fit_transform(X_balanced) #fitting and transforming the smoted feature matrix to the transformer

x_train, x_test,y_train, y_test = train_test_split(Xt5, y_balanced, test_size = 0.2, random_state = 0)

pipe5.fit(x_train,y_train) # perfoming the GridSearchCV by calling the pipe fit method

Fitting 5 folds for each of 4200 candidates, totalling 21000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 29.0min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 369.2min
[Parallel(n_jobs=-1)]: Done 11242 tasks      

Pipeline(steps=[('col_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaler', MinMaxScaler(),
                                                  [0, 2, 4, 6, 7, 8, 11])])),
                ('grid_search',
                 GridSearchCV(cv=5, estimator=RandomForestClassifier(),
                              n_jobs=-1,
                              param_grid={'max_depth': range(1, 15),
                                          'max_features': ['auto', 'sqrt',
                                                           'log'],
                                          'min_samples_leaf': [1, 2, 4, 6, 8],
                                          'min_samples_split': [2, 3, 5, 7, 9],
                                          'n_estimators': [50, 100, 300, 500]},
                              refit='recall_score',
                              scoring={'recall_score': make_scorer(recall_score)},
                      

In [22]:
model5 = pipe5.named_steps['grid_search'].best_estimator_ #Training the RandomForestClassifier with the best parameters from the search
model5.fit(x_train, y_train)
y_pred5 = model5.predict(x_test)

In [23]:
from sklearn.metrics import classification_report, recall_score, accuracy_score,precision_score, f1_score, roc_auc_score
recall5 = recall_score(y_test,y_pred5)
accuracy5 = accuracy_score(y_test,y_pred5)
precision5 = precision_score(y_test,y_pred5)
f1score5 = f1_score(y_test,y_pred5)
aucscore5 = roc_auc_score(y_test,y_pred5)
print('the Recall for smoted RandomForest is:{}'.format(round(recall5,4)))
print(f'the Accuracy for smoted RandomForest is:{round(accuracy5,4)}')
print('the Precision for smoted RandomForest is: %s' %(round(precision5,4)))
print(f'the F1_score for smoted RandomForest is: {round(f1score5,4)}')
print(f'the auc_score for smoted RandomForest is: {round(aucscore5,4)}')

the Recall for smoted RandomForest is:0.9787
the Accuracy for smoted RandomForest is:0.9024
the Precision for smoted RandomForest is: 0.8679
the F1_score for smoted RandomForest is: 0.92
the auc_score for smoted RandomForest is: 0.8894


In [24]:
# k = pd.DataFrame(pipe5.named_steps['grid_search'].cv_results_)
# # pipe5.named_steps['grid_search'].best_estimator_ 
# k

In [25]:
print(classification_report(y_test, y_pred5))

              precision    recall  f1-score   support

           0       0.97      0.80      0.88        35
           1       0.87      0.98      0.92        47

    accuracy                           0.90        82
   macro avg       0.92      0.89      0.90        82
weighted avg       0.91      0.90      0.90        82



In [26]:
# To see the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred5))

[[28  7]
 [ 1 46]]


In [27]:
# saving the model to Disk
import dill
with open('RandomForest_Smotenc.dill','wb') as f:
    dill.dump(model5,f)

In [28]:
#checking if model is saved successfully on disk
!ls -alh RandomForest_Smotenc.dill

-rw-r--r-- 1 CHARLIE None 659K Nov  6 05:26 RandomForest_Smotenc.dill
