In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#loading dataset
dataset = pd.read_csv('https://raw.githubusercontent.com/gogzicole/stage-f-07-heart-failure/master/data/heart_failure_clinical_records_dataset.csv')
dataset.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [3]:
# selecting feature matrix and target variable
X = dataset.drop(columns = 'DEATH_EVENT')
y = dataset['DEATH_EVENT']
y.value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import  ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [5]:
X.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time'],
      dtype='object')

In [7]:
# to check properties of serum_sodium
X.serum_sodium.describe()

count    299.000000
mean     136.625418
std        4.412477
min      113.000000
25%      134.000000
50%      137.000000
75%      140.000000
max      148.000000
Name: serum_sodium, dtype: float64

In [8]:
# to check properties of ejection_fraction
X.ejection_fraction.describe()

count    299.000000
mean      38.083612
std       11.834841
min       14.000000
25%       30.000000
50%       38.000000
75%       45.000000
max       80.000000
Name: ejection_fraction, dtype: float64

In [9]:
# to check properties of creatinine_phosphokinase
X.creatinine_phosphokinase.describe()

count     299.000000
mean      581.839465
std       970.287881
min        23.000000
25%       116.500000
50%       250.000000
75%       582.000000
max      7861.000000
Name: creatinine_phosphokinase, dtype: float64

In [6]:
# trialz = X.copy()

In [7]:
# function to engineer features for the serum_creatinine feature
def serum_creatinine(x):
    if (x >= 0.6) & (x<= 1.3):
        return 'normal'
    else:
        return 'abnormal'
    

In [8]:
# function to engineer features for the serum_sodium feature
def serum_sodium(x):
    if ((x>= 135) & (x <= 145)):
        return 'normal'
    else:
        return 'abnormal'
    

In [9]:
# function to engineer features for the creatinine_phosphokinase feature
def cpk(x):
    if ((x >= 10) & (x <=120)):
        return 'normal'
    elif ((x > 120) & (x <=250)):
        return 'high'
    else:
        return 'very_high'
    

In [11]:
#Creating new features by applying our custom functions on appropriate features

X['serum_creatinine_result'] = X['serum_creatinine'].apply(serum_creatinine)

X['serum_sodium_result'] = X['serum_sodium'].apply(serum_sodium)

X['creatinine_phosphokinase_result'] = X['creatinine_phosphokinase'].apply(cpk)

In [15]:
# creating dummy variables for our newly created categorical features

X = pd.get_dummies(X, columns = ['serum_creatinine_result', 'serum_sodium_result', 'creatinine_phosphokinase_result'], drop_first=True)

In [14]:
X.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,serum_creatinine_result_normal,serum_sodium_result_normal,creatinine_phosphokinase_result_normal,creatinine_phosphokinase_result_very_high
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,0,0,0,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1,1,0,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1,0,0,0
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,0,1,1,0
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,0,0,0,0


In [17]:
#perform Smote on X and y and reassign the new variables to x_balanced and ya_balanced
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
X_balanced, y_balanced = smote.fit_sample(X, y)
X_balanced = pd.DataFrame(X_balanced, columns = X.columns)

In [19]:
# we can see that the numbr of observations increased from 299 to 406
# and also both instances of 1 and 0 in y are now equal
print('the shape of X is: {}'.format(X.shape))
print('the shape of y is: {}'.format(y.shape))
print('the shape of X_balanced is: {}'.format(X_balanced.shape))
print('the shape of y_balanced is: {}'.format(pd.Series(y_balanced).shape))
print('the instances of 1 and 0 in DEATH_EVENT are:')
print(pd.Series(y_balanced).value_counts())

the shape of X is: (299, 16)
the shape of y is: (299,)
the shape of X_balanced is: (406, 16)
the shape of y_balanced is: (406,)
the instances of 1 and 0 in DEATH_EVENT are:
1    203
0    203
Name: DEATH_EVENT, dtype: int64


In [21]:
# I would perform MinMaxScaling using ColumnTransform to passthrough some columns
# Also i would perform GridSearchCV for the RandomForestClassifier
# The whole process would be encapsulated in an sklearn pipeline.

make_column_transformer = [2,4,6,7,8,11] # This variable specifies the columns that would be scaled, while the rest are passthrough

pipe5 = Pipeline([('col_transform', ColumnTransformer(remainder = 'passthrough',
                transformers = [('scaler',MinMaxScaler(),make_column_transformer)])),
                ('grid_search',GridSearchCV(RandomForestClassifier(),
                {'max_depth':range(1,15), 'n_estimators': [50, 100,300,500],
                'max_features': ['auto', 'sqrt', 'log'], 
                'min_samples_split': [2,3,5,7,9],'min_samples_leaf': [1,2,4,6,8]},
                cv = 3, n_jobs = 2, scoring = 'accuracy', verbose = 1))])

Xt5 = pipe5.named_steps['col_transform'].fit_transform(X_balanced) #fitting and transforming the smoted feature matrix to the transformer

x_train, x_test,y_train, y_test = train_test_split(Xt5, y_balanced, test_size = 0.2, random_state = 0)

pipe5.fit(x_train,y_train) # perfoming the GridSearchCV by calling the pipe fit method

Fitting 3 folds for each of 4200 candidates, totalling 12600 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   15.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   56.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  2.1min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:  4.9min
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:  6.6min
[Parallel(n_jobs=2)]: Done 2446 tasks      | elapsed:  9.1min
[Parallel(n_jobs=2)]: Done 3196 tasks      | elapsed: 11.5min
[Parallel(n_jobs=2)]: Done 4046 tasks      | elapsed: 14.2min
[Parallel(n_jobs=2)]: Done 4996 tasks      | elapsed: 17.3min
[Parallel(n_jobs=2)]: Done 6046 tasks      | elapsed: 20.8min
[Parallel(n_jobs=2)]: Done 7196 tasks      | elapsed: 24.1min
[Parallel(n_jobs=2)]: Done 8446 tasks      | elapsed: 29.0min
[Parallel(n_jobs=2)]: Done 9796 tasks      | elapsed: 33.9min
[Parallel(n_jobs=2)]: Done 11246 tasks      | elapsed: 39.6mi

Pipeline(steps=[('col_transform',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaler', MinMaxScaler(),
                                                  [2, 4, 6, 7, 8, 11])])),
                ('grid_search',
                 GridSearchCV(cv=3, estimator=RandomForestClassifier(),
                              n_jobs=2,
                              param_grid={'max_depth': range(1, 15),
                                          'max_features': ['auto', 'sqrt',
                                                           'log'],
                                          'min_samples_leaf': [1, 2, 4, 6, 8],
                                          'min_samples_split': [2, 3, 5, 7, 9],
                                          'n_estimators': [50, 100, 300, 500]},
                              scoring='accuracy', verbose=1))])

In [22]:
model5 = pipe5.named_steps['grid_search'].best_estimator_ #Training the RandomForestClassifier with the best parameters from the search
model5.fit(x_train, y_train)
y_pred5 = model5.predict(x_test)

In [23]:
from sklearn.metrics import classification_report, recall_score, accuracy_score,precision_score, f1_score
recall5 = recall_score(y_test,y_pred5)
accuracy5 = accuracy_score(y_test,y_pred5)
precision5 = precision_score(y_test,y_pred5)
f1score5 = f1_score(y_test,y_pred5)
print('the Recall for smoted RandomForest is:{}'.format(round(recall5,4)))
print(f'the Accuracy for smoted RandomForest is:{round(accuracy5,4)}')
print('the Precision for smoted RandomForest is: %s' %(round(precision5,4)))
print(f'the F1_score for smoted RandomForest is: {round(f1score5,4)}')

the Recall for smoted RandomForest is:0.9574
the Accuracy for smoted RandomForest is:0.939
the Precision for smoted RandomForest is: 0.9375
the F1_score for smoted RandomForest is: 0.9474


In [29]:
print(classification_report(y_test, y_pred5))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93        35
           1       0.94      0.96      0.95        47

    accuracy                           0.94        82
   macro avg       0.94      0.94      0.94        82
weighted avg       0.94      0.94      0.94        82



In [26]:
# To see the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred5))

[[32  3]
 [ 2 45]]


In [27]:
# saving the model to Disk
import dill
with open('RandomForest.dill','wb') as f:
    dill.dump(model5,f)

In [28]:
#checking if model is saved successfully on disk
!ls -alh RandomForest.dill

-rw-r--r-- 1 CHARLIE None 228K Nov  4 01:41 RandomForest.dill
