In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame,read_csv
from collections import Counter
from sklearn.utils import shuffle

In [2]:
# Read the heart-failure records from the CSV file and preview the first rows.
df = pd.read_csv('heart_failure.csv')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [3]:
# Turn the 0/1 indicator columns into boolean flags and spell out `sex` as a text label
# (1 -> 'Male', anything else -> 'Female').
for flag_col in ('anaemia', 'diabetes', 'high_blood_pressure', 'smoking'):
    df[flag_col] = df[flag_col] == 1
df['sex'] = np.where(df['sex'] == 1, 'Male', 'Female')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75,False,582,False,20,True,265000.0,1.9,130,Male,False,4,1
1,55,False,7861,False,38,False,263358.03,1.1,136,Male,False,6,1
2,65,False,146,False,20,False,162000.0,1.3,129,Male,True,7,1
3,50,True,111,False,20,False,210000.0,1.9,137,Male,False,7,1
4,65,True,160,True,20,False,327000.0,2.7,116,Female,False,8,1


**Feature engineering based on the actual medical values expected from each feature**

In [4]:
def set_cpk(row):
    """Label a row's creatinine phosphokinase (CPK) reading.

    The reference range used is 10 to 120 micrograms per liter, both ends inclusive.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['creatinine_phosphokinase']`` (e.g. a DataFrame row).

    Returns
    -------
    str
        'Normal' when the reading lies inside the range, 'Abnormal' otherwise.
    """
    cpk = row['creatinine_phosphokinase']
    return 'Normal' if 10 <= cpk <= 120 else 'Abnormal'
# Classify every row with set_cpk and attach the labels as the `cp_desc` column.
cp_labels = df.apply(set_cpk, axis='columns')
df = df.assign(cp_desc=cp_labels)


In [5]:
# Count how many rows fall in each CPK category.
df['cp_desc'].value_counts()

Abnormal    222
Normal       77
Name: cp_desc, dtype: int64

In [6]:
def set_eject_fraction(row):
    """Label a row's ejection fraction reading.

    A value strictly above 50 and at most 75 counts as normal.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['ejection_fraction']``.

    Returns
    -------
    str
        'Normal' for values in (50, 75], 'Abnormal' otherwise.
    """
    fraction = row['ejection_fraction']
    if 50 < fraction <= 75:
        return 'Normal'
    return 'Abnormal'
# Store the per-row ejection fraction label in a new column.
df['ejection_fraction_desc'] = df.apply(set_eject_fraction, axis='columns')

In [7]:
def set_platelets(row):
    """Label a row's platelet count using sex-specific reference ranges.

    Ranges (inclusive): 157000-371000 for 'Female', 135000-317000 for 'Male'.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['sex']`` and ``row['platelets']``.

    Returns
    -------
    str or None
        'Normal' or 'Abnormal'; None when ``row['sex']`` is neither
        'Female' nor 'Male'.
    """
    limits_by_sex = {
        'Female': (157000, 371000),
        'Male': (135000, 317000),
    }
    limits = limits_by_sex.get(row['sex'])
    if limits is None:
        # Unrecognised sex label: no classification is produced.
        return None
    low, high = limits
    return 'Normal' if low <= row['platelets'] <= high else 'Abnormal'
# Store the per-row platelet label in a new column.
df['platelets_desc'] = df.apply(set_platelets, axis='columns')

In [8]:
def set_sodium(row):
    """Label a row's serum sodium reading.

    Values from 135 to 145 (both inclusive) count as normal.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['serum_sodium']``.

    Returns
    -------
    str
        'Normal' for values in [135, 145], 'Abnormal' otherwise.
    """
    sodium = row['serum_sodium']
    return 'Normal' if 135 <= sodium <= 145 else 'Abnormal'
# Store the per-row serum sodium label in a new column.
df['sodium_desc'] = df.apply(set_sodium, axis='columns')

In [9]:
def set_creatinine(row):
    """Label a row's serum creatinine reading using sex-specific reference ranges.

    Ranges (inclusive): 0.5-1.1 for 'Female', 0.6-1.2 for 'Male'.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['sex']`` and ``row['serum_creatinine']``.

    Returns
    -------
    str or None
        'Normal' or 'Abnormal'; None when ``row['sex']`` is neither
        'Female' nor 'Male'.
    """
    limits_by_sex = {
        'Female': (0.5, 1.1),
        'Male': (0.6, 1.2),
    }
    limits = limits_by_sex.get(row['sex'])
    if limits is None:
        # Unrecognised sex label: no classification is produced.
        return None
    low, high = limits
    return 'Normal' if low <= row['serum_creatinine'] <= high else 'Abnormal'
# Store the per-row serum creatinine label in a new column.
df['serum_creatinine_desc'] = df.apply(set_creatinine, axis='columns')

In [10]:
# Print the column names, non-null counts and dtypes of the frame after feature engineering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    int64  
 1   anaemia                   299 non-null    bool   
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    bool   
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    bool   
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    object 
 10  smoking                   299 non-null    bool   
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
 13  cp_desc                   299 non-null    object 
 14  ejection_f

In [11]:
# Separate the DEATH_EVENT target from the predictor columns.
y = df['DEATH_EVENT']
x = df.drop(columns='DEATH_EVENT')
x.head(2)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,cp_desc,ejection_fraction_desc,platelets_desc,sodium_desc,serum_creatinine_desc
0,75,False,582,False,20,True,265000.0,1.9,130,Male,False,4,Abnormal,Abnormal,Normal,Abnormal,Abnormal
1,55,False,7861,False,38,False,263358.03,1.1,136,Male,False,6,Abnormal,Abnormal,Normal,Normal,Normal


There is an imbalance between the target classes, so we use SMOTENC, because SMOTENC lets you specify which variables are categorical when generating the new data points.

In [12]:
import imblearn
from imblearn.over_sampling import SMOTENC

# Oversample the minority class with SMOTENC so both DEATH_EVENT classes have the same count.
# `categorical_features` holds positional column indices of `x`; these columns are treated as
# categories rather than interpolated when synthetic rows are generated.
# NOTE(review): index 0 is `age`, so age is handled as a categorical value here - confirm intended.
# NOTE(review): oversampling happens before the train/test split, so synthetic rows derived from
# rows that later land in the test set can end up in the training set - consider resampling the
# training split only.
smote = SMOTENC(random_state=1, categorical_features=[0, 1, 3, 5, 9, 10, 12, 13, 14, 15, 16])

# `fit_sample` was a deprecated alias that imbalanced-learn 0.8 removed; `fit_resample` is the
# supported method and returns the same (features, target) pair.
x_bal, y_bal = smote.fit_resample(x, y)

# Re-wrap so the column labels are present even if the resampler returns a plain array.
x_bal = DataFrame(x_bal, columns=x.columns)
print(Series(y_bal).value_counts())
x_bal['serum_creatinine_desc'].value_counts()



1    203
0    203
dtype: int64


Normal      218
Abnormal    188
Name: serum_creatinine_desc, dtype: int64

Here we create dummy variables for `sex` and the newly engineered features.
Remember that Python recognises True and False as 1 and 0, so there is no need to encode the boolean columns.

In [13]:
# One-hot encode the string-valued columns; drop_first=True drops the first level of each
# encoded column, leaving one indicator per two-level column.
encode = [
    'sex',
    'cp_desc',
    'ejection_fraction_desc',
    'platelets_desc',
    'sodium_desc',
    'serum_creatinine_desc',
]
x_bal = pd.get_dummies(data=x_bal, columns=encode, drop_first=True)
print(x_bal.shape)
x_bal.head()

(406, 17)


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,smoking,time,sex_Male,cp_desc_Normal,ejection_fraction_desc_Normal,platelets_desc_Normal,sodium_desc_Normal,serum_creatinine_desc_Normal
0,75,False,582,False,20,True,265000,1.9,130,False,4,1,0,0,1,0,0
1,55,False,7861,False,38,False,263358,1.1,136,False,6,1,0,0,1,1,1
2,65,False,146,False,20,False,162000,1.3,129,True,7,1,0,0,1,0,0
3,50,True,111,False,20,False,210000,1.9,137,False,7,1,1,0,1,1,0
4,65,True,160,True,20,False,327000,2.7,116,False,8,0,0,0,1,0,0


In [14]:
# This separation exists only for scaling: the categorical indicator columns should not be
# turned into decimals (1 -> 1.0, 0 -> 0.0), so the numeric columns are split off, scaled on
# their own, and later concatenated back with the indicator columns.
numeric_cols = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]
data1 = x_bal[numeric_cols]
data2 = x_bal.drop(columns=numeric_cols)

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the numeric columns and rebuild a labelled frame from the scaled array.
# NOTE(review): the scaler is fit on the full resampled data before the train/test split, so
# the test rows contribute to the min/max used for scaling.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data1)
data = DataFrame(scaled_values, columns=data1.columns)
data.tail()

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,time
401,0.636364,0.007003,0.410735,0.410348,0.30594,0.53649,0.052696
402,0.581818,0.013242,0.189294,0.259414,0.076316,0.686091,0.228345
403,0.363636,0.026458,0.046115,0.227042,0.099072,0.651304,0.229445
404,0.836364,0.071319,0.358661,0.288833,0.149438,0.6,0.33053
405,0.636364,0.018502,0.728963,0.287749,0.120774,0.695946,0.034677


In [16]:
# Put the scaled numeric columns and the untouched indicator columns back side by side.
x = pd.concat([data, data2], axis='columns')
x.head()

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,time,anaemia,diabetes,high_blood_pressure,smoking,sex_Male,cp_desc_Normal,ejection_fraction_desc_Normal,platelets_desc_Normal,sodium_desc_Normal,serum_creatinine_desc_Normal
0,0.636364,0.071319,0.090909,0.290823,0.157303,0.485714,0.0,False,False,True,False,1,0,0,1,0,0
1,0.272727,1.0,0.363636,0.288833,0.067416,0.657143,0.007117,False,False,False,False,1,0,0,1,1,1
2,0.454545,0.015693,0.090909,0.16596,0.089888,0.457143,0.010676,False,False,False,True,1,0,0,1,0,0
3,0.181818,0.011227,0.090909,0.224148,0.157303,0.685714,0.010676,True,False,False,False,1,1,0,1,1,0
4,0.454545,0.017479,0.090909,0.365984,0.247191,0.085714,0.014235,True,True,False,False,0,0,0,1,0,0


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_bal,
    test_size=0.2,
    random_state=0,
)

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
#grid = GridSearchCV(RandomForestClassifier(),{'max_depth':range(1,15), 'n_estimators': [50, 100,300,500],
 #                    'max_features': ['auto', 'sqrt', 'log2'], 
  #                   'min_samples_split': [2,3,5,7,9],'min_samples_leaf': [1,2,4,6,8]},
   #                   cv = 3, n_jobs = 2, scoring = 'accuracy', verbose = 1)

#grid.fit(x_train,y_train)

In [19]:
# The grid search is commented out because its optimum parameters have already been determined
# with grid.best_params_; those optimum parameters are the ones used to train the model below.
#grid.best_params_
#model = grid.best_estimator_
#model.fit(x_train,y_train).predict(x_test)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with fixed hyper-parameters (per the notebook's note, these came from
# an earlier grid search); random_state pins the forest so the fit is repeatable.
rf_params = dict(
    n_estimators=100,
    max_depth=7,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=4,
)
classifier = RandomForestClassifier(**rf_params)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=4, verbose=0,
                       warm_start=False)

In [21]:
# Predict the DEATH_EVENT class for every row of the held-out test split.
y_pred = classifier.predict(x_test)

In [22]:
from sklearn.metrics import classification_report, recall_score, accuracy_score,precision_score, f1_score, confusion_matrix

# Score the hold-out predictions and print each metric rounded to four decimals.
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1score = f1_score(y_test, y_pred)

print(f'the Recall for smoted RandomForest is:{round(recall, 4)}')
print(f'the Accuracy for smoted RandomForest is:{round(accuracy, 4)}')
print(f'the Precision for smoted RandomForest is: {round(precision, 4)}')
print(f'the F1_score for smoted RandomForest is:{round(f1score, 4)}')

the Recall for smoted RandomForest is:0.9787
the Accuracy for smoted RandomForest is:0.9268
the Precision for smoted RandomForest is: 0.902
the F1_score for smoted RandomForest is:0.9388


In [23]:
# Per-class precision, recall, F1 and support for the hold-out predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.86      0.91        35
           1       0.90      0.98      0.94        47

    accuracy                           0.93        82
   macro avg       0.93      0.92      0.92        82
weighted avg       0.93      0.93      0.93        82



In [24]:
# Confusion matrix of the hold-out predictions (rows: true class, columns: predicted class).
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[30  5]
 [ 1 46]]


In [28]:
import pickle

# Persist the trained classifier to disk. The context manager guarantees the file handle is
# flushed and closed even if pickling raises, instead of leaving an open handle behind.
with open('heart_main.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)