In [None]:
# Mount Google Drive at /content/drive (Colab-specific import) so files stored
# on Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset source (UCI Machine Learning Repository, Heart failure clinical records): https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records

In [None]:
# Directory on the mounted Drive that holds the dataset CSV. The trailing slash
# matters: the file name is appended to it with plain string concatenation.
path = '/content/drive/MyDrive/Knowledge Discovery/assignment/'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.feature_selection import chi2
from mlxtend.feature_selection import SequentialFeatureSelector, ExhaustiveFeatureSelector



# Load the Data

In [None]:
# Read the heart-failure clinical records CSV from the Drive directory into a DataFrame.
data = pd.read_csv(path + 'heart_failure_clinical_records_dataset.csv')

In [None]:
data

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [None]:
data.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


The description of the features of the dataset is as follows: (https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5/tables/1)

Feature | Explanation
--------|--------------
Age | The age of the patient
Anaemia | The decrease of red blood cells or hemoglobin 
High blood pressure | Whether a patient has hypertension
Creatinine phosphokinase | Level of CPK enzyme in the blood
Diabetes | Whether a patient has diabetes
Ejection fraction | Percentage of blood leaving the heart at each contraction
Sex | Whether a person is female or male
Platelets | Platelets in the blood
Serum creatinine | The level of creatinine in the blood
Serum sodium | The level of sodium in the blood
Smoking | Whether the patient smokes
Time | The follow-up period
Death event | The target value explaining whether the patient died during the follow-up period

For categorical features, the values have the following meaning: (https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5/tables/2)

Feature |  Explanation
--------|-------------
Anaemia | 0 -> False, 1 -> True
High blood pressure | 0 -> False, 1 -> True
Diabetes | 0 -> False, 1 -> True
Sex | 0 -> woman, 1 -> man
Smoking | 0 -> False, 1 -> True

# Data Imputation

In [None]:
# Columns that get standardized; every other column is left unscaled.
continuous_features = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

In [None]:
data.isna().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

Since none of the features have any missing data, there is no need to perform data imputation.

# Standardize the continuous features, then split the data into input and target

In [None]:
# Work on a copy so the raw `data` frame is not modified by the scaling step.
scaled_data = data.copy()
# Scaler instance used to standardize the continuous features.
scaler = StandardScaler()

In [None]:
# Standardize the continuous features in a single fit/transform call.
# StandardScaler scales each column independently, so the values match a
# per-column loop, but fitting once leaves `scaler` holding the statistics of
# every continuous feature (not just the last one visited) and avoids assigning
# (n, 1)-shaped arrays to individual DataFrame columns.
# NOTE(review): the scaler is fitted on the full dataset before cross
# validation, so test folds contribute to the scaling statistics.
scaled_data[continuous_features] = scaler.fit_transform(data[continuous_features])

In [None]:
# Separate predictors from the label: X keeps every column except DEATH_EVENT,
# Y is the DEATH_EVENT column itself.
X = scaled_data.drop(columns=['DEATH_EVENT'])
Y = scaled_data['DEATH_EVENT']

In [None]:
# Evaluate a classifier with shuffled 10-fold cross validation.
def perform_ten_fold_cv(model, x_data, y_data):
  """Run 10-fold cross validation and collect per-fold classification metrics.

  Args:
    model: Estimator exposing fit(X, y) and predict(X); fit is called once per
      fold on this same object.
    x_data: pandas DataFrame of input features (rows are selected with .iloc).
    y_data: pandas Series of target labels, row-aligned with x_data.

  Returns:
    dict with keys 'acc', 'pre', 'rec' and 'f1'. Each maps to a list holding
    one score per fold: accuracy, precision, recall and F1 respectively.
  """
  # Fixed seed so every call splits the rows into the same ten folds.
  cv = KFold(n_splits=10, random_state=42, shuffle=True)

  metrics = {'acc': [],
             'pre': [],
             'rec': [],
             'f1': []}

  for train_indices, test_indices in cv.split(x_data):
    x_train = x_data.iloc[train_indices]
    y_train = y_data.iloc[train_indices]

    x_test = x_data.iloc[test_indices]
    y_test = y_data.iloc[test_indices]

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    metrics['acc'].append(accuracy_score(y_test, y_pred))
    # Keys must match the labels printed by print_metrics:
    # 'pre' -> precision, 'rec' -> recall.
    metrics['pre'].append(precision_score(y_test, y_pred))
    metrics['rec'].append(recall_score(y_test, y_pred))
    metrics['f1'].append(f1_score(y_test, y_pred))

  return metrics

# Print the mean of each cross-validation metric, one per line.
def print_metrics(metrics):
  """Print the average accuracy, precision, recall and F1 score.

  Args:
    metrics: dict with keys 'acc', 'pre', 'rec' and 'f1', each mapping to a
      sequence of per-fold scores.
  """
  labelled_keys = (('Accuracy:', 'acc'),
                   ('Precision:', 'pre'),
                   ('Recall:', 'rec'),
                   ('F1 Score:', 'f1'))
  for label, key in labelled_keys:
    print(label, np.mean(metrics[key]))

# Model prediction without any feature selection with 10 fold cross validation

## Naive Bayes

In [None]:
nb = GaussianNB()
nb_metrics = perform_ten_fold_cv(nb, X, Y)

In [None]:
print_metrics(nb_metrics)

Accuracy: 0.7663218390804598
Precision: 0.47068681318681316
Recall: 0.731547619047619
F1 Score: 0.5528505756369533


## Logistic Regression

In [None]:
lr = LogisticRegression()
lr_metrics = perform_ten_fold_cv(lr, X, Y)

In [None]:
print_metrics(lr_metrics)

Accuracy: 0.8228735632183909
Precision: 0.6813064713064713
Recall: 0.746051171051171
F1 Score: 0.6896440047590943


## Decision Tree

In [None]:
# Seed the tree: scikit-learn permutes features randomly at each split, so an
# unseeded tree can give different metrics on every run.
dt = DecisionTreeClassifier(random_state=42)
dt_metrics = perform_ten_fold_cv(dt, X, Y)

In [None]:
print_metrics(dt_metrics)

Accuracy: 0.789655172413793
Precision: 0.6835653235653236
Recall: 0.6874891774891775
F1 Score: 0.666005737655354


## Random Forest

In [None]:
# Seed the forest (bootstrap sampling and feature sub-sampling are random) so
# the cross-validated metrics are reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf_metrics = perform_ten_fold_cv(rf, X, Y)

In [None]:
print_metrics(rf_metrics)

Accuracy: 0.853103448275862
Precision: 0.7469200244200244
Recall: 0.7922766122766123
F1 Score: 0.7535646053293112


## Support Vector Classifier

In [None]:
svc = SVC()
svc_metrics = perform_ten_fold_cv(svc, X, Y)

In [None]:
print_metrics(svc_metrics)

Accuracy: 0.8163218390804599
Precision: 0.6759340659340658
Recall: 0.7389971139971141
F1 Score: 0.6897125719436933


# Model prediction with Exhaustive Feature Selection with 10 fold cross validation

In [None]:
def perform_exhaustive_feature_selection(model, x_data, y_data, scoring='accuracy'):
  """Pick the best feature subset with mlxtend's ExhaustiveFeatureSelector.

  Args:
    model: Estimator handed to the selector.
    x_data: pandas DataFrame of candidate features; subsets of up to all of its
      columns are considered.
    y_data: Target labels.
    scoring: Name of the metric the selector optimizes. Defaults to 'accuracy',
      the value that was previously hard-coded.

  Returns:
    list of the column names in the best-scoring subset.
  """
  # NOTE(review): selection is fitted on the full dataset; if the chosen subset
  # is then cross-validated on the same rows, the scores may be optimistic.
  efs = ExhaustiveFeatureSelector(model, max_features=len(x_data.columns), scoring=scoring)
  efs.fit(x_data, y_data)
  selected_features = list(x_data.columns[list(efs.best_idx_)])
  # Report the metric that was actually optimized rather than a fixed "F1" label.
  print('\nBest features using Exhaustive Feature Selection with {} score:'.format(scoring))
  print(selected_features)
  return selected_features

## Naive Bayes

In [None]:
nb = GaussianNB()

# perform feature selection
nb_selected_features = perform_exhaustive_feature_selection(nb, X, Y)

Features: 4095/4095


Best features using Exhaustive Feature Selection with F1 score:
['age', 'ejection_fraction', 'high_blood_pressure', 'serum_sodium', 'smoking', 'time']


In [None]:
# apply the model
nb_metrics = perform_ten_fold_cv(nb, X[nb_selected_features], Y)

In [None]:
print_metrics(nb_metrics)

Accuracy: 0.829655172413793
Precision: 0.6828266178266178
Recall: 0.7523737373737374
F1 Score: 0.7057040988505519


## Logistic Regression

In [None]:
lr = LogisticRegression()
lr_selected_features = perform_exhaustive_feature_selection(lr, X, Y)

Features: 4095/4095


Best features using Exhaustive Feature Selection with F1 score:
['age', 'creatinine_phosphokinase', 'ejection_fraction', 'serum_creatinine', 'serum_sodium', 'sex', 'time']


In [None]:
# apply the model
lr_metrics = perform_ten_fold_cv(lr, X[lr_selected_features], Y)

In [None]:
print_metrics(lr_metrics)

Accuracy: 0.829655172413793
Precision: 0.694447496947497
Recall: 0.7622222222222221
F1 Score: 0.7046015876706414


## Decision Tree

In [None]:
# Seed the tree so both the selected feature subset and the later metrics are
# reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt_selected_features = perform_exhaustive_feature_selection(dt, X, Y)

Features: 4095/4095


Best features using Exhaustive Feature Selection with F1 score:
['diabetes', 'ejection_fraction', 'serum_creatinine', 'smoking']


In [None]:
# apply the model
dt_metrics = perform_ten_fold_cv(dt, X[dt_selected_features], Y)

In [None]:
print_metrics(dt_metrics)

Accuracy: 0.7528735632183908
Precision: 0.6167979242979242
Recall: 0.6303654678654678
F1 Score: 0.6032797251058122


## Random Forest

In [None]:
# Seed the forest so both the selected feature subset and the later metrics are
# reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf_selected_features = perform_exhaustive_feature_selection(rf, X, Y)

Features: 4095/4095


Best features using Exhaustive Feature Selection with F1 score:
['creatinine_phosphokinase', 'diabetes', 'serum_creatinine', 'serum_sodium', 'sex']


In [None]:
rf_metrics = perform_ten_fold_cv(rf, X[rf_selected_features], Y)

In [None]:
print_metrics(rf_metrics)

Accuracy: 0.7562068965517241
Precision: 0.4968315018315018
Recall: 0.695595238095238
F1 Score: 0.5576920087136805


## Support Vector Classifier

In [None]:
svc = SVC()
svc_selected_features = perform_exhaustive_feature_selection(svc, X, Y)

Features: 4095/4095


Best features using Exhaustive Feature Selection with F1 score:
['anaemia', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'serum_creatinine', 'smoking', 'time']


In [None]:
svc_metrics = perform_ten_fold_cv(svc, X[svc_selected_features], Y)

In [None]:
print_metrics(svc_metrics)

Accuracy: 0.8364367816091954
Precision: 0.7401343101343102
Recall: 0.7512626262626263
F1 Score: 0.7315795315795315


# Model prediction with Backward Feature Elimination with 10 fold cross validation

In [None]:
def perform_sequential_feature_selector(model, x_data, y_data, scoring='accuracy'):
  """Pick features by backward elimination with mlxtend's SequentialFeatureSelector.

  Args:
    model: Estimator used to score candidate subsets. The selector is built
      from this argument, not from any notebook-level variable.
    x_data: pandas DataFrame of candidate features (converted to a NumPy array
      before fitting).
    y_data: pandas Series of target labels (converted to a NumPy array before
      fitting).
    scoring: Name of the metric the selector optimizes. Defaults to 'accuracy',
      the value that was previously hard-coded.

  Returns:
    list of the column names kept by the selector.
  """
  # forward=False, floating=False -> plain sequential backward selection;
  # k_features='best' keeps the subset size with the best score.
  sbs = SequentialFeatureSelector(model, k_features='best', forward=False, floating=False, scoring=scoring)
  sbs.fit(x_data.to_numpy(), y_data.to_numpy())

  selected_features = list(x_data.columns[list(sbs.k_feature_idx_)])
  # Name the method and metric that were actually used.
  print('\nBest features using Backward Feature Elimination with {} score:'.format(scoring))
  print(selected_features)
  return selected_features

## Naive Bayes

In [None]:
nb = GaussianNB()
nb_sf = perform_sequential_feature_selector(nb, X, Y)


Best features using Exhaustive Feature Selection with F1 score:
['age', 'ejection_fraction', 'time']


In [None]:
nb_metrics = perform_ten_fold_cv(nb, X[nb_sf], Y)

In [None]:
print_metrics(nb_metrics)

Accuracy: 0.8463218390804599
Precision: 0.7083302808302808
Recall: 0.7804473304473305
F1 Score: 0.7346872171814962


## Logistic Regression

In [None]:
lr = LogisticRegression()
lr_sf = perform_sequential_feature_selector(lr, X, Y)


Best features using Exhaustive Feature Selection with F1 score:
['age', 'ejection_fraction', 'time']


In [None]:
lr_metrics = perform_ten_fold_cv(lr, X[lr_sf], Y)

In [None]:
print_metrics(lr_metrics)

Accuracy: 0.8262068965517242
Precision: 0.655399877899878
Recall: 0.7436075036075036
F1 Score: 0.6895198883242362


## Decision Tree

In [None]:
# Seeded for reproducibility; the selector must be given the decision tree
# itself, not the logistic regression model from the previous section.
dt = DecisionTreeClassifier(random_state=42)
dt_sf = perform_sequential_feature_selector(dt, X, Y)


Best features using Exhaustive Feature Selection with F1 score:
['age', 'ejection_fraction', 'time']


In [None]:
dt_metrics = perform_ten_fold_cv(dt, X[dt_sf], Y)

In [None]:
print_metrics(dt_metrics)

Accuracy: 0.7995402298850575
Precision: 0.7306043956043956
Recall: 0.6689998889998889
F1 Score: 0.6867975437314514


## Random Forest

In [None]:
# Seed the forest so both the selected feature subset and the later metrics are
# reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf_sf = perform_sequential_feature_selector(rf, X, Y)


Best features using Exhaustive Feature Selection with F1 score:
['age', 'ejection_fraction', 'time']


In [None]:
rf_metrics = perform_ten_fold_cv(rf, X[rf_sf], Y)

In [None]:
print_metrics(rf_metrics)

Accuracy: 0.8364367816091954
Precision: 0.7266422466422466
Recall: 0.7666883116883116
F1 Score: 0.73310088888094


## Support Vector Classifier

In [None]:
svc = SVC()
svc_sf = perform_sequential_feature_selector(svc, X, Y)


Best features using Exhaustive Feature Selection with F1 score:
['age', 'ejection_fraction', 'time']


In [None]:
svc_metrics = perform_ten_fold_cv(svc, X[svc_sf], Y)

In [None]:
print_metrics(svc_metrics)

Accuracy: 0.8429885057471264
Precision: 0.6846123321123321
Recall: 0.7878066378066378
F1 Score: 0.7219116632160111
