## Cardiovascular Diesase Prediction

Dataset source : https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset

## Importing Libraries

In [1]:
import pandas as pd   
import matplotlib.pyplot as plt
import plotly.express as px
import time
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

from sklearn.pipeline import Pipeline
from matplotlib import pyplot
import seaborn as sns


from numpy import mean
import warnings
warnings.filterwarnings("ignore") 

## Loading Dataset using Pandas

In [2]:
data = pd.read_csv("E:/cardio_disease.csv", sep=';')
data.shape

(70000, 13)

## Data Pre Processing

In [3]:
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Counting Number of distribution.
dictribution = data["cardio"].value_counts()
dictribution

0    35021
1    34979
Name: cardio, dtype: int64

In [5]:
# Checking Shape of the dataset.
print('Length of Dataset: ', data.shape[0])
print('Number of Attributes: ', data.shape[1])

Length of Dataset:  70000
Number of Attributes:  13


In [6]:
# Checking Null values in all features.
data.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [7]:
# Checking Data Types of every column.
data.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [8]:
# Dropping Unnecessary Column.
data.drop('id',axis=1,inplace=True)

## Splitting Data into Train-Test Split

In [9]:
X = data.loc[:, data.columns != 'cardio']     # All columns except target variable.
y = data[['cardio']]                          # Target Variable.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)

In [11]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(52500, 11)
(52500, 1)
(17500, 11)
(17500, 1)


## Machine Learning Modelling

### Gradient Boosting Classifier

In [15]:
gb = GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=400)
gb.fit(X_train,y_train.values.ravel())
y_pred = gb.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print("Confusion Matrix:")
print(cm)
print('------------------------------------')

AUC_Score = roc_auc_score(y_test,y_pred)
print("AUC Score:",AUC_Score)
print('------------------------------------')


accuracy = gb.score(X_test, y_test)
print("Accuracy of Random Forest:",accuracy*100,"%")
print('------------------------------------')


from sklearn.metrics import recall_score
recall = recall_score(y_test,y_pred)
print("Recall:",recall)
print('------------------------------------')

from sklearn.metrics import precision_score
precision = precision_score(y_test,y_pred)
print("Precision:",precision)
print('------------------------------------')

specificity = cm[0,0]/(cm[0,0]+cm[0,1])
print("Specificity:", specificity)
print('------------------------------------')

sensitivity = cm[1,1]/(cm[1,1]+cm[1,0])
print("Sensitivity:", sensitivity)

print('------------------------------------')
miss_rate = cm[0,1]/(cm[0,1]+cm[0,0])
print("Miss Rate (False Positive Rate):", miss_rate)

print('------------------------------------')
miss_rate_FNR = cm[1,0]/(cm[1,0]+cm[1,1])
print("Miss Rate (False Negative Rate):", miss_rate_FNR)

print(classification_report(y_test,y_pred))

Confusion Matrix:
[[6866 1894]
 [2768 5972]]
------------------------------------
AUC Score: 0.7335425744229543
------------------------------------
Accuracy of Random Forest: 73.36 %
------------------------------------
Recall: 0.6832951945080091
------------------------------------
Precision: 0.7592168827866769
------------------------------------
Specificity: 0.7837899543378996
------------------------------------
Sensitivity: 0.6832951945080091
------------------------------------
Miss Rate (False Positive Rate): 0.21621004566210045
------------------------------------
Miss Rate (False Negative Rate): 0.31670480549199087
              precision    recall  f1-score   support

           0       0.71      0.78      0.75      8760
           1       0.76      0.68      0.72      8740

    accuracy                           0.73     17500
   macro avg       0.74      0.73      0.73     17500
weighted avg       0.74      0.73      0.73     17500



### Naive Bayes Classifier

In [19]:
NB = GaussianNB()
NB.fit(X_train,y_train.values.ravel())
y_pred = NB.predict(X_test)

In [26]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print("Confusion Matrix:")
print(cm)
print('------------------------------------')

AUC_Score = roc_auc_score(y_test,y_pred)
print("AUC Score:",AUC_Score)
print('------------------------------------')


accuracy = NB.score(X_test, y_test)
print("Accuracy of Naive Bayes:",accuracy*100,"%")
print('------------------------------------')


from sklearn.metrics import recall_score
recall = recall_score(y_test,y_pred)
print("Recall:",recall)
print('------------------------------------')

from sklearn.metrics import precision_score
precision = precision_score(y_test,y_pred)
print("Precision:",precision)
print('------------------------------------')

specificity = cm[0,0]/(cm[0,0]+cm[0,1])
print("Specificity:", specificity)
print('------------------------------------')

sensitivity = cm[1,1]/(cm[1,1]+cm[1,0])
print("Sensitivity:", sensitivity)

print('------------------------------------')
miss_rate = cm[0,1]/(cm[0,1]+cm[0,0])
print("Miss Rate (False Positive Rate):", miss_rate)

print('------------------------------------')
miss_rate_FNR = cm[1,0]/(cm[1,0]+cm[1,1])
print("Miss Rate (False Negative Rate):", miss_rate_FNR)

from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

Confusion Matrix:
[[6126 2634]
 [2921 5819]]
------------------------------------
AUC Score: 0.6825522710886806
------------------------------------
Accuracy of Naive Bayes: 58.502857142857145 %
------------------------------------
Recall: 0.6657894736842105
------------------------------------
Precision: 0.6883946527859931
------------------------------------
Specificity: 0.6993150684931507
------------------------------------
Sensitivity: 0.6657894736842105
------------------------------------
Miss Rate (False Positive Rate): 0.30068493150684933
------------------------------------
Miss Rate (False Negative Rate): 0.33421052631578946
              precision    recall  f1-score   support

           0       0.68      0.70      0.69      8760
           1       0.69      0.67      0.68      8740

    accuracy                           0.68     17500
   macro avg       0.68      0.68      0.68     17500
weighted avg       0.68      0.68      0.68     17500



### K- Nearest Neighbor

In [22]:
#Note: .values.ravel( ) is used to eliminate dataconversion warning.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train.values.ravel())
y_pred = knn.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
print("Confusion Matrix:")
print(cm)
print('------------------------------------')

AUC_Score = roc_auc_score(y_test,y_pred)
print("AUC Score:",AUC_Score)
print('------------------------------------')


accuracy = knn.score(X_test, y_test)
print("Accuracy of KNN Classifier:",accuracy*100,"%")
print('------------------------------------')


from sklearn.metrics import recall_score
recall = recall_score(y_test,y_pred)
print("Recall:",recall)
print('------------------------------------')

from sklearn.metrics import precision_score
precision = precision_score(y_test,y_pred)
print("Precision:",precision)
print('------------------------------------')

specificity = cm[0,0]/(cm[0,0]+cm[0,1])
print("Specificity:", specificity)
print('------------------------------------')

sensitivity = cm[1,1]/(cm[1,1]+cm[1,0])
print("Sensitivity:", sensitivity)

print('------------------------------------')
miss_rate = cm[0,1]/(cm[0,1]+cm[0,0])
print("Miss Rate (False Positive Rate):", miss_rate)

print('------------------------------------')
miss_rate_FNR = cm[1,0]/(cm[1,0]+cm[1,1])
print("Miss Rate (False Negative Rate):", miss_rate_FNR)

from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

Confusion Matrix:
[[6126 2634]
 [2921 5819]]
------------------------------------
AUC Score: 0.6825522710886806
------------------------------------
Accuracy of KNN Classifier: 68.25714285714287 %
------------------------------------
Recall: 0.6657894736842105
------------------------------------
Precision: 0.6883946527859931
------------------------------------
Specificity: 0.6993150684931507
------------------------------------
Sensitivity: 0.6657894736842105
------------------------------------
Miss Rate (False Positive Rate): 0.30068493150684933
------------------------------------
Miss Rate (False Negative Rate): 0.33421052631578946
              precision    recall  f1-score   support

           0       0.68      0.70      0.69      8760
           1       0.69      0.67      0.68      8740

    accuracy                           0.68     17500
   macro avg       0.68      0.68      0.68     17500
weighted avg       0.68      0.68      0.68     17500

