In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import shapiro

import warnings
warnings.filterwarnings("ignore")

# Display all rows and columns of a dataframe instead of a truncated version
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
df = pd.read_csv('../Data/new_data.csv')
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50,2,168,62,110,80,1,1,0,0,1,0,21
1,55,1,156,85,140,90,3,1,0,0,1,1,34
2,51,1,165,64,130,70,3,1,0,0,0,1,23
3,48,2,169,82,150,100,1,1,0,0,1,1,28
4,47,1,156,56,100,60,1,1,0,0,0,0,23


### Checking if the dataset is Guassian or not

In [3]:
# perform the Shapiro-Wilk test
stat, p = shapiro(df)

# interpret the results
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

Sample does not look Gaussian (reject H0)


In [4]:
# # loop through each variable in the dataset
# for col in df.columns:
#     if df[col].dtype == 'int64':  # check if the variable is integer type
#         # plot a histogram of the variable
#         plt.figure(figsize=(10, 5))
#         plt.hist(df[col], bins=20)
#         plt.title(col + ' Histogram')
#         plt.show()
#         # plot a Q-Q plot of the variable
#         plt.figure(figsize=(10, 5))
#         stats.probplot(df[col], dist="norm", plot=plt)
#         plt.title(col + ' Q-Q Plot')
#         plt.show()

----
## Machine Learning

### Train | Test Split

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix

In [6]:
X = df.drop('cardio', axis=1)
y = df['cardio']

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data using standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scale the data using normalization
normalizer = MinMaxScaler()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

### Naive Bayes Classifier

In [8]:
# Fit and evaluate Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
acc_gnb = accuracy_score(y_test, y_pred_gnb)
print('Accuracy of Naive Bayes:', acc_gnb)

Accuracy of Naive Bayes: 0.6000714285714286


In [18]:
scores = cross_val_score(gnb, X_train, y_train, cv=5, scoring='accuracy')

# Report the results
print(f"Accuracy: {scores.mean():.2%} (+/- {scores.std() * 2:.2%})")

Accuracy: 60.35% (+/- 1.71%)


In [9]:
cm_gnb = confusion_matrix(y_test, y_pred_gnb)
print('Confusion matrix of Naive Bayes:\n', cm_gnb)

Confusion matrix of Naive Bayes:
 [[5982 1006]
 [4593 2419]]


### Random Forest classifier

In [10]:
# Fit and evaluate Random Forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_norm, y_train)
y_pred_rf = rf.predict(X_test_norm)
acc_rf = accuracy_score(y_test, y_pred_rf)
print('Accuracy of Random Forest:', acc_rf)

Accuracy of Random Forest: 0.7101428571428572


In [20]:
scores = cross_val_score(rf, X_train_norm, y_train, cv=5, scoring='accuracy')

# Report the results
print(f"Accuracy: {scores.mean():.2%} (+/- {scores.std() * 2:.2%})")

Accuracy: 70.78% (+/- 0.86%)


In [11]:
cm_rf = confusion_matrix(y_test, y_pred_rf)
print('Confusion matrix of Random Forest:\n', cm_rf)

Confusion matrix of Random Forest:
 [[4970 2018]
 [2040 4972]]


**This means that the model correctly predicted 4970 instances of the negative class (no cardiovascular disease) and 4972 instances of the positive class (cardiovascular disease). However, it incorrectly predicted 2018 instances as positive (false positive) and 2040 instances as negative (false negative).**

In [15]:
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71      6988
           1       0.71      0.71      0.71      7012

    accuracy                           0.71     14000
   macro avg       0.71      0.71      0.71     14000
weighted avg       0.71      0.71      0.71     14000



### K-Nearest Neighbors classifier

In [12]:
# Fit and evaluate K-Nearest Neighbors classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_norm, y_train)
y_pred_knn = knn.predict(X_test_norm)
acc_knn = accuracy_score(y_test, y_pred_knn)
print('Accuracy of K-Nearest Neighbors:', acc_knn)

Accuracy of K-Nearest Neighbors: 0.6071428571428571


In [13]:
cm_knn = confusion_matrix(y_test, y_pred_knn)
print('Confusion matrix of K-Nearest Neighbors:\n', cm_knn)

Confusion matrix of K-Nearest Neighbors:
 [[4238 2750]
 [2750 4262]]
