In [1]:
 # Mounting Google Drive for access to files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing Libraries

In [2]:
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations on arrays
import matplotlib.pyplot as plt  # Plotting functionality
import seaborn as sns  # Statistical data visualization
import plotly.express as px  # Interactive plotting

# Datasets

In [3]:
# get file id to share publically
!gdown 1pwezEtuH709qhvlxkUdspc9_jM6gijtk

Downloading...
From: https://drive.google.com/uc?id=1pwezEtuH709qhvlxkUdspc9_jM6gijtk
To: /content/cardio_preprocessed
  0% 0.00/4.60M [00:00<?, ?B/s]100% 4.60M/4.60M [00:00<00:00, 17.5MB/s]100% 4.60M/4.60M [00:00<00:00, 17.5MB/s]


In [4]:
cardio = pd.read_csv('/content/cardio_preprocessed')

In [5]:
cardio.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi,age_group,bp_group
0,50.0,2,168.0,62.0,110,80.0,1,1,0,0,1,0,21.96712,2.0,1.0
1,55.0,1,156.0,85.0,140,90.0,3,1,0,0,1,1,34.927679,2.0,2.0
2,52.0,1,165.0,64.0,130,70.0,3,1,0,0,0,1,23.507805,2.0,2.0
3,48.0,2,169.0,82.0,150,100.0,1,1,0,0,1,1,28.710479,2.0,2.0
4,48.0,1,156.0,56.0,100,65.0,1,1,0,0,0,0,23.011177,2.0,1.0


In [6]:
cardio.shape

(69976, 15)

# Supervised Learning - Classification

Input features $(X)$ → Output class $(y)$

In [7]:
# Features attributes
X = cardio.drop(['cardio'], axis='columns')

# Target attribute 'cardio'
y = cardio['cardio']

In [8]:
print(X.shape)
print(y.shape)

(69976, 14)
(69976,)


# Machine Learning Models

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    'Logistic Regression': LogisticRegression(max_iter = 10000),
    'Random Forest': RandomForestClassifier(),
    'Gaussian Naive Bayes': GaussianNB(),
    # 'XGB Classifier': XGBClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Support Vector Classifier': SVC(),
}

# Evaluation

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Cross Validation


In [12]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Dictionary to store results
results_dict = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': [], 'ROC AUC': []}

# Define scoring metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted'),
    'roc_auc': make_scorer(roc_auc_score)
}

# Perform cross-validation for each model
for model_name, model in models.items():
    cv_results = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)

    results_dict['Model'].append(model_name)
    results_dict['Accuracy'].append(cv_results['test_accuracy'].mean())
    results_dict['Precision'].append(cv_results['test_precision'].mean())
    results_dict['Recall'].append(cv_results['test_recall'].mean())
    results_dict['F1 Score'].append(cv_results['test_f1_score'].mean())
    results_dict['ROC AUC'].append(cv_results['test_roc_auc'].mean())

# Create and display the results DataFrame
results_df = pd.DataFrame(results_dict)

In [32]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,Logistic Regression,0.725491,0.727213,0.725491,0.724889,0.725307
1,Random Forest,0.711486,0.711574,0.711486,0.711436,0.71144
2,Gaussian Naive Bayes,0.722615,0.724341,0.722615,0.721997,0.72243
4,KNN,0.689586,0.689885,0.689586,0.689413,0.689498
5,Support Vector Classifier,0.722258,0.729675,0.722258,0.719824,0.721886


In [33]:
def df_latex(df):
    print(df.to_latex(index=False, bold_rows = True, formatters={"name": str.upper}, float_format="{:.4f}".format,))

In [34]:
df_latex(results_df)

\begin{tabular}{lrrrrr}
\toprule
                    Model &  Accuracy &  Precision &  Recall &  F1 Score &  ROC AUC \\
\midrule
      Logistic Regression &    0.7255 &     0.7272 &  0.7255 &    0.7249 &   0.7253 \\
            Random Forest &    0.7115 &     0.7116 &  0.7115 &    0.7114 &   0.7114 \\
     Gaussian Naive Bayes &    0.7226 &     0.7243 &  0.7226 &    0.7220 &   0.7224 \\
                      KNN &    0.6896 &     0.6899 &  0.6896 &    0.6894 &   0.6895 \\
Support Vector Classifier &    0.7223 &     0.7297 &  0.7223 &    0.7198 &   0.7219 \\
\bottomrule
\end{tabular}



  print(df.to_latex(index=False, bold_rows = True, formatters={"name": str.upper}, float_format="{:.4f}".format,))


# Hyperparameter Tuning

Due to resource constraints, we initially employed cross-validation to identify the best model using default hyperparameters. Subsequently, hyperparameter tuning was exclusively performed on the selected optimal model, which was found to be logistic regression.

In [36]:
# parameters = {
#     'LogisticRegression': {
#         'model': LogisticRegression(solver='liblinear', multi_class='auto', max_iter = 5000),
#         'params': {
#             'penalty': ['l1', 'l2'],
#             'C': [0.001, 0.01, 0.1, 1, 5, 10],
#             }
#     },
# }

# from sklearn.model_selection import GridSearchCV

# score = []
# for i, j in parameters.items():
#     model = GridSearchCV(j['model'], j['params'], cv = 5, return_train_score = False)
#     model.fit(X_train, y_train)
#     score.append({
#         'Model': i,
#         'Best_score': model.best_score_,
#         'Best_params': model.best_params_
#     })

# results = pd.DataFrame(score)
# print(results)

## Logistic Regression

In [38]:
logistic = LogisticRegression(max_iter = 10000)
logistic.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


# Make predictions on the test set
y_pred = logistic.predict(X_test)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)


Confusion Matrix:
 [[5332 1569]
 [2288 4807]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.77      0.73      6901
           1       0.75      0.68      0.71      7095

    accuracy                           0.72     13996
   macro avg       0.73      0.73      0.72     13996
weighted avg       0.73      0.72      0.72     13996

Accuracy Score: 0.7244212632180623


# Neural Network

In [31]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Define the model
model = Sequential([
    Dense(units=32, activation='relu', input_dim = X_train.shape[1]),
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=64, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


<keras.callbacks.History at 0x7cb40c679f30>

In [None]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

Test Loss: 0.545889139175415, Test Accuracy: 0.7290654182434082
