In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter has no category or module restriction, so
# it also hides any warnings raised while the estimators further down are
# fitted — consider narrowing it once the noisy source is identified.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Read resources/cleaner_bmi.csv into a DataFrame and preview its first
# five rows.
heart_df = pd.read_csv(filepath_or_buffer="resources/cleaner_bmi.csv")
heart_df.head(n=5)

Unnamed: 0,id,age (years),gender,BMI,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,22.015308,110,80,1,1,0,0,1,0
1,1,55,1,34.850994,140,90,3,1,0,0,1,1
2,2,51,1,23.489511,130,70,3,1,0,0,0,1
3,3,48,2,28.742724,150,100,1,1,0,0,1,1
4,4,47,1,22.923381,100,60,1,1,0,0,0,0


In [4]:
# List the column labels of the loaded frame.
heart_df.columns

Index(['id', 'age (years)', 'gender', 'BMI', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [5]:
# Features: every column except "id" and the label itself.
X = heart_df.drop(columns=["id", "cardio"])
# Target: the "cardio" column.
y = heart_df["cardio"]

In [6]:
# Class balance of the target column.
heart_df.loc[:, "cardio"].value_counts()

0    33661
1    32178
Name: cardio, dtype: int64

In [14]:
# Hold out 25% of the rows for testing (the value train_test_split uses when
# no size is given), with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=78
)
X_train.shape

(49379, 10)

In [15]:
# Create the scaler and fit it on the training split only, so the test split
# does not influence the scaling statistics.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [16]:
# Candidate classifiers keyed by display name. Entries wrapped in GridSearchCV
# choose their hyper-parameters by 5-fold cross-validation when fitted.
#
# random_state is pinned on the randomised estimators (random forest and
# gradient boosting) so that re-running the notebook reproduces the same
# scores; 0 matches the seed already used for LogisticRegression and SVC.
#
# NOTE(review): the LightGBM entry combines max_depth=1 with num_leaves=1700;
# a depth-1 tree cannot grow that many leaves — confirm these values are
# intentional.
mls = {
    "LGBMClassifier": LGBMClassifier(
        colsample_bytree=0.8,
        learning_rate=0.0007,
        max_depth=1,
        min_child_weight=4,
        n_estimators=850,
        num_leaves=1700,
    ),
    "RandomForestClassifier": GridSearchCV(
        estimator=RandomForestClassifier(random_state=0),
        param_grid={
            'n_estimators': [100, 300],
            'max_depth': [5, 10],
            'max_features': [4, 5],
        },
        cv=5,
    ),
    "LogisticRegression": LogisticRegression(solver='lbfgs', max_iter=200, random_state=0),
    "KNeighborsClassifier": GridSearchCV(
        estimator=KNN(),
        param_grid={'n_neighbors': [3, 4, 5]},
        cv=5,
    ),
    "SVM": SVC(C=100, gamma=0.00001, kernel="rbf", random_state=0),
    "GradientBoostingClassifier": GridSearchCV(
        estimator=GradientBoostingClassifier(random_state=0),
        param_grid={
            'n_estimators': [10, 15],
            'learning_rate': [0.25, 0.5, 0.75],
            'max_features': [4, 5],
            'max_depth': [5, 10],
        },
        cv=5,
    ),
}

In [17]:
# Fit every candidate on the scaled training data, score it on the scaled
# test split, and collect one {name, accuracy} record per model.
all_models = []
for x, model in mls.items():
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)

    # Accuracy expressed as a percentage, rounded to one decimal place.
    acc_score = round(100 * accuracy_score(y_test, predictions), 1)
    all_models.append(dict(name=x, accuracy=acc_score))

    # Print the model name once it has been fitted and scored.
    print(x)

LGBMClassifier
RandomForestClassifier
LogisticRegression
KNeighborsClassifier
SVM
GradientBoostingClassifier


In [18]:
# Tabulate the collected scores and show them best-first.
ml_df = pd.DataFrame(data=all_models)
ml_df.sort_values(by="accuracy", ascending=False)

Unnamed: 0,name,accuracy
5,GradientBoostingClassifier,74.0
1,RandomForestClassifier,73.9
2,LogisticRegression,73.4
4,SVM,73.2
0,LGBMClassifier,71.9
3,KNeighborsClassifier,69.9


In [19]:
# Evaluate the top-scoring model explicitly instead of relying on whatever
# `predictions` was left over from the final iteration of the training loop.
# On a tie, idxmax keeps the first model in insertion order.
best_name = ml_df.loc[ml_df["accuracy"].idxmax(), "name"]
predictions = mls[best_name].predict(X_test_scaled)

# Generate the confusion matrix; labels are passed explicitly so the row and
# column order is guaranteed to match the 0/1 headings used below.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results, labelled with the model they belong to
print(f"Confusion matrix for {best_name}")
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6678,1724
Actual 1,2559,5499


In [20]:
# Per-class precision, recall and F1 for the values currently held in
# `predictions`, printed under a heading line.
print(
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Classification Report
              precision    recall  f1-score   support

           0       0.72      0.79      0.76      8402
           1       0.76      0.68      0.72      8058

    accuracy                           0.74     16460
   macro avg       0.74      0.74      0.74     16460
weighted avg       0.74      0.74      0.74     16460

