**XGBoost**

In [3]:
import pandas as pd
import numpy as np

import random

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go 
from plotly.subplots import make_subplots 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from xgboost import XGBClassifier
pd.set_option('display.max_columns', None)

In [None]:
data = pd.read_csv('cleaned_data_cardio_train.csv')
data.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi,pulse
0,50,2,5.123964,4.127134,110,80,1,1,0,0,1,0,21.96712,30.0
1,55,1,5.049856,4.442651,140,90,3,1,0,0,1,1,34.927679,50.0
2,51,1,5.105945,4.158883,130,70,3,1,0,0,0,1,23.507805,60.0
3,48,2,5.129899,4.406719,150,100,1,1,0,0,1,1,28.710479,50.0
4,47,1,5.049856,4.025352,100,60,1,1,0,0,0,0,23.011177,40.0


In [5]:
X = data.drop(['cardio', 'gender', 'height', 'alco', 'smoke', 'active'], axis =1)
Y = data['cardio']
list(X.columns)


['age', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'bmi', 'pulse']

In [6]:
scaler = StandardScaler()
standard_X = scaler.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(standard_X, Y, test_size=0.2,
                                                    random_state=42,
                                                    shuffle=True)

In [8]:
# # Define the parameter grid to search
# param_xgb_grid_search = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.8, 1],
#     'colsample_bytree': [0.8, 1],
#     'gamma': [0, 0.1, 0.2]
# }

param_xgb = {
    'colsample_bytree': 0.8,
    'gamma': 0,
    'learning_rate': 0.1,
    'max_depth': 4,
    'n_estimators': 100,
    'subsample': 1}

In [9]:
# # Set up GridSearchCV
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_xgb, scoring='accuracy', cv=3, verbose=2, n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Get the best parameters and model
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# print("Best Parameters:", best_params)

In [10]:
xgb_model = XGBClassifier(**param_xgb)

# Train the model with the best parameters
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_xgb)
print("Accuracy on test data:", accuracy)

Accuracy on test data: 0.7264623084141383


In [11]:
# Perform cross-validation
scores = cross_val_score(xgb_model, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1)

# Print the cross-validation results
print('XGBoost Model gives an average accuracy of {0:.2f}% with a minimum of {1:.2f}% and a maximum of {2:.2f}% accuracy'.format(
    np.mean(scores) * 100, 
    np.min(scores) * 100, 
    np.max(scores) * 100
))

XGBoost Model gives an average accuracy of 73.13% with a minimum of 72.36% and a maximum of 74.47% accuracy


In [12]:
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.71      0.76      0.73      6307
           1       0.75      0.70      0.72      6481

    accuracy                           0.73     12788
   macro avg       0.73      0.73      0.73     12788
weighted avg       0.73      0.73      0.73     12788



**K-Nearest Neighbors**

In [13]:
# param_knn = {'n_neighbors':list(range(0, 51)),
#           'weights':['uniform', 'distance'],
#           'p':[1,2]}

param_knn = {
    'n_neighbors': 50,
    'p': 1,
    'weights': 'uniform'
    }

In [14]:
# knn = KNeighborsClassifier()
# knn_grid_cv = GridSearchCV(knn, param_grid=param_knn, cv=10) 
# knn_grid_cv.fit(X_train, y_train)
# print("Best Hyper Parameters:\n",knn_grid_cv.best_params_)

# print("Best Hyper Parameters: {'n_neighbors': 50, 'p': 1, 'weights': 'uniform'}")

In [15]:
knn_model = KNeighborsClassifier(**param_knn)
knn_model.fit(X_train, y_train) 

In [16]:
y_pred_knn = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_knn)
print("Accuracy on test data:", accuracy)

Accuracy on test data: 0.7213794182045667


In [17]:
# Perform cross-validation
scores = cross_val_score(knn_model, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1)

# Print the cross-validation results
print('KNN Model gives an average accuracy of {0:.2f}% with a minimum of {1:.2f}% and a maximum of {2:.2f}% accuracy'.format(
    np.mean(scores) * 100, 
    np.min(scores) * 100, 
    np.max(scores) * 100
))

KNN Model gives an average accuracy of 72.58% with a minimum of 71.83% and a maximum of 73.80% accuracy


In [18]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.70      0.77      0.73      6307
           1       0.75      0.67      0.71      6481

    accuracy                           0.72     12788
   macro avg       0.72      0.72      0.72     12788
weighted avg       0.72      0.72      0.72     12788



**Random Forest**

In [19]:
# param_rf = { 
#     'n_estimators': [10, 50, 100, 150, 200, 300, 400, 500],
#     'max_depth' : [10,20,30,40,50],
#     'criterion' : ['entropy','gini']
# }

param_rfc = {
    'criterion': 'entropy',
    'max_depth': 10,
    'n_estimators': 100
    }

In [20]:
# # rfc_gridcv = RandomForestClassifier(random_state=42)
# # rfc_gridcv = GridSearchCV(estimator=rfc_gridcv, param_grid=param_rf, cv= 10, n_jobs = -1)
# # rfc_gridcv.fit(X_train, y_train)
# # print("Best Hyper Parameters:\n",rfc_gridcv.best_params_)

# print("Best Hyper Parameters:{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}")

In [21]:
rfc_model = RandomForestClassifier(random_state=42, **param_rfc)
rfc_model.fit(X_train, y_train)

In [22]:
y_pred_rfc = rfc_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rfc)
print("Accuracy on test data:", accuracy)

Accuracy on test data: 0.7225523928683141


In [23]:
# Perform cross-validation
scores = cross_val_score(rfc_model, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1)

# Print the cross-validation results
print('Random Forest Classifier Model gives an average accuracy of {0:.2f}% with a minimum of {1:.2f}% and a maximum of {2:.2f}% accuracy'.format(
    np.mean(scores) * 100, 
    np.min(scores) * 100, 
    np.max(scores) * 100
))

Random Forest Classifier Model gives an average accuracy of 73.02% with a minimum of 72.14% and a maximum of 74.80% accuracy


In [24]:
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.70      0.77      0.73      6307
           1       0.75      0.68      0.71      6481

    accuracy                           0.72     12788
   macro avg       0.72      0.72      0.72     12788
weighted avg       0.73      0.72      0.72     12788



**Logistic Regression**

In [25]:
# params_for_l1 = { 
#     'C' :  np.logspace(0, 4, 10),
#     'solver' : ['liblinear', 'saga']
# }

# params_for_l2 = { 
#     'C' :  np.logspace(0, 4, 10),
#     'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }

# params_for_elasticnet = { 
#     'C' :  np.logspace(0, 4, 10),
#     'l1_ratio' : np.arange (0.1, 1.0, 0.1),
#     'solver' : ['saga']
# }

In [26]:
# logreg_with_l1_gridcv = LogisticRegression(penalty = 'l1')
# logreg_with_l1_gridcv = GridSearchCV(estimator=logreg_with_l1_gridcv, param_grid=params_for_l1, cv= 10, n_jobs = -1)
# logreg_with_l1_gridcv.fit(X_train, y_train)
# print("Best Hyper Parameters:\n",logreg_with_l1_gridcv.best_params_)

# print("Best Hyper Parameters:{'C': 166.81005372000593, 'solver': 'saga'}")

In [27]:
# logreg_with_l1 = LogisticRegression(penalty = 'l1', C = 166.81005372000593, solver = 'saga')
# logreg_with_l1.fit(X_train, y_train)

In [28]:
# logreg_with_l2_gridcv = LogisticRegression(penalty = 'l2')
# logreg_with_l2_gridcv = GridSearchCV(estimator=logreg_with_l2_gridcv, param_grid=params_for_l2, cv= 10, n_jobs = -1)
# logreg_with_l2_gridcv.fit(X_train, y_train)
# print("Best Hyper Parameters:\n",logreg_with_l2_gridcv.best_params_)

# print("Best Hyper Parameters:{'C': 1.0, 'solver': 'liblinear'}")

In [29]:
# logreg_with_l2 = LogisticRegression(penalty = 'l2', C = 1.0, solver = 'liblinear')
# logreg_with_l2.fit(X_train, y_train)

In [30]:
'''logreg_with_elasticnet_gridcv = LogisticRegression(penalty = 'elasticnet')
logreg_with_elasticnet_gridcv = GridSearchCV(estimator=logreg_with_elasticnet_gridcv, param_grid=params_for_elasticnet, cv= 10, n_jobs = -1)
logreg_with_elasticnet_gridcv.fit(X_train, y_train)
print("Best Hyper Parameters:\n",logreg_with_elasticnet_gridcv.best_params_)'''


# print("Best Hyper Parameters:{'C': 1291.5496650148827, 'l1_ratio': 0.6, 'solver': 'saga'}")

'logreg_with_elasticnet_gridcv = LogisticRegression(penalty = \'elasticnet\')\nlogreg_with_elasticnet_gridcv = GridSearchCV(estimator=logreg_with_elasticnet_gridcv, param_grid=params_for_elasticnet, cv= 10, n_jobs = -1)\nlogreg_with_elasticnet_gridcv.fit(X_train, y_train)\nprint("Best Hyper Parameters:\n",logreg_with_elasticnet_gridcv.best_params_)'

In [31]:
# logreg_with_elasticnet = LogisticRegression(penalty = 'elasticnet', C = 1291.5496650148827, l1_ratio =  0.6, solver = 'saga')
# logreg_with_elasticnet.fit(X_train, y_train)

In [32]:
# scores = cross_val_score(knn, X_train, y_train, cv=10)
# print('KNN Model gives an average accuracy of {0:.2f} % with minimun of {1:.2f} % and maximum of {2:.2f} % accuracy'.format(scores.mean() * 100, scores.min() * 100, scores.max() * 100))

In [33]:
# Y_pred_knn = knn.predict(X_test)
# print(classification_report(y_test, Y_pred_knn))

In [34]:
# print('True Positive Cases : {}'.format(confusion_matrix(y_test, Y_pred_knn)[1][1]))
# print('True Negative Cases : {}'.format(confusion_matrix(y_test, Y_pred_knn)[0][0]))
# print('False Positive Cases : {}'.format(confusion_matrix(y_test, Y_pred_knn)[0][1]))
# print('False Negative Cases : {}'.format(confusion_matrix(y_test, Y_pred_knn)[1][0]))

In [35]:
# Y_pred_rfc = rfc.predict(X_test)
# print(classification_report(y_test, Y_pred_rfc))

In [36]:
# print('True Positive Cases : {}'.format(confusion_matrix(y_test, Y_pred_rfc)[1][1]))
# print('True Negative Cases : {}'.format(confusion_matrix(y_test, Y_pred_rfc)[0][0]))
# print('False Positive Cases : {}'.format(confusion_matrix(y_test, Y_pred_rfc)[0][1]))
# print('False Negative Cases : {}'.format(confusion_matrix(y_test, Y_pred_rfc)[1][0]))

In [37]:
models = {
    'xgb_model': xgb_model,
    'knn_model': knn_model,
    'rfc_model': rfc_model
}


def validate_input(prompt, value_type, min_value=None, max_value=None, choices=None):
    while True:
        try:
            user_input = value_type(input(prompt))
            if choices and user_input not in choices:
                raise ValueError("Invalid choice. Please select from the given options.")
            if (min_value is not None and user_input < min_value) or (max_value is not None and user_input > max_value):
                raise ValueError(f"Input must be between {min_value} and {max_value}.")
            return user_input
        except ValueError as e:
            print(e)



def predict_with_models(option, models, X_test, y_test, scaler):
    features = ['age', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'bmi', 'pulse']
    
    if option == 1:
        # Collect and validate user input
        print("Enter values for the following features:")
        
        age = validate_input("Age (18–120): ", int, 18, 120)
        weight = validate_input("Weight in kg (40–200): ", float, 40, 200)
        height = validate_input("Height in cm (100–250): ", float, 100, 250)  # Height needed for BMI calculation
        ap_hi = validate_input("Systolic BP (90–180): ", int, 90, 180)
        ap_lo = validate_input("Diastolic BP (60–120): ", int, 60, 120)
        cholesterol = validate_input("Cholesterol (1: normal, 2: above normal, 3: well above normal): ", int, choices=[1, 2, 3])
        gluc = validate_input("Glucose level (1: normal, 2: above normal, 3: well above normal): ", int, choices=[1, 2, 3])
        
        # Calculate bmi and pulse based on formulas
        bmi = weight / ((height / 100) ** 2)
        pulse = ap_hi - ap_lo  # Replace with specific pulse formula if needed
        
        # Display the user's observation
        observation = [age, weight, ap_hi, ap_lo, cholesterol, gluc, bmi, pulse]
        print("\nPredicted values for observation:")
        for feat, val in zip(features, observation):
            print(f"{feat}: {val}")

        # Transform user input for model compatibility
        observation = scaler.transform([observation])  # Transform without fitting again

        # Make predictions with each model
        for model_name, model in models.items():
            prediction = model.predict(observation)[0]
            diagnosis = "Had Heart Disease" if prediction == 1 else "No Heart Disease"
            print(f"{model_name} predicts: {prediction} ({diagnosis})")

    elif option == 2:
        # Prompt user for number of observations
        num_samples = validate_input("Enter the number of random observations to display: ", int, min_value=1)
        num_samples = min(num_samples, len(X_test))  # Ensure it doesn't exceed bounds
        indices = np.random.choice(len(X_test), num_samples, replace=False)

        # Create a DataFrame to hold predictions
        results = pd.DataFrame(index=[model_name for model_name in models.keys()])
        
        # Collect predictions and true labels
        true_labels = []  # Initialize a list for true labels

        for idx in indices:
            X_sample = X_test[idx].reshape(1, -1)  # Reshape for model input
            true_label = y_test.iloc[idx]  # Access using .iloc for Series
            
            # Append the true label to the list
            true_labels.append(true_label)

            # Store predictions in the DataFrame
            for model_name, model in models.items():
                prediction = model.predict(X_sample)[0]
                results.loc[model_name, f'OBS {len(true_labels)}'] = prediction  # Store predictions

        # Add true labels to the DataFrame as a new row
        results.loc['True Label'] = true_labels
        
        # Display the results
        print("Predictions for random observations:")
        print(results)

    else:
        print("Invalid option selected. Please choose 1 or 2.")

In [38]:
# Run function with desired option
option = int(input("Choose option (1 for user input, 2 for test set demo): "))
predict_with_models(option, models, X_test, y_test, scaler)

Predictions for random observations:
            OBS 1  OBS 2  OBS 3  OBS 4  OBS 5  OBS 6  OBS 7  OBS 8  OBS 9  \
xgb_model     1.0    0.0    0.0    0.0    0.0    1.0    1.0    0.0    1.0   
knn_model     0.0    0.0    0.0    0.0    0.0    1.0    1.0    1.0    1.0   
rfc_model     1.0    0.0    0.0    0.0    0.0    1.0    1.0    0.0    1.0   
True Label    1.0    0.0    0.0    0.0    0.0    1.0    0.0    1.0    1.0   

            OBS 10  
xgb_model      1.0  
knn_model      0.0  
rfc_model      0.0  
True Label     0.0  
