In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Read the cleaned CSV into a DataFrame and preview its first rows.
csv_path = "resources/cleaner_bmi.csv"
heart_df = pd.read_csv(csv_path)
heart_df.head()

Unnamed: 0,id,age (years),gender,BMI,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,22.015308,110,80,1,1,0,0,1,0
1,1,55,1,34.850994,140,90,3,1,0,0,1,1
2,2,51,1,23.489511,130,70,3,1,0,0,0,1
3,3,48,2,28.742724,150,100,1,1,0,0,1,1
4,4,47,1,22.923381,100,60,1,1,0,0,0,0


In [4]:
# Show the column labels of the loaded frame before choosing features.
heart_df.columns

Index(['id', 'age (years)', 'gender', 'BMI', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [5]:
# Predictors: every column except the row identifier and the label.
X = heart_df.drop(columns=["id", "cardio"])
# Target: the "cardio" label column.
y = heart_df["cardio"]

In [6]:
# Class balance of the target (y is the "cardio" column of heart_df).
y.value_counts()

0    33661
1    32178
Name: cardio, dtype: int64

In [7]:
# Split into train and test partitions using train_test_split's default
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)
X_train.shape

(49379, 10)

In [8]:
# Standardise the features: the scaler learns its statistics from the
# training split only and is then applied unchanged to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the scaler, so X_scaler is the same object

# NOTE(review): the model cell below fits on the raw X_train / X_test, so
# these scaled arrays are not consumed by any cell visible in this notebook.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# Gradient-boosting hyperparameters, gathered in one place for easy tuning.
gb_params = {
    "n_estimators": 10,
    "learning_rate": 0.5,
    "max_features": 5,
    "max_depth": 5,
    "random_state": 0,
}
classifier = GradientBoostingClassifier(**gb_params)

# Train on the raw (unscaled) training features, then predict the held-out split.
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Preview the first ten predictions next to the true labels
# (rows keep y_test's original index).
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(10)

Unnamed: 0,Prediction,Actual
815,1,1
7536,1,1
19670,0,1
25782,0,1
13329,0,1
31379,1,1
1701,1,1
21718,1,1
25078,1,1
48149,1,1


In [10]:
# Accuracy on the held-out split: share of test rows whose predicted label
# equals the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7416160388821386


In [11]:
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(y_test, predictions)

# Wrap the raw counts in a labelled frame so the table is self-explanatory.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6706,1696
Actual 1,2557,5501


In [12]:
# Per-class precision / recall / F1 on the test split, printed as plain text.
report = classification_report(y_test, predictions)
print("Classification Report")
print(report)

Classification Report
              precision    recall  f1-score   support

           0       0.72      0.80      0.76      8402
           1       0.76      0.68      0.72      8058

    accuracy                           0.74     16460
   macro avg       0.74      0.74      0.74     16460
weighted avg       0.74      0.74      0.74     16460



In [13]:
# Feature importances of the fitted gradient-boosting classifier.
# The array is positional; the next cell pairs it with X.columns to attach names.
importances = classifier.feature_importances_
importances

array([0.1204525 , 0.00097065, 0.02733458, 0.72189056, 0.02152364,
       0.0848113 , 0.00770463, 0.00440655, 0.00240115, 0.00850444])

In [14]:
# Pair each importance score with its feature name, then rank from most to
# least important (tuples sort on the score first).
importance_by_feature = zip(classifier.feature_importances_, X.columns)
sorted(importance_by_feature, reverse=True)

[(0.7218905593492365, 'ap_hi'),
 (0.12045249821354982, 'age (years)'),
 (0.08481130120024699, 'cholesterol'),
 (0.02733457837214497, 'BMI'),
 (0.02152364189295545, 'ap_lo'),
 (0.00850443547477467, 'active'),
 (0.0077046318524487895, 'gluc'),
 (0.0044065487236198886, 'smoke'),
 (0.0024011522938161187, 'alco'),
 (0.0009706526272070112, 'gender')]

In [15]:
# Serialise the fitted classifier to GBmodel.pkl so it can be reloaded
# without retraining. `pickle` is already imported in the first cell.
# The context manager closes the file handle even if pickle.dump raises,
# which the previous manual open()/close() pair did not guarantee.
# Only the classifier is saved; it was fit on the unscaled features, so
# callers should pass raw feature values in the training column order.
with open("GBmodel.pkl", "wb") as pickle_out:
    pickle.dump(classifier, pickle_out)