In [17]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

In [2]:
# Read the cleaned dataset from the working directory and preview the
# first five rows as a quick sanity check.
data = pd.read_csv(filepath_or_buffer="cleaned_data_v2.csv")
data.head(5)

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI,BP Category
0,0,M,168,62.0,110.0,80.0,1,1,0,0,1,0,50,21.96712,Normal
1,1,F,156,85.0,140.0,90.0,3,1,0,0,1,1,55,34.927679,High Blood Pressure Stage 2
2,2,F,165,64.0,130.0,70.0,3,1,0,0,0,1,51,23.507805,High Blood Pressure Stage 1
3,3,M,169,82.0,150.0,100.0,1,1,0,0,1,1,48,28.710479,High Blood Pressure Stage 2
4,4,F,156,56.0,100.0,60.0,1,1,0,0,0,0,47,23.011177,Normal


In [3]:
# Feature matrix: every column except the target ("cardio") and the
# row identifier ("id").
X = data.drop(labels=["cardio", "id"], axis="columns")

In [4]:
# Target vector: the "cardio" column.
y = data.loc[:, "cardio"]

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Show the training columns that are not numeric.
X_train.select_dtypes(exclude=["number"])

Unnamed: 0,gender,BP Category
47339,F,High Blood Pressure Stage 1
67456,F,High Blood Pressure Stage 2
12308,F,Normal
32557,F,High Blood Pressure Stage 1
664,M,Normal
...,...,...
37194,F,High Blood Pressure Stage 2
6265,M,Normal
54886,M,Normal
860,F,Normal


In [7]:
# Baseline model: histogram gradient boosting with library-default
# hyperparameters. "gender" and "BP Category" are declared as categorical
# features; the seed makes training reproducible.
clf_v1 = (
    HistGradientBoostingClassifier(
        random_state=42,
        categorical_features=["gender", "BP Category"],
    )
    .fit(X_train, y_train)
)

In [8]:
# Accuracy of the baseline model on the held-out test set
# (a classifier's `score` reports mean accuracy).
clf_v1.score(X=X_test, y=y_test)

0.7342857142857143

In [9]:
# Per-class precision / recall / F1 for the baseline model on the test set.
print(
    classification_report(
        y_true=y_test,
        y_pred=clf_v1.predict(X_test),
    )
)

              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6968
           1       0.75      0.70      0.73      7032

    accuracy                           0.73     14000
   macro avg       0.74      0.73      0.73     14000
weighted avg       0.74      0.73      0.73     14000



In [10]:
# Permutation importance of each feature for the baseline model, measured
# on the test set with 10 shuffles per feature and a fixed seed.
imp = permutation_importance(
    estimator=clf_v1,
    X=X_test,
    y=y_test,
    n_repeats=10,
    random_state=42,
)

In [11]:
# Mean importance per feature across the repeats, in the column order of X_test.
imp["importances_mean"]

array([ 5.07142857e-04, -7.28571429e-04,  1.11428571e-03,  1.48771429e-01,
        2.05000000e-03,  3.35000000e-02,  5.07142857e-04,  5.92857143e-04,
       -9.28571429e-05,  3.07142857e-03,  2.72000000e-02,  1.11428571e-03,
        5.85714286e-04])

In [12]:
# Tabulate the raw per-repeat importances: `imp.importances` is transposed so
# that each row is one repeat and each column is labelled with a feature name.
df_importances = pd.DataFrame(
    data=imp["importances"].T,
    columns=X.columns.tolist(),
)

In [13]:
# Optional: uncomment the next line to save the per-repeat importances to disk.
#df_importances.to_csv("feature_importances.csv", index=False)

In [14]:
# Standard deviation of each feature's importance across the repeats.
imp["importances_std"]

array([0.00035492, 0.00046708, 0.00077077, 0.00399116, 0.00113031,
       0.0015263 , 0.00026812, 0.00039389, 0.00032111, 0.00072281,
       0.00210316, 0.00086791, 0.00044584])

In [25]:
# Hyperparameter search space for the grid search.
# `range` only produces integers, so the learning rates are generated as
# tenths and scaled down (see
# https://stackoverflow.com/questions/7267226/range-for-floats).
param_grid = {
    # 0.1, 0.2, ..., 0.9
    'learning_rate': [tenths / 10.0 for tenths in range(1, 10)],
    # 50, 60, ..., 140
    'max_iter': [*range(50, 150, 10)],
    # 25, 26, ..., 36
    'max_leaf_nodes': [*range(25, 37)],
}

In [27]:
# Exhaustive cross-validated search over `param_grid`, using an otherwise
# default gradient-boosting classifier as the template estimator.
# Cross-validation and scoring are left at the GridSearchCV defaults.
hgbc = HistGradientBoostingClassifier(
    random_state=42,
    categorical_features=["gender", "BP Category"],
)
grid_results = GridSearchCV(estimator=hgbc, param_grid=param_grid)
grid_results.fit(X=X_train, y=y_train)

In [28]:
print("Best tuned parameters: ", grid_results.best_params_)
print("Best tuned score: ", grid_results.best_score_)

Best tuned parameters:  {'learning_rate': 0.1, 'max_iter': 50, 'max_leaf_nodes': 29}
Best tuned score:  0.737


In [29]:
# Tuned model: built from the grid-search winner rather than from values
# retyped by hand, so this cell cannot drift from the search result and every
# tuned parameter (learning_rate included) is carried over explicitly.
# Only `hgbc_v2` is bound here; the earlier chained assignment
# (`hgbc_v2 = hgbc = ...`) also overwrote `hgbc`, the template estimator that
# was handed to the grid search.
hgbc_v2 = HistGradientBoostingClassifier(
    categorical_features=["gender", "BP Category"],
    random_state=42,
    **grid_results.best_params_,
)

In [30]:
# Train the tuned model on the training split.
hgbc_v2.fit(X=X_train, y=y_train)

In [32]:
# Accuracy of the tuned model on the held-out test set.
hgbc_v2.score(X=X_test, y=y_test)

0.736

In [33]:
# Per-class precision / recall / F1 for the tuned model on the test set.
print(
    classification_report(
        y_true=y_test,
        y_pred=hgbc_v2.predict(X_test),
    )
)

              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6968
           1       0.76      0.70      0.73      7032

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.74     14000
weighted avg       0.74      0.74      0.74     14000

