In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             accuracy_score, roc_auc_score, RocCurveDisplay, f1_score)
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
import pickle

In [6]:
# Read data with raw preprocessing
df = pd.read_csv('heart_disease.csv')
df = df.replace({'Yes': 1, 'No': 0, 'Yes (during pregnancy)':1, 'No, borderline diabetes':0, 'Female': 1, 'Male': 0}).drop(columns=['Race', 'GenHealth','PhysicalHealth','MentalHealth'])
df['AgeCategory'] = df['AgeCategory'].apply(lambda x: x[:2]).astype(int) 
df = df.sample(frac=0.1, random_state=1)
print(df.head())
print(np.unique(df['HeartDisease'], return_counts=True))

  df = df.replace({'Yes': 1, 'No': 0, 'Yes (during pregnancy)':1, 'No, borderline diabetes':0, 'Female': 1, 'Male': 0}).drop(columns=['Race', 'GenHealth','PhysicalHealth','MentalHealth'])


        HeartDisease    BMI  Smoking  AlcoholDrinking  Stroke  DiffWalking  \
301988             0  24.30        0                0       0            0   
223127             1  23.78        1                0       1            1   
216797             0  20.60        1                0       0            0   
234217             0  28.29        1                0       0            0   
30822              1  33.00        0                0       0            0   

        Sex  AgeCategory  Diabetic  PhysicalActivity  SleepTime  Asthma  \
301988    1           40         0                 1        7.0       0   
223127    1           80         1                 0        7.0       0   
216797    1           70         0                 1        7.0       0   
234217    1           25         0                 0        4.0       0   
30822     0           75         1                 1        8.0       0   

        KidneyDisease  SkinCancer  
301988              0           0  
223127  

In [7]:
# Data Standardization, Spilit & Oversample
feature = df.drop(columns=['HeartDisease'])
X = feature.values
y = df['HeartDisease'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1, stratify=y)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

ros = RandomOverSampler(random_state=1)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [8]:
# GridSearch training
models = [svm.SVC(), linear_model.LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier()]
hyperpas = [{'kernel':['linear','rbf'], 'C':[0.01, 1, 100, 1000, 5000]},
            {'max_iter':[1000, 5000, 10000], 'C':[0.01, 1, 100, 1000, 5000]},
            {'n_estimators':[1,10,30,50,100], 'max_depth':[1, 3, 5, 7, 10, 20, 50]},
            {'n_neighbors':[1, 2, 3, 4, 5, 10, 20, 50]}]
best_index = -1
best_score = -np.inf
best_clf = None
for i in range(4):
    curr_model = models[i]
    curr_clf = GridSearchCV(curr_model, hyperpas[i], scoring='f1',n_jobs=16, cv=5)
    curr_clf.fit(X_train, y_train)
    curr_score = curr_clf.best_score_
    if curr_score > best_score:
        best_index = i
        best_score = curr_score
        best_clf = curr_clf
    print(f'Current model = {curr_model}, Best score for this model = {curr_score:.3}, Parameters = {curr_clf.best_params_}')
print(f'\nBest model = {models[best_index]}, Best score = {best_score:.3}, Parameters = {best_clf.best_params_}')

Current model = SVC(), Best score for this model = 0.802, Parameters = {'C': 5000, 'kernel': 'rbf'}
Current model = LogisticRegression(), Best score for this model = 0.74, Parameters = {'C': 100, 'max_iter': 1000}
Current model = RandomForestClassifier(), Best score for this model = 0.965, Parameters = {'max_depth': 50, 'n_estimators': 10}
Current model = KNeighborsClassifier(), Best score for this model = 0.947, Parameters = {'n_neighbors': 1}

Best model = RandomForestClassifier(), Best score = 0.965, Parameters = {'max_depth': 50, 'n_estimators': 10}


  _data = np.array(data, dtype=dtype, copy=copy,


In [9]:
# Check performance on testing dataset
y_hat = best_clf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_hat):.3}, Precision: {precision_score(y_test, y_hat):.3}, Recall: {recall_score(y_test, y_hat):.3}, F1: {f1_score(y_test, y_hat):.3}, AUC: {roc_auc_score(y_test, y_hat):.3}')

Accuracy: 0.878, Precision: 0.237, Recall: 0.194, F1: 0.213, AUC: 0.568


In [10]:
# Display feature importance
feature_importance = permutation_importance(best_clf, X_test, y_test, scoring='f1', random_state=1)
clf_feature = pd.DataFrame({'Feature':feature.columns,'Importance':feature_importance.importances_mean})
clf_feature = clf_feature.sort_values(by='Importance',ascending=False)
clf_feature

Unnamed: 0,Feature,Importance
6,AgeCategory,0.0809
3,Stroke,0.035187
4,DiffWalking,0.03485
7,Diabetic,0.024333
11,KidneyDisease,0.019632
8,PhysicalActivity,0.018288
5,Sex,0.017351
2,AlcoholDrinking,0.00636
1,Smoking,0.005748
12,SkinCancer,0.001333


In [11]:
# Save the best model locally
filename = 'pretrained_model.sav'
pickle.dump(best_clf, open(filename, 'wb'))

In [12]:
# Save the scaler
with open('scaler.sav', 'wb') as f:
    pickle.dump(scaler, f)