In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             accuracy_score, roc_auc_score, RocCurveDisplay, f1_score)
from imblearn.over_sampling import RandomOverSampler

In [10]:
df = pd.read_csv('heart_disease.csv')
df = df.replace({'Yes': 1, 'No': 0, 'Yes (during pregnancy)':1, 'No, borderline diabetes':0, 'Female': 1, 'Male': 0}).drop(columns=['Race', 'GenHealth'])
df['AgeCategory'] = df['AgeCategory'].apply(lambda x: x[:2])
df = df.sample(frac=0.01, random_state=0)
df.head()

  df = df.replace({'Yes': 1, 'No': 0, 'Yes (during pregnancy)':1, 'No, borderline diabetes':0, 'Female': 1, 'Male': 0}).drop(columns=['Race', 'GenHealth'])


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
126167,0,23.44,0,0,0,10.0,20.0,1,1,80,0,0,6.0,0,0,0
207506,0,32.49,0,0,0,0.0,4.0,0,0,40,0,1,8.0,0,0,0
274544,0,21.93,0,0,0,0.0,0.0,0,0,60,0,1,7.0,0,0,0
121049,0,26.58,0,0,0,0.0,2.0,0,0,45,0,1,7.0,0,0,0
260961,0,19.02,1,0,0,2.0,2.0,0,1,80,0,1,6.0,0,0,0


In [11]:
feature = df.drop(columns=['HeartDisease'])
X = feature.values
y = df['HeartDisease'] 

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=.2, random_state=0, stratify=y)

ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)
X_test, y_test = ros.fit_resample(X_test, y_test)

In [12]:
models = [svm.SVC(), linear_model.LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier()]
hyperpas = [{'kernel':['linear','rbf'], 'C':[0.01, 1, 100, 1000, 5000]},
            {'max_iter':[1000, 5000, 10000], 'C':[0.01, 1, 100, 1000, 5000]},
            {'criterion':['entropy'], 'max_depth':[1, 3, 5, 7, 10, 20, 50]},
            {'n_neighbors':[1, 2, 3, 4, 5, 10, 20, 50]}, {'random_state':[0]}]
best_index = -1
best_score = -np.inf
best_clf = None
for i in range(4):
    curr_model = models[i]
    curr_clf = GridSearchCV(curr_model, hyperpas[i], scoring='f1', cv=5)
    curr_clf.fit(X_train, y_train)
    curr_score = curr_clf.best_score_
    if curr_score > best_score:
        best_index = i
        best_score = curr_score
        best_clf = curr_clf
    print(f'Current model = {curr_model}, Best score for this model = {curr_score:.3}, Parameters = {curr_clf.best_params_}')
print(f'\nBest index = {best_index}, Best score = {best_score:.3}, Best classifier = {best_clf}, Parameters = {best_clf.best_params_}')

Current model = SVC(), Best score for this model = 0.922, Parameters = {'C': 5000, 'kernel': 'rbf'}
Current model = LogisticRegression(), Best score for this model = 0.796, Parameters = {'C': 100, 'max_iter': 1000}
Current model = DecisionTreeClassifier(), Best score for this model = 0.961, Parameters = {'criterion': 'entropy', 'max_depth': 50}
Current model = KNeighborsClassifier(), Best score for this model = 0.966, Parameters = {'n_neighbors': 1}

Best index = 3, Best score = 0.966, Best classifier = GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 10, 20, 50]},
             scoring='f1'), Parameters = {'n_neighbors': 1}


In [13]:
y_hat = best_clf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_hat):.3}, Precision: {precision_score(y_test, y_hat):.3}, Recall: {recall_score(y_test, y_hat):.3}, AUC: {roc_auc_score(y_test, best_clf.predict_proba(X_test)[:, 1]):.3}, F1: {f1_score(y_test, y_hat):.3}')

Accuracy: 0.577, Precision: 0.769, Recall: 0.22, AUC: 0.577, F1: 0.343


In [14]:
feature_importance = permutation_importance(best_clf, X_test, y_test, scoring='f1', random_state=0)
clf_feature = pd.DataFrame({'Feature':feature.columns,'Importance':feature_importance.importances_mean})
clf_feature = clf_feature.sort_values(by='Importance',ascending=False)
clf_feature

Unnamed: 0,Feature,Importance
13,KidneyDisease,0.073142
7,Sex,0.066803
1,Smoking,0.045879
8,AgeCategory,0.031112
9,Diabetic,0.014958
3,Stroke,0.014901
5,MentalHealth,0.010837
6,DiffWalking,-0.000528
11,SleepTime,-0.001706
0,BMI,-0.001852
