In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             accuracy_score, roc_auc_score, RocCurveDisplay)
from imblearn.over_sampling import RandomOverSampler

In [34]:
df = pd.read_csv('heart_disease.csv')
df = df.replace({'Yes': 1, 'No': 0})
df = df.replace({'Yes (during pregnancy)':'1'})
df = df.replace({'No, borderline diabetes':'0'})
df = df.replace({'Female': 1, 'Male': 0})
df = df.drop(columns=['AgeCategory','Race','GenHealth'])
df.head()
df = df.sample(frac=0.01, random_state=0)
df.head()

  df = df.replace({'Yes': 1, 'No': 0})
  df = df.replace({'Female': 1, 'Male': 0})


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
126167,0,23.44,0,0,0,10.0,20.0,1,1,0,0,6.0,0,0,0
207506,0,32.49,0,0,0,0.0,4.0,0,0,0,1,8.0,0,0,0
274544,0,21.93,0,0,0,0.0,0.0,0,0,0,1,7.0,0,0,0
121049,0,26.58,0,0,0,0.0,2.0,0,0,0,1,7.0,0,0,0
260961,0,19.02,1,0,0,2.0,2.0,0,1,0,1,6.0,0,0,0


In [35]:
feature = df.drop(columns=['HeartDisease'])
X = feature.values
y = df['HeartDisease'] 

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_tv, y_train, y_tv = train_test_split(X_scaled, y, test_size=.2, random_state=0, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_tv, y_tv, test_size=.5, random_state=0, stratify=y_tv)

ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)
X_val, y_val = ros.fit_resample(X_train, y_train)

In [None]:
models = [svm.SVC(), linear_model.LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier()]
hyperpas = [{'kernel':['linear','rbf'], 'C':[0.01, 1, 100]},
            {'max_iter':[5000, 10000], 'C':[0.01, 1, 100]},
            {'criterion':['entropy'], 'max_depth':[1, 3, 5, 7]},
            {'n_neighbors':[1, 2, 3, 4]}, {'random_state':[0]}]
best_index = -1
best_score = -np.inf
best_clf = None
for i in range(4):
    curr_model = models[i]
    curr_clf = GridSearchCV(curr_model, hyperpas[i])
    curr_clf.fit(X_train, y_train)
    curr_score = curr_clf.score(X_val, y_val)
    if curr_score > best_score:
        best_index = i
        best_score = curr_score
        best_clf = curr_clf
    print(f'Current model = {curr_model}, Best score for this model = {curr_score:.3}, Parameters = {curr_clf.best_params_}')
print(f'\nBest index = {best_index}, Best score = {best_score:.3}, Best classifier = {best_clf}, Parameters = {best_clf.best_params_}')

In [14]:
# Seprecated
lr = linear_model.LogisticRegression(max_iter=10000,random_state=0)
lr.fit(X_train, y_train)
lr_feature = pd.DataFrame({'Feature':feature.columns,'Coefficient':lr.coef_[0]})
lr_feature = lr_feature.sort_values(by='Coefficient',ascending=False)
print(lr_feature)

             Feature  Coefficient
8           Diabetic     0.409905
6        DiffWalking     0.345569
3             Stroke     0.261027
4     PhysicalHealth     0.214323
12     KidneyDisease     0.205339
13        SkinCancer     0.195566
1            Smoking     0.130738
11            Asthma     0.058507
10         SleepTime    -0.001898
0                BMI    -0.045078
9   PhysicalActivity    -0.084857
2    AlcoholDrinking    -0.167091
5       MentalHealth    -0.175957
7                Sex    -0.313917


In [18]:
# Deprecated
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
feature_importance = permutation_importance(knn, X_test, y_test, scoring='accuracy', random_state=0)
knn_feature = pd.DataFrame({'Feature':feature.columns,'Importance':feature_importance.importances_mean})
knn_feature = knn_feature.sort_values(by='Importance',ascending=False)
print(knn_feature)

             Feature    Importance
12     KidneyDisease  3.000000e-03
11            Asthma  2.000000e-03
10         SleepTime  1.500000e-03
5       MentalHealth -4.440892e-17
8           Diabetic -2.500000e-04
4     PhysicalHealth -5.000000e-04
3             Stroke -1.250000e-03
1            Smoking -1.250000e-03
2    AlcoholDrinking -1.250000e-03
0                BMI -2.000000e-03
7                Sex -2.250000e-03
6        DiffWalking -3.000000e-03
9   PhysicalActivity -4.000000e-03
13        SkinCancer -4.000000e-03


In [9]:
# Deprecated
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
feature_importance = pd.DataFrame({'Feature':feature.columns,'Importance':tree.feature_importances_})
tree_feature = feature_importance.sort_values(by='Importance',ascending=False)
print(tree_feature)

             Feature  Importance
0                BMI    0.391280
10         SleepTime    0.127673
4     PhysicalHealth    0.090477
1            Smoking    0.063330
5       MentalHealth    0.061532
6        DiffWalking    0.053557
9   PhysicalActivity    0.040443
13        SkinCancer    0.038492
8           Diabetic    0.034010
12     KidneyDisease    0.026572
7                Sex    0.024747
3             Stroke    0.022591
11            Asthma    0.013538
2    AlcoholDrinking    0.011759


In [10]:
# Deprecated
svm_linear = svm.SVC(kernel='linear', random_state=0)
svm_linear.fit(X_train, y_train)
svm_feature = pd.DataFrame({'Feature':feature.columns,'Coefficient':svm_linear.coef_[0]})
svm_feature = svm_feature.sort_values(by='Coefficient',ascending=False)
print(svm_feature)

             Feature   Coefficient
8           Diabetic  4.809777e-05
12     KidneyDisease  4.793828e-05
1            Smoking  3.171471e-05
6        DiffWalking  9.289793e-06
7                Sex  8.539540e-06
11            Asthma  8.706122e-07
10         SleepTime -4.298855e-06
13        SkinCancer -4.869591e-06
4     PhysicalHealth -9.804496e-06
3             Stroke -1.499507e-05
0                BMI -1.885034e-05
5       MentalHealth -3.387512e-05
9   PhysicalActivity -4.368841e-05
2    AlcoholDrinking -5.142829e-05


In [17]:
# Deprecated
svm_rbf = svm.SVC(kernel='rbf', random_state=0)
svm_rbf.fit(X_train, y_train)
feature_importance = permutation_importance(svm_rbf, X_test, y_test, scoring='accuracy', random_state=0)
rbf_feature = pd.DataFrame({'Feature':feature.columns,'Importance':feature_importance.importances_mean})
rbf_feature = rbf_feature.sort_values(by='Importance',ascending=False)
print(rbf_feature)

             Feature  Importance
8           Diabetic     0.00075
9   PhysicalActivity     0.00075
10         SleepTime     0.00050
11            Asthma     0.00050
0                BMI     0.00050
1            Smoking     0.00050
12     KidneyDisease     0.00025
5       MentalHealth     0.00000
4     PhysicalHealth     0.00000
2    AlcoholDrinking     0.00000
6        DiffWalking     0.00000
7                Sex     0.00000
13        SkinCancer    -0.00050
3             Stroke    -0.00150


In [16]:
# Deprecated
classifiers = {
    'Logistic Regression': lr,
    'kNN': knn,
    'Decision Tree': tree,
    'SVM (Linear)': svm_linear,
    'SVM (RBF)': svm_rbf
}
for clf_name, clf in classifiers.items():
    print(f'{clf_name}:')
    print(f'Training accuracy is {clf.score(X_train, y_train):.3}.')
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    with np.printoptions(precision=2):
        print(f'Cross-validation scores={scores}, mean={np.mean(scores):.3}')
    print(f'Test accuracy is {clf.score(X_test, y_test):.3}.\n')

NameError: name 'knn' is not defined

Current model = SVC(), Best score for this model = 0.922, Parameters = {'C': 0.01, 'kernel': 'linear'}
Current model = LogisticRegression(), Best score for this model = 0.928, Parameters = {'C': 0.01, 'max_iter': 5000}
Current model = DecisionTreeClassifier(), Best score for this model = 0.922, Parameters = {'criterion': 'entropy', 'max_depth': 3}
Current model = KNeighborsClassifier(), Best score for this model = 0.919, Parameters = {'n_neighbors': 4}

Best index = 1, Best score = 0.928, Best classifier = GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.01, 1, 100], 'max_iter': [5000, 10000]}), Parameters = {'C': 0.01, 'max_iter': 5000}


In [38]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([295,  25]))

In [31]:
clf = best_clf
y_hat = clf.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_hat):.3}, Precision: {precision_score(y_test, y_hat):.3}, Recall: {recall_score(y_test, y_hat):.3}, AUC: {roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]):.3}')

Accuracy: 0.931, Precision: 0.8, Recall: 0.16, AUC: 0.841
