In [149]:
import pandas as pd 

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, roc_auc_score

In [150]:
df = pd.read_csv("datasets/heart.csv")

In [151]:
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [152]:
X = df.drop(["HeartDisease"], axis=1)
y = df["HeartDisease"]

# Data Preprocessing

## Null treatment

Removing missing value rows and columns from our dataset is not the best option as it can result in significant information loss.

Imputation the process of substituting the missing values of our dataset. We can do this by defining our own customised function or we can simply perform imputation by using built-in methods such as the SimpleImputer class provided by sklearn. In this case, as we do not have any missing data we will not be using this approch.

## Duplicate treatment

Again, we have no duplicate values as we saw in the EDA phase.

## Feature Scaling

### Normalization and Standarization

Normalization is a scaling technique in which values are shifted and rescaled so that they end up ranging between 0 and 1. It is also known as Min-Max scaling.

![title](imgs/normalization.png)

Standardization is another scaling technique where the values are centered around the mean with a unit standard deviation. This means that the mean of the attribute becomes zero and the resultant distribution has a unit standard deviation.

![title](imgs/standarization.png)

* Normalization is good to use when you know that the distribution of your data does not follow a Gaussian distribution. This can be useful in algorithms that do not assume any distribution of the data like K-Nearest Neighbors and Neural Networks.
* Standardization, on the other hand, can be helpful in cases where the data follows a Gaussian distribution. However, this does not have to be necessarily true. Also, unlike normalization, standardization does not have a bounding range. So, even if you have outliers in your data, they will not be affected by standardization.

### Robust Scaling

When working with outliers we can use Robust Scaling for scakling our data. This method removes the median and scales the data in the range between 1st Quartile and 3rd Quartile. i.e., in between 25th and 75th quantile range (Interquartile Range).

* The “with_centering” argument controls whether the value is centered to zero (median is subtracted) and defaults to True.
* The “with_scaling” argument controls whether the value is scaled to the IQR (standard deviation set to one) or not and defaults to True.
* The definition of the scaling range can be specified via the “quantile_range” argument. It takes a tuple of two integers between 0 and 100 and defaults to the percentile values of the IQR, specifically (25, 75). Changing this will change the definition of outliers and the scope of the scaling.

In [153]:
numericalCols = X.select_dtypes("number").columns
categoricalCols = X.select_dtypes("object").columns
XNum = X[numericalCols]
XCat = X.drop(numericalCols, axis = 1)

In [154]:
scaler = preprocessing.RobustScaler()
robustNum = scaler.fit_transform(XNum)
robustNum = pd.DataFrame(robustNum, columns = numericalCols)
robustNum.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,-1.076923,0.5,0.704,0.0,0.944444,-0.4
1,-0.384615,1.5,-0.458667,0.0,0.5,0.266667
2,-1.307692,0.0,0.64,0.0,-1.111111,-0.4
3,-0.461538,0.4,-0.096,0.0,-0.833333,0.6
4,0.0,1.0,-0.298667,0.0,-0.444444,-0.4
5,-1.153846,-0.5,1.237333,0.0,0.888889,-0.4
6,-0.692308,0.0,0.149333,0.0,0.888889,-0.4
7,0.0,-1.0,-0.16,0.0,0.111111,-0.4
8,-1.307692,0.5,-0.170667,0.0,-0.222222,0.6
9,-0.461538,-0.5,0.650667,0.0,-0.5,-0.4


In [155]:
scaler = preprocessing.StandardScaler()
standardNum = scaler.fit_transform(XNum)
standardNum = pd.DataFrame(standardNum, columns = numericalCols)
standardNum.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,-1.43314,0.410909,0.82507,-0.551341,1.382928,-0.832432
1,-0.478484,1.491752,-0.171961,-0.551341,0.754157,0.105664
2,-1.751359,-0.129513,0.770188,-0.551341,-1.525138,-0.832432
3,-0.584556,0.302825,0.13904,-0.551341,-1.132156,0.574711
4,0.051881,0.951331,-0.034755,-0.551341,-0.581981,-0.832432
5,-1.539213,-0.669935,1.282424,-0.551341,1.304332,-0.832432
6,-0.902775,-0.129513,0.349422,-0.551341,1.304332,-0.832432
7,0.051881,-1.210356,0.084157,-0.551341,0.203982,-0.832432
8,-1.751359,0.410909,0.07501,-0.551341,-0.267596,0.574711
9,-0.584556,-0.669935,0.779335,-0.551341,-0.660578,-0.832432


In [156]:
scaler = preprocessing.MinMaxScaler()
minMaxNum = scaler.fit_transform(XNum)
minMaxNum = pd.DataFrame(minMaxNum, columns = numericalCols)
minMaxNum.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,0.244898,0.7,0.47927,0.0,0.788732,0.295455
1,0.428571,0.8,0.298507,0.0,0.676056,0.409091
2,0.183673,0.65,0.46932,0.0,0.267606,0.295455
3,0.408163,0.69,0.354892,0.0,0.338028,0.465909
4,0.530612,0.75,0.323383,0.0,0.43662,0.295455
5,0.22449,0.6,0.562189,0.0,0.774648,0.295455
6,0.346939,0.65,0.393035,0.0,0.774648,0.295455
7,0.530612,0.55,0.344942,0.0,0.577465,0.295455
8,0.183673,0.7,0.343284,0.0,0.492958,0.465909
9,0.408163,0.6,0.470978,0.0,0.422535,0.295455


## Handling Categorical Variables

Categorical variables/features are any feature type can be classified into two major types:
* Nominal: No order associated (i.e. gender).
* Ordinal: They have "levels" or categories with a particular order associated.

We can use Label Encoding (which works best for not-tree based Machine Learning Algorithms) or Label Encoding (better for Tree based Machine Learning Algorithms).

In [157]:
dfLE = df[categoricalCols].apply(LabelEncoder().fit_transform)
dfLE.head()

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,1,1,1,0,2
1,0,2,1,0,1
2,1,1,2,0,2
3,0,0,1,1,1
4,1,2,1,0,2


In [158]:
dfOHE=pd.get_dummies(df[categoricalCols],columns=categoricalCols,drop_first=False)
dfOHE.head()

Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [159]:
dfRobust = pd.concat([robustNum,dfOHE], axis = 1)
dfMinMax = pd.concat([minMaxNum,dfOHE], axis = 1)
dfStandard = pd.concat([standardNum,dfOHE], axis = 1)

In [160]:
dfRobust.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,-1.076923,0.5,0.704,0.0,0.944444,-0.4,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,-0.384615,1.5,-0.458667,0.0,0.5,0.266667,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,-1.307692,0.0,0.64,0.0,-1.111111,-0.4,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,-0.461538,0.4,-0.096,0.0,-0.833333,0.6,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0.0,1.0,-0.298667,0.0,-0.444444,-0.4,0,1,0,0,1,0,0,1,0,1,0,0,0,1
5,-1.153846,-0.5,1.237333,0.0,0.888889,-0.4,0,1,0,0,1,0,0,1,0,1,0,0,0,1
6,-0.692308,0.0,0.149333,0.0,0.888889,-0.4,1,0,0,1,0,0,0,1,0,1,0,0,0,1
7,0.0,-1.0,-0.16,0.0,0.111111,-0.4,0,1,0,1,0,0,0,1,0,1,0,0,0,1
8,-1.307692,0.5,-0.170667,0.0,-0.222222,0.6,0,1,1,0,0,0,0,1,0,0,1,0,1,0
9,-0.461538,-0.5,0.650667,0.0,-0.5,-0.4,1,0,0,1,0,0,0,1,0,1,0,0,0,1


In [161]:
dfMinMax.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.244898,0.7,0.47927,0.0,0.788732,0.295455,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,0.428571,0.8,0.298507,0.0,0.676056,0.409091,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,0.183673,0.65,0.46932,0.0,0.267606,0.295455,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,0.408163,0.69,0.354892,0.0,0.338028,0.465909,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0.530612,0.75,0.323383,0.0,0.43662,0.295455,0,1,0,0,1,0,0,1,0,1,0,0,0,1
5,0.22449,0.6,0.562189,0.0,0.774648,0.295455,0,1,0,0,1,0,0,1,0,1,0,0,0,1
6,0.346939,0.65,0.393035,0.0,0.774648,0.295455,1,0,0,1,0,0,0,1,0,1,0,0,0,1
7,0.530612,0.55,0.344942,0.0,0.577465,0.295455,0,1,0,1,0,0,0,1,0,1,0,0,0,1
8,0.183673,0.7,0.343284,0.0,0.492958,0.465909,0,1,1,0,0,0,0,1,0,0,1,0,1,0
9,0.408163,0.6,0.470978,0.0,0.422535,0.295455,1,0,0,1,0,0,0,1,0,1,0,0,0,1


In [162]:
dfStandard.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,0.410909,0.82507,-0.551341,1.382928,-0.832432,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,-0.478484,1.491752,-0.171961,-0.551341,0.754157,0.105664,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,-1.751359,-0.129513,0.770188,-0.551341,-1.525138,-0.832432,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,-0.584556,0.302825,0.13904,-0.551341,-1.132156,0.574711,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,0.051881,0.951331,-0.034755,-0.551341,-0.581981,-0.832432,0,1,0,0,1,0,0,1,0,1,0,0,0,1
5,-1.539213,-0.669935,1.282424,-0.551341,1.304332,-0.832432,0,1,0,0,1,0,0,1,0,1,0,0,0,1
6,-0.902775,-0.129513,0.349422,-0.551341,1.304332,-0.832432,1,0,0,1,0,0,0,1,0,1,0,0,0,1
7,0.051881,-1.210356,0.084157,-0.551341,0.203982,-0.832432,0,1,0,1,0,0,0,1,0,1,0,0,0,1
8,-1.751359,0.410909,0.07501,-0.551341,-0.267596,0.574711,0,1,1,0,0,0,0,1,0,0,1,0,1,0
9,-0.584556,-0.669935,0.779335,-0.551341,-0.660578,-0.832432,1,0,0,1,0,0,0,1,0,1,0,0,0,1


In [163]:
colsOHE=dfRobust.columns.to_list()
colsOHE

['Age',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'MaxHR',
 'Oldpeak',
 'Sex_F',
 'Sex_M',
 'ChestPainType_ASY',
 'ChestPainType_ATA',
 'ChestPainType_NAP',
 'ChestPainType_TA',
 'RestingECG_LVH',
 'RestingECG_Normal',
 'RestingECG_ST',
 'ExerciseAngina_N',
 'ExerciseAngina_Y',
 'ST_Slope_Down',
 'ST_Slope_Flat',
 'ST_Slope_Up']

In [164]:
dfRobust = pd.concat([dfRobust,df["HeartDisease"]], axis = 1)
dfMinMax = pd.concat([dfMinMax,df["HeartDisease"]], axis = 1)
dfStandard = pd.concat([dfStandard,df["HeartDisease"]], axis = 1)

# Logistic Regression

In [165]:
dfList = [dfRobust, dfMinMax, dfStandard]
dfListNonNeg = [dfStandard, dfMinMax]

In [166]:
kf=StratifiedKFold(n_splits=5)
for df in dfList:
    for fold , (trn_,val_) in enumerate(kf.split(X=df,y=y)):
        X_train=df.loc[trn_,colsOHE]
        y_train=df.loc[trn_,"HeartDisease"]
        
        X_valid=df.loc[val_,colsOHE]
        y_valid=df.loc[val_,"HeartDisease"]
        
        clf=LogisticRegression()
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_valid)
        print(classification_report(y_valid,y_pred))
        print(f"The AUC for Fold {fold} : {roc_auc_score(y_valid,y_pred)}")

              precision    recall  f1-score   support

           0       0.79      0.93      0.85        82
           1       0.93      0.80      0.86       102

    accuracy                           0.86       184
   macro avg       0.86      0.87      0.86       184
weighted avg       0.87      0.86      0.86       184

The AUC for Fold 0 : 0.865375418460067
              precision    recall  f1-score   support

           0       0.77      0.90      0.83        82
           1       0.91      0.78      0.84       102

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.85      0.84      0.84       184

The AUC for Fold 1 : 0.8433763749402199
              precision    recall  f1-score   support

           0       0.96      0.63      0.76        82
           1       0.77      0.98      0.86       102

    accuracy                           0.83       184
   macro avg       0.87      0.81      0.81      

# Naive-Bayes

In [167]:
kf=StratifiedKFold(n_splits=5)
for df in dfList:
    for fold , (trn_,val_) in enumerate(kf.split(X=df,y=y)):
        X_train=df.loc[trn_,colsOHE]
        y_train=df.loc[trn_,"HeartDisease"]
        
        X_valid=df.loc[val_,colsOHE]
        y_valid=df.loc[val_,"HeartDisease"]
        
        clf=GaussianNB()
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_valid)
        print(classification_report(y_valid,y_pred))
        print(f"The AUC for Fold {fold} : {roc_auc_score(y_valid,y_pred)}")

              precision    recall  f1-score   support

           0       0.78      0.93      0.85        82
           1       0.93      0.79      0.86       102

    accuracy                           0.85       184
   macro avg       0.86      0.86      0.85       184
weighted avg       0.87      0.85      0.85       184

The AUC for Fold 0 : 0.8604734576757532
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        82
           1       0.93      0.85      0.89       102

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184

The AUC for Fold 1 : 0.8837876614060258
              precision    recall  f1-score   support

           0       0.96      0.63      0.76        82
           1       0.77      0.98      0.86       102

    accuracy                           0.83       184
   macro avg       0.87      0.81      0.81     

# Multinomial Naive-Bayes

MinMax Scaler (Normalization) is the only one with no negative values. Therefore, we only use this dataframe.

In [168]:
for fold , (trn_,val_) in enumerate(kf.split(X=dfMinMax,y=y)):
    X_train=dfMinMax.loc[trn_,colsOHE]
    y_train=dfMinMax.loc[trn_,"HeartDisease"]
        
    X_valid=dfMinMax.loc[val_,colsOHE]
    y_valid=dfMinMax.loc[val_,"HeartDisease"]
        
    clf=MultinomialNB()
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_valid)
    print(classification_report(y_valid,y_pred))
    print(f"The AUC for Fold {fold} : {roc_auc_score(y_valid,y_pred)}")

              precision    recall  f1-score   support

           0       0.87      0.93      0.90        82
           1       0.94      0.89      0.91       102

    accuracy                           0.91       184
   macro avg       0.91      0.91      0.91       184
weighted avg       0.91      0.91      0.91       184

The AUC for Fold 0 : 0.9094930655188904
              precision    recall  f1-score   support

           0       0.78      0.90      0.84        82
           1       0.91      0.79      0.85       102

    accuracy                           0.84       184
   macro avg       0.84      0.85      0.84       184
weighted avg       0.85      0.84      0.84       184

The AUC for Fold 1 : 0.8482783357245336
              precision    recall  f1-score   support

           0       0.92      0.70      0.79        82
           1       0.80      0.95      0.87       102

    accuracy                           0.84       184
   macro avg       0.86      0.82      0.83     

# SVM

## Support Vector Machine with a Linear Kernel

In [169]:
kf=StratifiedKFold(n_splits=5)
for df in dfList:
    for fold , (trn_,val_) in enumerate(kf.split(X=df,y=y)):

        X_train=df.loc[trn_,colsOHE]
        y_train=df.loc[trn_,"HeartDisease"]
        
        X_valid=df.loc[val_,colsOHE]
        y_valid=df.loc[val_,"HeartDisease"]
        
        clf=SVC(kernel="linear")
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_valid)
        print(classification_report(y_valid,y_pred))
        print(f"The AUC for Fold {fold} : {roc_auc_score(y_valid,y_pred)}")

              precision    recall  f1-score   support

           0       0.81      0.91      0.86        82
           1       0.92      0.82      0.87       102

    accuracy                           0.86       184
   macro avg       0.86      0.87      0.86       184
weighted avg       0.87      0.86      0.86       184

The AUC for Fold 0 : 0.8690817790530847
              precision    recall  f1-score   support

           0       0.77      0.91      0.84        82
           1       0.92      0.78      0.85       102

    accuracy                           0.84       184
   macro avg       0.85      0.85      0.84       184
weighted avg       0.85      0.84      0.84       184

The AUC for Fold 1 : 0.8494739359158296
              precision    recall  f1-score   support

           0       0.96      0.67      0.79        82
           1       0.79      0.98      0.87       102

    accuracy                           0.84       184
   macro avg       0.88      0.83      0.83     

## Support Vector Machine with a Sigmoid Kernel

In [170]:
kf=StratifiedKFold(n_splits=5)
for df in dfList:
    for fold , (trn_,val_) in enumerate(kf.split(X=df,y=y)):

        X_train=df.loc[trn_,colsOHE]
        y_train=df.loc[trn_,"HeartDisease"]
        
        X_valid=df.loc[val_,colsOHE]
        y_valid=df.loc[val_,"HeartDisease"]
        
        clf=SVC(kernel="sigmoid")
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_valid)
        print(classification_report(y_valid,y_pred))
        print(f"The AUC for Fold {fold} : {roc_auc_score(y_valid,y_pred)}")

              precision    recall  f1-score   support

           0       0.70      0.83      0.76        82
           1       0.84      0.72      0.77       102

    accuracy                           0.77       184
   macro avg       0.77      0.77      0.77       184
weighted avg       0.78      0.77      0.77       184

The AUC for Fold 0 : 0.7724772835963655
              precision    recall  f1-score   support

           0       0.54      0.88      0.67        82
           1       0.80      0.40      0.54       102

    accuracy                           0.61       184
   macro avg       0.67      0.64      0.60       184
weighted avg       0.69      0.61      0.60       184

The AUC for Fold 1 : 0.6400047824007652
              precision    recall  f1-score   support

           0       0.79      0.55      0.65        82
           1       0.71      0.88      0.79       102

    accuracy                           0.73       184
   macro avg       0.75      0.72      0.72     

## Support Vector Machine with a RBF Kernel

In [171]:
kf=StratifiedKFold(n_splits=5)
for df in dfList:
    for fold , (trn_,val_) in enumerate(kf.split(X=df,y=y)):

        X_train=df.loc[trn_,colsOHE]
        y_train=df.loc[trn_,"HeartDisease"]
        
        X_valid=df.loc[val_,colsOHE]
        y_valid=df.loc[val_,"HeartDisease"]
        
        clf=SVC(kernel="rbf")
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_valid)
        print(classification_report(y_valid,y_pred))
        print(f"The AUC for Fold {fold} : {roc_auc_score(y_valid,y_pred)}")

              precision    recall  f1-score   support

           0       0.78      0.93      0.85        82
           1       0.93      0.79      0.86       102

    accuracy                           0.85       184
   macro avg       0.86      0.86      0.85       184
weighted avg       0.87      0.85      0.85       184

The AUC for Fold 0 : 0.8604734576757532
              precision    recall  f1-score   support

           0       0.81      0.90      0.86        82
           1       0.91      0.83      0.87       102

    accuracy                           0.86       184
   macro avg       0.86      0.87      0.86       184
weighted avg       0.87      0.86      0.86       184

The AUC for Fold 1 : 0.8678861788617888
              precision    recall  f1-score   support

           0       0.98      0.63      0.77        82
           1       0.77      0.99      0.87       102

    accuracy                           0.83       184
   macro avg       0.88      0.81      0.82     

# KNN

K-nearest neighbors (k-NN) is a pattern recognition algorithm that uses training datasets to find the k closest relatives in future examples. When k-NN is used in classification, you calculate to place data within the category of its nearest neighbor.

The optimal K value usually found is the square root of N, where N is the total number of samples

In [172]:
kf=StratifiedKFold(n_splits=5)
for df in dfList:
    for fold , (trn_,val_) in enumerate(kf.split(X=df,y=y)):

        X_train=df.loc[trn_,colsOHE]
        y_train=df.loc[trn_,"HeartDisease"]
        
        X_valid=df.loc[val_,colsOHE]
        y_valid=df.loc[val_,"HeartDisease"]
        
        clf=KNeighborsClassifier(n_neighbors=32)
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_valid)
        print(classification_report(y_valid,y_pred))
        print(f"The AUC for Fold {fold} : {roc_auc_score(y_valid,y_pred)}")

              precision    recall  f1-score   support

           0       0.74      0.91      0.82        82
           1       0.91      0.74      0.82       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.83      0.82      0.82       184

The AUC for Fold 0 : 0.8249641319942611
              precision    recall  f1-score   support

           0       0.86      0.90      0.88        82
           1       0.92      0.88      0.90       102

    accuracy                           0.89       184
   macro avg       0.89      0.89      0.89       184
weighted avg       0.89      0.89      0.89       184

The AUC for Fold 1 : 0.8923959827833573
              precision    recall  f1-score   support

           0       0.98      0.67      0.80        82
           1       0.79      0.99      0.88       102

    accuracy                           0.85       184
   macro avg       0.89      0.83      0.84     