In [45]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [46]:
df = pd.read_csv('data/heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [47]:
print(df.isnull().sum())
print('\nCount of null values', df.duplicated().sum())
print('\nData type of features\n', df.dtypes)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

Count of null values 0

Data type of features
 Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object


In [48]:
le = LabelEncoder()
print('Sex values',df.Sex.unique())
df.Sex = le.fit_transform(df['Sex'])
print('ChestPainType values',df['ChestPainType'].unique())
df.ChestPainType = le.fit_transform(df['ChestPainType'])
print('RestingECG values',df['RestingECG'].unique())
df.RestingECG = le.fit_transform(df['RestingECG'])
print('ExerciseAngina values',df['ExerciseAngina'].unique())
df.ExerciseAngina = le.fit_transform(df['ExerciseAngina'])
print('ST_Slope values',df['ST_Slope'].unique())
df.ST_Slope = le.fit_transform(df['ST_Slope'])
print('\nData type of features\n', df.dtypes)

Sex values ['M' 'F']
ChestPainType values ['ATA' 'NAP' 'ASY' 'TA']
RestingECG values ['Normal' 'ST' 'LVH']
ExerciseAngina values ['N' 'Y']
ST_Slope values ['Up' 'Flat' 'Down']

Data type of features
 Age                 int64
Sex                 int32
ChestPainType       int32
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG          int32
MaxHR               int64
ExerciseAngina      int32
Oldpeak           float64
ST_Slope            int32
HeartDisease        int64
dtype: object


In [49]:
X = df.drop(columns='HeartDisease')
y = df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=.3)
sc = StandardScaler()
X_train_ = sc.fit_transform(X_train)
X_test_ = sc.transform(X_test)

In [50]:
lrpram = {
    'C' : [0.01,0.1,1,10],
    'penalty': ['l2'],
    'solver':['lbfgs','liblinear'],
    'max_iter': [1000]
}

lr = GridSearchCV(LogisticRegression(), lrpram, cv = 5, scoring='accuracy')
lr.fit(X_train_, y_train)
lrpred = lr.best_estimator_.predict(X_test_)
print('Logistic Regression best paramater', lr.best_params_)
print('Logistic Regression best score', lr.best_score_)
print('\nclassificatin report:\n', classification_report(y_test,lrpred))

Logistic Regression best paramater {'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression best score 0.8411337209302324

classificatin report:
               precision    recall  f1-score   support

           0       0.82      0.89      0.85       112
           1       0.92      0.87      0.89       164

    accuracy                           0.88       276
   macro avg       0.87      0.88      0.87       276
weighted avg       0.88      0.88      0.88       276



In [51]:
knnpram = {
    'n_neighbors':list(range(3,21)),
    'weights':['uniform','distance'],
    'metric':['euclidean', 'manhattan']
}
knn = GridSearchCV(KNeighborsClassifier(),knnpram,cv=5,scoring='accuracy')
knn.fit(X_train_,y_train)
knnpred = knn.best_estimator_.predict(X_test_)
print('KNN best paramater', knn.best_params_)
print('KNN best score', knn.best_score_)
print('\nconfusion matrix:\n', confusion_matrix(y_test,knnpred))
print('\nclassificatin report:\n', classification_report(y_test,knnpred))

KNN best paramater {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'distance'}
KNN best score 0.8707243217054262

confusion matrix:
 [[101  11]
 [ 15 149]]

classificatin report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.89       112
           1       0.93      0.91      0.92       164

    accuracy                           0.91       276
   macro avg       0.90      0.91      0.90       276
weighted avg       0.91      0.91      0.91       276



In [52]:
nb = GaussianNB()
nb.fit(X_train_,y_train)
nbpred = nb.predict(X_test_)
print('NB score', accuracy_score(y_test, nbpred))
print('\nconfusion matrix:\n', confusion_matrix(y_test,nbpred))
print('\nclassificatin report:\n', classification_report(y_test,nbpred))

NB score 0.8731884057971014

confusion matrix:
 [[ 98  14]
 [ 21 143]]

classificatin report:
               precision    recall  f1-score   support

           0       0.82      0.88      0.85       112
           1       0.91      0.87      0.89       164

    accuracy                           0.87       276
   macro avg       0.87      0.87      0.87       276
weighted avg       0.88      0.87      0.87       276



In [56]:
rfpram = {
    'n_estimators':[100,200],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf = GridSearchCV(RandomForestClassifier(random_state=42),rfpram,cv=5,scoring='accuracy')
rf.fit(X_train,y_train)
rfpred = rf.best_estimator_.predict(X_test)
print('RF best paramater', rf.best_params_)
print('RF best score', rf.best_score_)
print('\nconfusion matrix:\n', confusion_matrix(y_test,rfpred))
print('\nclassificatin report:\n', classification_report(y_test,rfpred))

RF best paramater {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
RF best score 0.8597504844961239

confusion matrix:
 [[ 97  15]
 [ 18 146]]

classificatin report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85       112
           1       0.91      0.89      0.90       164

    accuracy                           0.88       276
   macro avg       0.88      0.88      0.88       276
weighted avg       0.88      0.88      0.88       276



In [62]:
from sklearn.metrics import roc_auc_score

print('Logistic Regression AUC:', roc_auc_score(y_test, lr.best_estimator_.predict_proba(X_test_)[:,1]))
print('KNN AUC:', roc_auc_score(y_test, knn.best_estimator_.predict_proba(X_test_)[:,1]))
print('Naive Bayes AUC:', roc_auc_score(y_test, nb.predict_proba(X_test_)[:,1]))
print('Random Forest AUC:', roc_auc_score(y_test, rf.best_estimator_.predict_proba(X_test)[:,1]))


Logistic Regression AUC: 0.9280814459930313
KNN AUC: 0.9492051393728224
Naive Bayes AUC: 0.9329812717770035
Random Forest AUC: 0.946945775261324
