In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings("ignore")

In [5]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [9]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [11]:
df.duplicated().sum()

1

In [13]:
df.drop_duplicates(inplace= True)

In [15]:
df[162:168]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
162,41,1,1,120,157,0,1,182,0,0.0,2,0,2,1
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1
165,67,1,0,160,286,0,0,108,1,1.5,1,3,2,0
166,67,1,0,120,229,0,0,129,1,2.6,1,2,3,0
167,62,0,0,140,268,0,0,160,0,3.6,0,2,2,0
168,63,1,0,130,254,0,0,147,0,1.4,1,1,3,0


In [17]:
df.reset_index(drop=True,inplace=True)
df.duplicated().sum()

0

In [19]:
df[['cp', 'thal', 'restecg', 'slope', 'ca']] = df[['cp', 'thal', 'restecg', 'slope', 'ca']].astype(object)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    object 
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    object 
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    object 
 11  ca        302 non-null    object 
 12  thal      302 non-null    object 
 13  target    302 non-null    int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 33.2+ KB


In [21]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown = 'ignore')
col_list = ['cp', 'thal', 'restecg', 'slope', 'ca']

ohe.fit(df[col_list])
transformed = ohe.transform(df[col_list]).toarray()
transformed_df = pd.DataFrame(data = transformed, columns=ohe.get_feature_names_out(col_list))
transformed_df.head()

Unnamed: 0,cp_0,cp_1,cp_2,cp_3,thal_0,thal_1,thal_2,thal_3,restecg_0,restecg_1,restecg_2,slope_0,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    object 
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    object 
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    object 
 11  ca        302 non-null    object 
 12  thal      302 non-null    object 
 13  target    302 non-null    int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 33.2+ KB


In [29]:
df_dropped = df.drop(col_list,axis=1)
df_dropped.isnull().sum()

age         0
sex         0
trestbps    0
chol        0
fbs         0
thalach     0
exang       0
oldpeak     0
target      0
dtype: int64

In [31]:
df_final = pd.concat([df_dropped, transformed_df], axis=1)
df_final.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target,cp_0,...,restecg_1,restecg_2,slope_0,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4
0,63,1,145,233,1,150,0,2.3,1,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,37,1,130,250,0,187,0,3.5,1,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,41,0,130,204,0,172,0,1.4,1,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,56,1,120,236,0,178,0,0.8,1,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,57,0,120,354,0,163,1,0.6,1,1.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [33]:
df_final.duplicated().sum()

0

In [35]:
X = df_final.drop('target', axis=1)
y = df_final['target']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2, random_state= 101, shuffle= True, stratify=y)

In [37]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((241, 27), (61, 27), (241,), (61,))

In [39]:
from sklearn.preprocessing import StandardScaler
ss= StandardScaler()
ss.fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

In [41]:
#Evaluation
models_list = ["Logistic Regression", "KNN"]
accuracy_list, cv_mean_list, precision_list, recall_list, aucScore_list, f1_score_list = [], [], [], [], [], []

In [43]:
#Logistic Reg
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=10000, random_state=42)
model_LR = model.fit(X_train_scaled, y_train)
y_pred_log_reg = model_LR.predict( X_test_scaled)

In [45]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score, recall_score, precision_score
cm_log_reg = confusion_matrix(y_pred=y_pred_log_reg, y_true= y_test)

print("Confusion matrix:\n", cm_log_reg)

accuarcy_log_reg = accuracy_score(y_test, y_pred_log_reg)
precision_log_reg = precision_score(y_test, y_pred_log_reg)
recall_log_reg = recall_score(y_test, y_pred_log_reg)
f1_score_log_reg = f1_score(y_test, y_pred_log_reg)

accuracy_list.append(accuarcy_log_reg)
precision_list.append(precision_log_reg)
recall_list.append(recall_log_reg)
f1_score_list.append(f1_score_log_reg)


Confusion matrix:
 [[25  3]
 [ 6 27]]


In [47]:
print("\nAccuracy score:", accuarcy_log_reg,
      "\nPrecision score:", precision_log_reg,
      "\nRecall score:", recall_log_reg,
      "\nF1 score:",f1_score_log_reg)


Accuracy score: 0.8524590163934426 
Precision score: 0.9 
Recall score: 0.8181818181818182 
F1 score: 0.8571428571428571


In [49]:
from sklearn.metrics import  classification_report
print(cm_log_reg)
print(classification_report(y_test, y_pred_log_reg))

[[25  3]
 [ 6 27]]
              precision    recall  f1-score   support

           0       0.81      0.89      0.85        28
           1       0.90      0.82      0.86        33

    accuracy                           0.85        61
   macro avg       0.85      0.86      0.85        61
weighted avg       0.86      0.85      0.85        61



In [51]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=model_LR, X= X_train_scaled, y=y_train, cv=4, n_jobs=-1, verbose=2)
mean = accuracies.mean()
cv_mean_list.append(mean)
print("Cross val score mean:", mean)
print("Standart deviation of cross validation:", accuracies.std())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Cross val score mean: 0.846516393442623
Standart deviation of cross validation: 0.0132399181579302


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.1s finished


In [52]:
accuracies

array([0.83606557, 0.85      , 0.83333333, 0.86666667])

In [55]:
df["target"].value_counts()

target
1    164
0    138
Name: count, dtype: int64

In [57]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=7, n_jobs=-1, metric = 'euclidean')
model_KNN = model.fit(X_train_scaled, y_train)
y_pred_KNN = model_KNN.predict(X_test_scaled)

In [59]:
cm_KNN = confusion_matrix(y_pred=y_pred_KNN, y_true = y_test)
print("Confisuion matrix KNN:\n", cm_KNN)

accuracy_KNN = accuracy_score(y_test, y_pred_KNN)
precision_KNN = precision_score(y_test, y_pred_KNN)
recall_KNN = recall_score(y_test, y_pred_KNN)
f1_score_KNN = f1_score(y_test, y_pred_KNN)

accuracy_list.append(accuracy_KNN)
precision_list.append(precision_KNN)
recall_list.append(recall_KNN)
f1_score_list.append(f1_score_KNN)

print("\nAccuracy score:", accuracy_KNN,
      "\nPrecision score:", precision_KNN,
      "\nRecall score:", recall_KNN,
      "\nF1 score:",f1_score_KNN)

Confisuion matrix KNN:
 [[23  5]
 [ 4 29]]

Accuracy score: 0.8524590163934426 
Precision score: 0.8529411764705882 
Recall score: 0.8787878787878788 
F1 score: 0.8656716417910447


In [61]:
accuracies = cross_val_score(estimator=model_KNN, X=X_train_scaled, y=y_train, cv=4, n_jobs=-1, verbose=2)
mean= accuracies.mean()
cv_mean_list.append(mean)
print(" Mean cross validation:", mean)
print(" Standart dev", accuracies.std())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


 Mean cross validation: 0.8214480874316938
 Standart dev 0.019153473174679535


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.8s finished


In [63]:
accuracies

array([0.85245902, 0.81666667, 0.81666667, 0.8       ])

In [65]:
#df.sort_values(by=['accuracy', 'precision', 'recall', 'f1_score'], ascending=False).iloc[[0]]

In [67]:
print(models_list)

['Logistic Regression', 'KNN']


In [69]:
print(accuracy_list)

[0.8524590163934426, 0.8524590163934426]


In [71]:
print(precision_list)

[0.9, 0.8529411764705882]


In [73]:
print(recall_list)

[0.8181818181818182, 0.8787878787878788]


In [75]:
print(f1_score_list)

[0.8571428571428571, 0.8656716417910447]


In [77]:
print(cv_mean_list)

[0.846516393442623, 0.8214480874316938]


In [79]:
data = pd.DataFrame ( {
    'Model' : models_list,
    'Accuracy' : accuracy_list,
    'Precision': precision_list,
    ' Recall' : recall_list,
     'F1_score': f1_score_list,
     'CV mean' : cv_mean_list })
data

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_score,CV mean
0,Logistic Regression,0.852459,0.9,0.818182,0.857143,0.846516
1,KNN,0.852459,0.852941,0.878788,0.865672,0.821448
