In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder



Data exploration

In [60]:

df = pd.read_csv('heart.csv')
df = pd.DataFrame(df)
df.head()



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [62]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [63]:
df.dtypes

Unnamed: 0,0
age,int64
sex,int64
cp,int64
trestbps,int64
chol,int64
fbs,int64
restecg,int64
thalach,int64
exang,int64
oldpeak,float64


In [64]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [66]:
# Handle categorical features (if any)
categorical_features = df.select_dtypes(include=['object']).columns
if len(categorical_features) > 0:
    print("\nCategorical Features:", categorical_features)
    df = pd.get_dummies(df, columns=categorical_features, drop_first=True)    #onehot encoding



In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [52]:
# Separate features and target variable
X = df.drop('target', axis=1)
y = df['target']



In [53]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



In [54]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)



In [55]:
# Initialize the classifiers
clf1 = DecisionTreeClassifier(random_state=42)
clf2 = LogisticRegression(random_state=42, solver='liblinear')
clf3 = KNeighborsClassifier(n_neighbors=5)

# Create the ensemble classifier
eclf1 = VotingClassifier(estimators=[('dt', clf1), ('lr', clf2), ('knn', clf3)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
y_pred_ensemble = eclf1.predict(X_test)

# Evaluate the ensemble model
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f'\nEnsemble Model Accuracy: {accuracy_ensemble:.4f}')
print("\nEnsemble Model Classification Report:")
print(classification_report(y_test, y_pred_ensemble))



Ensemble Model Accuracy: 0.9091

Ensemble Model Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       159
           1       0.90      0.92      0.91       149

    accuracy                           0.91       308
   macro avg       0.91      0.91      0.91       308
weighted avg       0.91      0.91      0.91       308



In [56]:

# Evaluate individual models


# Decision Tree
clf1.fit(X_train,y_train)
y_pred_dt = clf1.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f'\nDecision Tree Accuracy: {accuracy_dt:.4f}')
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))



Decision Tree Accuracy: 0.9708

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       159
           1       1.00      0.94      0.97       149

    accuracy                           0.97       308
   macro avg       0.97      0.97      0.97       308
weighted avg       0.97      0.97      0.97       308



In [57]:

# Logistic Regression
clf2.fit(X_train,y_train)
y_pred_lr = clf2.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f'\nLogistic Regression Accuracy: {accuracy_lr:.4f}')
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))




Logistic Regression Accuracy: 0.8052

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.75      0.80       159
           1       0.76      0.87      0.81       149

    accuracy                           0.81       308
   macro avg       0.81      0.81      0.80       308
weighted avg       0.81      0.81      0.80       308



In [58]:
# KNN
clf3.fit(X_train,y_train)
y_pred_knn = clf3.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'\nKNN Accuracy: {accuracy_knn:.4f}')
print("\nKNN Classification Report:")
print(classification_report(y_test, y_pred_knn))


KNN Accuracy: 0.8571

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       159
           1       0.84      0.87      0.85       149

    accuracy                           0.86       308
   macro avg       0.86      0.86      0.86       308
weighted avg       0.86      0.86      0.86       308

