In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
# Relative location of the heart-disease CSV (resolved against the working directory)
heart_csv_path = "../dataset/heart.csv"

# Read the raw table into `df`; later cells encode and scale this frame in place
df = pd.read_csv(heart_csv_path)

In [11]:
# Baseline classifiers: a k-NN with default settings and a decision tree whose
# random_state is pinned. The same two estimators are instantiated again in a
# later cell before they are first fitted.
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)

In [19]:
# Integer-encode every object-dtype column of `df` in place.
# `encoders` keeps the fitted LabelEncoder for each encoded column, keyed by
# column name.
categorical_columns = df.select_dtypes(include=['object']).columns

# Fit one encoder per column on the still-unencoded values...
encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_columns}

# ...then overwrite each column with its integer codes.
for name, fitted_encoder in encoders.items():
    df[name] = fitted_encoder.transform(df[name])

In [20]:
# Gather every int64/float64 column. NOTE(review): nothing excludes the
# 'HeartDisease' target here, and a later cell prints two standardized float
# values for y_train, so the label column is being standardized together with
# the features.
numeric_dtypes = ['int64', 'float64']
numerical_columns = df.select_dtypes(include=numeric_dtypes).columns

# Standardizing scaler (MinMaxScaler() would be the alternative)
scaler = StandardScaler()

# Overwrite the selected columns of `df` with their scaled values
standardized_values = scaler.fit_transform(df[numerical_columns])
df[numerical_columns] = standardized_values

In [21]:

# Separate the label column from the remaining feature columns
target_column = 'HeartDisease'
y = df[target_column]
X = df.drop(columns=target_column)

# Rescale the features with a MinMaxScaler. This rebinds `scaler`, so the
# StandardScaler created in the previous cell is no longer reachable by that name.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so rows that end up in the test set influence the scaling of the training data.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)

In [30]:
# Show the distinct training-label values and their dtype, one per line
print(y_train.unique(), y_train.dtype, sep="\n")

[-1.11311472  0.89837999]
float64


In [31]:
# The label column reaches this point standardized: two float values, one below
# zero (negative class) and one above zero (positive class). Thresholding at
# zero restores integer 0/1 labels in a single step.
#
# Unlike truncating with astype(int) and then remapping {-1: 0, 0: 1}, this is
# safe to re-run: labels that are already 0/1 pass through unchanged, whereas
# re-applying the dictionary map would turn every 1 into NaN.
y_train = (y_train > 0).astype(int)
y_test = (y_test > 0).astype(int)

In [36]:
# Single (non-ensemble) baselines: a seeded decision tree and a default k-NN
dt, knn = DecisionTreeClassifier(random_state=42), KNeighborsClassifier()

In [43]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place, so the
        caller's instance is trained when this function returns.
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every metric.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1-Score'`` and
        ``'ROC-AUC'``. ``'ROC-AUC'`` is ``None`` only when the model offers
        neither ``predict_proba`` nor ``decision_function``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score rather than hard labels. Prefer the
    # second predict_proba column (assumes a binary problem); otherwise fall
    # back to decision_function so margin-only classifiers are still scored
    # instead of silently reporting None.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_score) if y_score is not None else None,
    }

# Candidate estimators, keyed by the label used as the row name in the results table
models = {
    'Single Decision Tree': dt,
    'Single KNN': knn
}

# Fit and score each candidate on the shared train/test split
results = {
    label: evaluate_model(estimator, X_train, X_test, y_train, y_test)
    for label, estimator in models.items()
}

# One row per model, one column per metric
results_df = pd.DataFrame(results).T
print(results_df)

                      Accuracy  Precision    Recall  F1-Score   ROC-AUC
Single Decision Tree  0.788043   0.861702  0.757009   0.80597  0.794089
Single KNN            0.858696   0.893204  0.859813   0.87619  0.913642


In [44]:
# `knn` was already fitted inside evaluate_model, so it can predict directly
y_pred = knn.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep="\n")

[[66 11]
 [15 92]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.84        77
           1       0.89      0.86      0.88       107

    accuracy                           0.86       184
   macro avg       0.85      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184

