In [1]:
#import libraries
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Load and prepare data

# Read the raw dataset from disk.
df = pd.read_csv("heart_disease.csv")

# Columns used as model inputs; the label lives in the 'target' column.
feature_columns = [
    'age', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
x = df.loc[:, feature_columns]
y = df.loc[:, 'target']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
x_training, x_testing, y_training, y_testing = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so the test rows never influence it.
scaler = StandardScaler()
scaler.fit(x_training)
x_training_scaled = scaler.transform(x_training)
x_testing_scaled = scaler.transform(x_testing)


In [3]:
# Train and evaluate models

# Fit the three candidate classifiers. Logistic regression and k-NN are
# trained on the standardized features; Gaussian Naive Bayes is trained on
# the unscaled features.
lr = LogisticRegression(max_iter=1000).fit(x_training_scaled, y_training)
gnb = GaussianNB().fit(x_training, y_training)
knn = KNeighborsClassifier(n_neighbors=5).fit(x_training_scaled, y_training)

# Predict on the test split, giving each model the same feature
# representation it was trained on.
lr_preds = lr.predict(x_testing_scaled)
gnb_preds = gnb.predict(x_testing)
knn_preds = knn.predict(x_testing_scaled)

# Fraction of test rows each model labelled correctly.
lr_acc, gnb_acc, knn_acc = (
    accuracy_score(y_testing, preds)
    for preds in (lr_preds, gnb_preds, knn_preds)
)

# Report the accuracies.
for label, score in (
    ("Logistic Regression Accuracy:", lr_acc),
    ("Gaussian Naive Bayes Accuracy:", gnb_acc),
    ("K-Nearest Neighbors Accuracy:", knn_acc),
):
    print(label, score)


Logistic Regression Accuracy: 0.8338762214983714
Gaussian Naive Bayes Accuracy: 0.7850162866449512
K-Nearest Neighbors Accuracy: 0.8306188925081434


In [4]:
# Choose the best classifier

# Each candidate is (accuracy, fitted model, test features in the
# representation that model was trained on).
# The list is in priority order: max() returns the first entry among equal
# scores, so ties resolve to logistic regression, then Naive Bayes, then k-NN.
candidates = [
    (lr_acc, lr, x_testing_scaled),
    (gnb_acc, gnb, x_testing),
    (knn_acc, knn, x_testing_scaled),
]

best_acc, best_classifier, x_testing_final = max(
    candidates, key=lambda candidate: candidate[0]
)


In [5]:
# Use the selected classifier to predict labels for a random sample of rows

# NOTE: the rows are drawn from x_testing_final, i.e. the test split; they
# carry no labels here, but they are not new data for the chosen model's
# evaluation.

N_SAMPLES = 20      # number of rows to label
SAMPLE_SEED = 42    # fixed seed so Restart & Run All prints the same sample

# Pick row positions instead of converting the data to a list:
# x_testing_final is a NumPy array when a scaled model was selected, but a
# pandas DataFrame when GaussianNB was selected, and DataFrame has no
# .tolist() method.
rng = np.random.default_rng(SAMPLE_SEED)
sample_positions = rng.permutation(len(x_testing_final))[:N_SAMPLES]

if hasattr(x_testing_final, "iloc"):
    # Keep the DataFrame (and its column names) so the model receives input
    # in the same form it was fitted on.
    unseen_data = x_testing_final.iloc[sample_positions]
else:
    unseen_data = np.asarray(x_testing_final)[sample_positions]

# Predict the whole sample in one call rather than one row at a time.
predictions = best_classifier.predict(unseen_data)

for prediction in predictions:
    if prediction == 0:
        print("The patient is healthy")
    else:
        print("The patient is likely to have a heart disease")


The patient is likely to have a heart disease
The patient is healthy
The patient is healthy
The patient is likely to have a heart disease
The patient is likely to have a heart disease
The patient is likely to have a heart disease
The patient is likely to have a heart disease
The patient is healthy
The patient is healthy
The patient is healthy
The patient is healthy
The patient is likely to have a heart disease
The patient is healthy
The patient is likely to have a heart disease
The patient is likely to have a heart disease
The patient is likely to have a heart disease
The patient is healthy
The patient is healthy
The patient is likely to have a heart disease
The patient is likely to have a heart disease
