# Modeling
Build, train and tune ML models to predict heart disease using preprocessed data

In [1]:
# Load libraries
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the saved train/test splits from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# files should only ever come from a trusted source.
def _read_split(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


x_train = _read_split("split_data/x_train.pkl")
y_train = _read_split("split_data/y_train.pkl")
x_test = _read_split("split_data/x_test.pkl")
y_test = _read_split("split_data/y_test.pkl")

---
## Dealing with Class Imbalance

Note that the dataset is imbalanced: most entries belong to the "Absent" class for heart disease status. This imbalance must be addressed because it can bias the model toward predicting the majority class. Oversampling the minority class in the training set balances the classes and allows the model to better learn patterns associated with the minority class.

In [2]:
# Random oversampling of the "Present" rows so both classes have the same
# number of training examples. Only the training split is resampled.
#
# NOTE(review): sampling with replacement creates exact duplicate rows. Any
# k-fold split applied to the balanced set afterwards can place copies of the
# same row in both the training and validation folds, which makes
# cross-validation scores optimistic. Resampling inside each training fold
# avoids this.

# Put features and label side by side so rows are resampled together
training_data = pd.concat([x_train, y_train], axis=1)

# Row masks for each class
is_absent = training_data["heart_disease_status"] == "Absent"
is_present = training_data["heart_disease_status"] == "Present"
majority_class = training_data.loc[is_absent]
minority_class = training_data.loc[is_present]

# Draw (with replacement) as many minority rows as there are majority rows
minority_oversampled = minority_class.sample(
    n=len(majority_class),
    replace=True,
    random_state=1,
)

# Stack both classes, shuffle the rows, and discard the old index
training_data_balanced = (
    pd.concat([majority_class, minority_oversampled])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Split back into features and label
y_train_balanced = training_data_balanced["heart_disease_status"]
x_train_balanced = training_data_balanced.drop(columns=["heart_disease_status"])

# Class counts after oversampling
print("Balanced training set distribution:")
print(y_train_balanced.value_counts())

Balanced training set distribution:
heart_disease_status
Absent     6400
Present    6400
Name: count, dtype: int64


## Training and Comparing Models
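Cross-validation will be used to compare the models and evaluate their performance on the training data. **Caveat:** the minority class was oversampled *before* cross-validation, so copies of the same row can end up in both the training and validation folds. The scores below are therefore optimistic, particularly for models that can memorize individual rows (such as Random Forest), and should be confirmed on the untouched test set or by resampling inside each training fold.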

1. Logistic Regression

In [3]:
# Logistic regression baseline: features are standardized inside the pipeline,
# so the scaler is refit on the training portion of every CV fold.
pipe_lr = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000, random_state=1)),
    ]
)

# 5-fold cross-validated accuracy on the oversampled training set
scores_lr = cross_val_score(
    estimator=pipe_lr,
    X=x_train_balanced,
    y=y_train_balanced,
    scoring="accuracy",
    cv=5,
)
print(f"Logistic Regression CV Accuracy: {scores_lr.mean():.4f}")


Logistic Regression CV Accuracy: 0.5084


2. Random Forest

In [4]:
# Random forest baseline. The pipeline has a single "model" step (no scaler),
# keeping the same step name as the other pipelines.
pipe_rf = Pipeline(
    steps=[
        ("model", RandomForestClassifier(random_state=1)),
    ]
)

# 5-fold cross-validated accuracy on the oversampled training set.
# NOTE(review): the training set contains duplicated minority rows, so a very
# high score here may partly reflect the same rows appearing in both the
# training and validation folds — confirm on held-out data.
scores_rf = cross_val_score(
    estimator=pipe_rf,
    X=x_train_balanced,
    y=y_train_balanced,
    scoring="accuracy",
    cv=5,
)
print(f"Random Forest CV Accuracy: {scores_rf.mean():.4f}")

Random Forest CV Accuracy: 0.9798


3. Support Vector Machine (SVC)

In [5]:
# Support vector classifier baseline on standardized features.
# probability=True enables predict_proba on the fitted model; it is not needed
# for the accuracy scoring below and makes each fit slower.
pipe_svc = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", SVC(probability=True, random_state=1)),
    ]
)

# 5-fold cross-validated accuracy on the oversampled training set
scores_svc = cross_val_score(
    estimator=pipe_svc,
    X=x_train_balanced,
    y=y_train_balanced,
    scoring="accuracy",
    cv=5,
)
print(f"SVM CV Accuracy: {scores_svc.mean():.4f}")

SVM CV Accuracy: 0.7039


4. K-Nearest Neighbours (KNN)

In [6]:
# K-nearest-neighbours baseline (k = 5) on standardized features; the scaler
# lives inside the pipeline so it is refit on the training portion of each
# CV fold. KNeighborsClassifier is imported in the notebook's imports cell
# alongside the other estimators.
pipe_knn = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier(n_neighbors=5))
])

# 5-fold cross-validated accuracy on the oversampled training set
scores_knn = cross_val_score(pipe_knn, x_train_balanced, y_train_balanced, cv=5, scoring="accuracy")
print(f"KNN CV Accuracy: {scores_knn.mean():.4f}")

KNN CV Accuracy: 0.6945


✅ 1. Modeling Notebook
Purpose: Build, train, and tune machine learning models.

What to include:

Load preprocessed & split data (e.g., from a .pkl file)

Oversample training data

Train multiple models (e.g., Random Forest, Logistic Regression)

Use cross-validation

Hyperparameter tuning (GridSearchCV, RandomizedSearchCV)

Select the best model based on metrics

Make predictions on test data

Save the best model