In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
import plotly.express as px
import numpy as np
import optuna
import matplotlib.pyplot as plt
import warnings 
import seaborn as sns

# Dataset 3 Modeling

In [2]:
df = pd.read_csv('../data/processed/heart_dataset_3_processed.csv')
display(df.head(), df.shape, df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 52 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   age                                154 non-null    float64
 1   sex                                154 non-null    int64  
 2   locality                           154 non-null    int64  
 3   marital_status                     154 non-null    int64  
 4   sleep                              154 non-null    int64  
 5   depression                         154 non-null    int64  
 6   hyperlipi                          154 non-null    int64  
 7   smoking                            154 non-null    int64  
 8   family_history                     154 non-null    int64  
 9   f_history                          154 non-null    int64  
 10  diabetes                           154 non-null    int64  
 11  hypertension                       154 non-null    int64  

Unnamed: 0,age,sex,locality,marital_status,sleep,depression,hyperlipi,smoking,family_history,f_history,...,max_hr,exercise_angina,oldpeak,st_slope,ca,thalassemia,heart_disease_severity,streptokinase_therapy,reaction,HeartDisease
0,-1.195961,0,1,1,0,1,1,0,0,0,...,-0.177879,1,1.133205,2,0,7,2,1,0,0
1,-0.529856,0,0,1,0,1,1,0,0,0,...,0.089373,1,-0.255497,2,0,7,2,1,0,0
2,-0.085786,0,1,1,1,1,1,0,0,0,...,-1.024179,1,1.441806,2,0,3,2,1,0,0
3,-0.085786,0,1,1,1,1,1,0,0,0,...,-0.445132,1,0.361704,2,1,7,3,1,0,0
4,0.025231,0,1,1,0,1,1,0,0,0,...,-0.311506,1,1.904706,3,2,7,3,1,0,0


(154, 52)

None

In [3]:
px.imshow(df.corr(),title="Correlation Plot")

### Split the data

In [None]:
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Start with a simple Logsitic Regression Model

In [None]:
k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_accuracy_log = []

for fold, (train_index, test_index) in enumerate(k_folds.split(X_train, y_train)):
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(X_train.iloc[train_index], y_train.iloc[train_index])

    X_valid = X_train.iloc[test_index]
    y_valid = y_train.iloc[test_index]
    y_pred = classifier.predict(X_valid)

    print(f"\n📂 Fold {fold + 1}")
    print(classification_report(y_valid, y_pred))

    acc = roc_auc_score(y_valid, y_pred)
    roc_accuracy_log.append(acc)
    print(f"ROC AUC Score for Fold {fold + 1}: {acc:.4f}")

print("\nCross-Validation Summary:")
print(f"Mean ROC AUC: {np.mean(roc_accuracy_log):.4f}")
print(f"Standard Deviation: {np.std(roc_accuracy_log):.4f}")


📂 Fold 1
              precision    recall  f1-score   support

           0       0.86      0.90      0.88        21
           1       0.33      0.25      0.29         4

    accuracy                           0.80        25
   macro avg       0.60      0.58      0.58        25
weighted avg       0.78      0.80      0.79        25

ROC AUC Score for Fold 1: 0.5774

📂 Fold 2
              precision    recall  f1-score   support

           0       0.76      0.80      0.78        20
           1       0.00      0.00      0.00         5

    accuracy                           0.64        25
   macro avg       0.38      0.40      0.39        25
weighted avg       0.61      0.64      0.62        25

ROC AUC Score for Fold 2: 0.4000

📂 Fold 3
              precision    recall  f1-score   support

           0       0.86      0.90      0.88        20
           1       0.50      0.40      0.44         5

    accuracy                           0.80        25
   macro avg       0.68      0.6

In [6]:
print(y_train.value_counts())
print(y_train.value_counts(normalize=True))

HeartDisease
0    101
1     22
Name: count, dtype: int64
HeartDisease
0    0.821138
1    0.178862
Name: proportion, dtype: float64


### From the code above we can see that there is a significant imbalance and thats why the ROC Score is barely above 0.5
To fix use `class_weight=balanced` in updated Logistic Regression Model

In [8]:
k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_log = []

for fold, (train_index, test_index) in enumerate(k_folds.split(X_train, y_train)):
    classifier = LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced'  # 🔧 key fix
    )

    classifier.fit(X_train.iloc[train_index], y_train.iloc[train_index])

    X_valid = X_train.iloc[test_index]
    y_valid = y_train.iloc[test_index]
    y_pred = classifier.predict(X_valid)

    print(f"\n📂 Fold {fold + 1}")
    print(classification_report(y_valid, y_pred))

    auc = roc_auc_score(y_valid, y_pred)
    roc_auc_log.append(auc)
    print(f"ROC AUC Score for Fold {fold + 1}: {auc:.4f}")

print("\nCross-Validation Summary:")
print(f"Mean ROC AUC: {np.mean(roc_auc_log):.4f}")
print(f"Standard Deviation: {np.std(roc_auc_log):.4f}")


📂 Fold 1
              precision    recall  f1-score   support

           0       0.89      0.81      0.85        21
           1       0.33      0.50      0.40         4

    accuracy                           0.76        25
   macro avg       0.61      0.65      0.62        25
weighted avg       0.80      0.76      0.78        25

ROC AUC Score for Fold 1: 0.6548

📂 Fold 2
              precision    recall  f1-score   support

           0       0.72      0.65      0.68        20
           1       0.00      0.00      0.00         5

    accuracy                           0.52        25
   macro avg       0.36      0.33      0.34        25
weighted avg       0.58      0.52      0.55        25

ROC AUC Score for Fold 2: 0.3250

📂 Fold 3
              precision    recall  f1-score   support

           0       0.89      0.80      0.84        20
           1       0.43      0.60      0.50         5

    accuracy                           0.76        25
   macro avg       0.66      0.7

### We still have a very weak model even with `class_weight="balanced"` so lets try Random Forest Model

In [None]:
k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_log = []

for fold, (train_idx, val_idx) in enumerate(k_folds.split(X_train, y_train)):
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        class_weight='balanced',  # Handle class imbalance
        random_state=42
    )

    model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_valid = y_train.iloc[val_idx]
    y_pred = model.predict(X_train.iloc[val_idx])

    print(f"\n📂 Fold {fold + 1}")
    print(classification_report(y_valid, y_pred, zero_division=0))

    auc = roc_auc_score(y_valid, y_pred)
    roc_auc_log.append(auc)
    print(f"ROC AUC Score for Fold {fold + 1}: {auc:.4f}")

print("\nCross-Validation Summary:")
print(f"Mean ROC AUC: {np.mean(roc_auc_log):.4f}")
print(f"Standard Deviation: {np.std(roc_auc_log):.4f}")


📂 Fold 1
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        21
           1       0.00      0.00      0.00         4

    accuracy                           0.84        25
   macro avg       0.42      0.50      0.46        25
weighted avg       0.71      0.84      0.77        25

ROC AUC Score for Fold 1: 0.5000

📂 Fold 2
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        20
           1       0.00      0.00      0.00         5

    accuracy                           0.80        25
   macro avg       0.40      0.50      0.44        25
weighted avg       0.64      0.80      0.71        25

ROC AUC Score for Fold 2: 0.5000

📂 Fold 3
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        20
           1       0.00      0.00      0.00         5

    accuracy                           0.80        25
   macro avg       0.40      0.5

### Since this model is weaker than Logistic Regression let's go back to it and implement SMOTE include oversampling inside each fold to improve the class imbalance.

In [22]:
warnings.filterwarnings("ignore", category=FutureWarning)

k_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_log = []

for fold, (train_idx, val_idx) in enumerate(k_folds.split(X_train, y_train)):
    # Split fold
    X_fold_train, y_fold_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_fold_val, y_fold_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Apply SMOTE to training fold
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_fold_train, y_fold_train)

    # Train Logistic Regression
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict and Evaluate
    y_pred = model.predict(X_fold_val)

    print(f"\nFold {fold + 1}")
    print(classification_report(y_fold_val, y_pred, zero_division=0))

    auc = roc_auc_score(y_fold_val, y_pred)
    roc_auc_log.append(auc)
    print(f"ROC AUC Score for Fold {fold + 1}: {auc:.4f}")

print("\nCross-Validation Summary:")
print(f"Mean ROC AUC: {np.mean(roc_auc_log):.4f}")
print(f"Standard Deviation: {np.std(roc_auc_log):.4f}")


Fold 1
              precision    recall  f1-score   support

           0       0.83      0.90      0.86        21
           1       0.00      0.00      0.00         4

    accuracy                           0.76        25
   macro avg       0.41      0.45      0.43        25
weighted avg       0.69      0.76      0.73        25

ROC AUC Score for Fold 1: 0.4524

Fold 2
              precision    recall  f1-score   support

           0       0.76      0.80      0.78        20
           1       0.00      0.00      0.00         5

    accuracy                           0.64        25
   macro avg       0.38      0.40      0.39        25
weighted avg       0.61      0.64      0.62        25

ROC AUC Score for Fold 2: 0.4000

Fold 3
              precision    recall  f1-score   support

           0       0.89      0.80      0.84        20
           1       0.43      0.60      0.50         5

    accuracy                           0.76        25
   macro avg       0.66      0.70     

### Results above improved a bit but not enough -- let's move onto tuning with Optuna

In [17]:
def objective(trial):
    C = trial.suggest_float("C", 1e-3, 10, log=True)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in kfold.split(X_train, y_train):
        # Split
        X_fold_train, y_fold_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_fold_val, y_fold_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # Oversample
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_fold_train, y_fold_train)

        # Model
        model = LogisticRegression(C=C, max_iter=1000, random_state=42)
        model.fit(X_resampled, y_resampled)

        # Predict
        y_pred = model.predict(X_fold_val)
        auc = roc_auc_score(y_fold_val, y_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores)


In [23]:
warnings.filterwarnings("ignore", category=FutureWarning)
study_name = "accuracy_dataset3"
storage_name = f"sqlite:///../optuna_database/{study_name}.db"

study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    storage=storage_name,
    load_if_exists=True
)
study.optimize(objective, n_trials=50)

print("Best ROC AUC:", study.best_value)
print("Best Parameters:", study.best_params)

[I 2025-05-02 15:54:51,291] Using an existing study with name 'accuracy_dataset3' instead of creating a new one.
[I 2025-05-02 15:54:51,377] Trial 100 finished with value: 0.6107142857142858 and parameters: {'C': 0.028403007320456666}. Best is trial 12 with value: 0.6207142857142858.
[I 2025-05-02 15:54:51,438] Trial 101 finished with value: 0.6207142857142858 and parameters: {'C': 0.07904102905948462}. Best is trial 12 with value: 0.6207142857142858.
[I 2025-05-02 15:54:51,501] Trial 102 finished with value: 0.6207142857142858 and parameters: {'C': 0.07540472129887091}. Best is trial 12 with value: 0.6207142857142858.
[I 2025-05-02 15:54:51,565] Trial 103 finished with value: 0.6004761904761905 and parameters: {'C': 0.11518722241725257}. Best is trial 12 with value: 0.6207142857142858.
[I 2025-05-02 15:54:51,634] Trial 104 finished with value: 0.6207142857142858 and parameters: {'C': 0.06566170070986842}. Best is trial 12 with value: 0.6207142857142858.
[I 2025-05-02 15:54:51,699] Tri

Best ROC AUC: 0.6207142857142858
Best Parameters: {'C': 0.06415279349625574}


### After tuning with Optuna, we got a significant increase to 0.62. To further improve ROC AUC scores let's try feature engineering

In [None]:
# age and smoking
X_train['age_smoke'] = X_train['age'] * X_train['smoking']
X_test['age_smoke'] = X_test['age'] * X_test['smoking']

# diabetes and blood glucose
X_train['diabetes_glucose'] = X_train['diabetes'] * X_train['blood_glucose_random']
X_test['diabetes_glucose'] = X_test['diabetes'] * X_test['blood_glucose_random']

# hypertension and blood pressure
X_train['hyper_bp'] = X_train['hypertension'] * X_train['blood_pressure']
X_test['hyper_bp'] = X_test['hypertension'] * X_test['blood_pressure']

# age and family history
X_train['age_family'] = X_train['age'] * X_train['family_history']
X_test['age_family'] = X_test['age'] * X_test['family_history']

# cholesterol and exercise angina
X_train['chol_angina'] = X_train['cholestorl'] * X_train['exercise_angina']
X_test['chol_angina'] = X_test['cholestorl'] * X_test['exercise_angina']


smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

model = LogisticRegression(C=0.06415279349625574, max_iter=1000, random_state=42)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)[:, 1]

print("Test Set Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_probs))
print("Classification Report:\n", classification_report(y_test, y_pred))

Test Set Evaluation:
Accuracy: 0.6129032258064516
ROC AUC: 0.6076923076923078
Log Loss: 0.573794863562109
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.62      0.73        26
           1       0.23      0.60      0.33         5

    accuracy                           0.61        31
   macro avg       0.56      0.61      0.53        31
weighted avg       0.78      0.61      0.66        31



In [31]:
def objective(trial):
    # 🔧 Tune regularization strength (log scale)
    C = trial.suggest_float("C", 1e-3, 10, log=True)

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in kf.split(X_train, y_train):
        X_fold_train = X_train.iloc[train_idx].copy()
        y_fold_train = y_train.iloc[train_idx].copy()
        X_fold_val = X_train.iloc[val_idx].copy()
        y_fold_val = y_train.iloc[val_idx].copy()

        # Recreate interaction features inside each fold
        X_fold_train['age_smoke'] = X_fold_train['age'] * X_fold_train['smoking']
        X_fold_train['diabetes_glucose'] = X_fold_train['diabetes'] * X_fold_train['blood_glucose_random']
        X_fold_train['hyper_bp'] = X_fold_train['hypertension'] * X_fold_train['blood_pressure']
        X_fold_train['age_family'] = X_fold_train['age'] * X_fold_train['family_history']
        X_fold_train['chol_angina'] = X_fold_train['cholestorl'] * X_fold_train['exercise_angina']

        X_fold_val['age_smoke'] = X_fold_val['age'] * X_fold_val['smoking']
        X_fold_val['diabetes_glucose'] = X_fold_val['diabetes'] * X_fold_val['blood_glucose_random']
        X_fold_val['hyper_bp'] = X_fold_val['hypertension'] * X_fold_val['blood_pressure']
        X_fold_val['age_family'] = X_fold_val['age'] * X_fold_val['family_history']
        X_fold_val['chol_angina'] = X_fold_val['cholestorl'] * X_fold_val['exercise_angina']

        # Apply SMOTE to balanced training fold
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_fold_train, y_fold_train)

        # Train model
        model = LogisticRegression(C=C, max_iter=1000, random_state=42)
        model.fit(X_resampled, y_resampled)

        # Evaluate on validation fold
        y_pred = model.predict(X_fold_val)
        auc = roc_auc_score(y_fold_val, y_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores)

In [32]:
study_name = "logreg_smote_interactions"
storage_name = f"sqlite:///../optuna_database/{study_name}.db"

study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    storage=storage_name,
    load_if_exists=True
)
study.optimize(objective, n_trials=50)

print("Best ROC AUC:", study.best_value)
print("Best Parameters:", study.best_params)

[I 2025-05-02 16:11:03,024] A new study created in RDB with name: logreg_smote_interactions
[I 2025-05-02 16:11:03,102] Trial 0 finished with value: 0.5957142857142858 and parameters: {'C': 0.040210239262555056}. Best is trial 0 with value: 0.5957142857142858.
[I 2025-05-02 16:11:03,173] Trial 1 finished with value: 0.5707142857142857 and parameters: {'C': 0.0496977795266348}. Best is trial 0 with value: 0.5957142857142858.
[I 2025-05-02 16:11:03,234] Trial 2 finished with value: 0.5804761904761906 and parameters: {'C': 0.16795096030701176}. Best is trial 0 with value: 0.5957142857142858.
[I 2025-05-02 16:11:03,290] Trial 3 finished with value: 0.5709523809523809 and parameters: {'C': 0.003954775639528052}. Best is trial 0 with value: 0.5957142857142858.
[I 2025-05-02 16:11:03,367] Trial 4 finished with value: 0.6004761904761905 and parameters: {'C': 0.3306933267612637}. Best is trial 4 with value: 0.6004761904761905.
[I 2025-05-02 16:11:03,446] Trial 5 finished with value: 0.580476190

Best ROC AUC: 0.6007142857142858
Best Parameters: {'C': 0.6533295716735373}


### Try model with l1 penalty

In [None]:
# SMOTE again (same as before)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# L1 Logistic Regression model for feature selection
l1_model = LogisticRegression(
    C=0.6533,  # use best C from Optuna or re-tune for L1 if needed
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
    random_state=42
)

# Fit the model
l1_model.fit(X_resampled, y_resampled)

# Select important features
selector = SelectFromModel(l1_model, prefit=True)
X_train_l1 = selector.transform(X_train)
X_test_l1 = selector.transform(X_test)

print(f"Selected {X_train_l1.shape[1]} features out of {X_train.shape[1]}")

model = LogisticRegression(
    C=0.6533,
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
    random_state=42
)

# Train again on the reduced feature set
smote = SMOTE(random_state=42)
X_res_l1, y_res_l1 = smote.fit_resample(X_train_l1, y_train)
model.fit(X_res_l1, y_res_l1)

# Evaluate
y_pred = model.predict(X_test_l1)
y_probs = model.predict_proba(X_test_l1)[:, 1]

print("L1 Test Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_probs))
print("Classification Report:\n", classification_report(y_test, y_pred))

Selected 28 features out of 56
L1 Test Evaluation:
Accuracy: 0.6774193548387096
ROC AUC: 0.726923076923077
Log Loss: 0.8431390202035934
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.65      0.77        26
           1       0.31      0.80      0.44         5

    accuracy                           0.68        31
   macro avg       0.63      0.73      0.61        31
weighted avg       0.84      0.68      0.72        31




X has feature names, but SelectFromModel was fitted without feature names


X has feature names, but SelectFromModel was fitted without feature names

