In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb
import joblib
import sys
from pathlib import Path

# Add parent directory to path to import config
sys.path.append(str(Path().resolve().parent))
from config import DATA_DIR, MODELS_DIR

data_folder = str(DATA_DIR) + "\\"

In [38]:
# Load all datasets for supervised learning analysis
from config import (CLEANED_DATA_PATH, UNION_CLEANED_DATA_PATH, 
                    PCA_13_COMPONENTS_PATH, PCA_9_COMPONENTS_PATH, 
                    RF_SELECTED_DATA_PATH)

# Load PCA datasets
df_pca = pd.read_csv(PCA_13_COMPONENTS_PATH)
X_pca = df_pca.drop(columns=['target'])
y_pca = df_pca['target']

df_pca9 = pd.read_csv(PCA_9_COMPONENTS_PATH)
X_pca9 = df_pca9.drop(columns=['target'])
y_pca9 = df_pca9['target']

# Load feature selected datasets
df_union = pd.read_csv(UNION_CLEANED_DATA_PATH)
X_union = df_union.drop(columns=['target'])
y_union = df_union['target']

df_rf = pd.read_csv(RF_SELECTED_DATA_PATH)
X_rf = df_rf.drop(columns=['target'])
y_rf = df_rf['target']

# Create overlap dataset (intersection of union and rf features)
overlap_features = list(set(X_union.columns) & set(X_rf.columns))
df_overlap = df_union[overlap_features + ['target']]
X_overlap = df_overlap.drop(columns=['target'])
y_overlap = df_overlap['target']


# Create train/test splits for hyperparameter tuning (using RF dataset)
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
    X_rf, y_rf, test_size=0.2, stratify=y_rf, random_state=42
)

# Create train/test splits for hyperparameter tuning (using union dataset)
X_union_train, X_union_test, y_union_train, y_union_test = train_test_split(
    X_union, y_union, test_size=0.2, stratify=y_union, random_state=42
)

print("Datasets loaded:")
print(f"PCA 13 components: {X_pca.shape}")
print(f"PCA 9 components: {X_pca9.shape}")
print(f"Union features: {X_union.shape}")
print(f"RF features: {X_rf.shape}")
print(f"Overlap features: {X_overlap.shape}")
print(f"\nTrain/Test splits for hyperparameter tuning:")
print(f"X_union_train: {X_union_train.shape}")
print(f"X_union_test: {X_union_test.shape}")

Datasets loaded:
PCA 13 components: (720, 13)
PCA 9 components: (720, 9)
Union features: (720, 20)
RF features: (720, 15)
Overlap features: (720, 15)

Train/Test splits for hyperparameter tuning:
X_union_train: (576, 20)
X_union_test: (144, 20)


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

def evaluate_models(X, y, dataset_name):
    print(f"\n📊 Binary Classification Results for: {dataset_name}")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
        "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
        "Random Forest": RandomForestClassifier(class_weight='balanced'),
        "SVM": SVC(probability=True, class_weight='balanced'),
        "xgb": XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=1.0)
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]  # Probability of class 1

        print(f"\n🔸 {name}")
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("Precision:", precision_score(y_test, y_pred))
        print("Recall:   ", recall_score(y_test, y_pred))
        print("F1 Score: ", f1_score(y_test, y_pred))
        print("ROC AUC:  ", roc_auc_score(y_test, y_prob))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("\n ------- \n")


In [40]:
evaluate_models(X_pca, y_pca, "PCA (13 Components)")
print("\n \n ------------------------------------------ \n \n ")
evaluate_models(X_pca9, y_pca9, "PCA (9 Components)")
print("\n \n ------------------------------------------ \n \n ")
evaluate_models(X_union, y_union, "Random Forest Selected Features")
print("\n \n --------------------------------------- \n \n ")
evaluate_models(X_overlap, y_overlap, "Overlapped Features")
print("\n \n ------------------------------------- \n \n ")
evaluate_models(X_union, y_union, "Union Features")



📊 Binary Classification Results for: PCA (13 Components)

🔸 Logistic Regression
Accuracy:  0.8402777777777778
Precision: 0.8450704225352113
Recall:    0.8333333333333334
F1 Score:  0.8391608391608392
ROC AUC:   0.9070216049382717
Confusion Matrix:
 [[61 11]
 [12 60]]

 ------- 


🔸 Decision Tree
Accuracy:  0.75
Precision: 0.7432432432432432
Recall:    0.7638888888888888
F1 Score:  0.7534246575342466
ROC AUC:   0.75
Confusion Matrix:
 [[53 19]
 [17 55]]

 ------- 


🔸 Random Forest
Accuracy:  0.8194444444444444
Precision: 0.7948717948717948
Recall:    0.8611111111111112
F1 Score:  0.8266666666666667
ROC AUC:   0.8908179012345679
Confusion Matrix:
 [[56 16]
 [10 62]]

 ------- 


🔸 SVM
Accuracy:  0.8194444444444444
Precision: 0.8108108108108109
Recall:    0.8333333333333334
F1 Score:  0.821917808219178
ROC AUC:   0.9093364197530864
Confusion Matrix:
 [[58 14]
 [12 60]]

 ------- 


🔸 xgb
Accuracy:  0.8194444444444444
Precision: 0.7948717948717948
Recall:    0.8611111111111112
F1 Score: 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



🔸 Random Forest
Accuracy:  0.8125
Precision: 0.8
Recall:    0.8333333333333334
F1 Score:  0.8163265306122449
ROC AUC:   0.9026813271604939
Confusion Matrix:
 [[57 15]
 [12 60]]

 ------- 


🔸 SVM
Accuracy:  0.7986111111111112
Precision: 0.7945205479452054
Recall:    0.8055555555555556
F1 Score:  0.8
ROC AUC:   0.9176311728395061
Confusion Matrix:
 [[57 15]
 [14 58]]

 ------- 


🔸 xgb
Accuracy:  0.7847222222222222
Precision: 0.7808219178082192
Recall:    0.7916666666666666
F1 Score:  0.7862068965517242
ROC AUC:   0.8894675925925927
Confusion Matrix:
 [[56 16]
 [15 57]]

 ------- 


 
 ------------------------------------------ 
 
 

📊 Binary Classification Results for: Random Forest Selected Features

🔸 Logistic Regression
Accuracy:  0.8402777777777778
Precision: 0.8266666666666667
Recall:    0.8611111111111112
F1 Score:  0.8435374149659864
ROC AUC:   0.9164737654320987
Confusion Matrix:
 [[59 13]
 [10 62]]

 ------- 


🔸 Decision Tree
Accuracy:  0.7430555555555556
Precision: 0.733333

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



🔸 Random Forest
Accuracy:  0.8611111111111112
Precision: 0.8611111111111112
Recall:    0.8611111111111112
F1 Score:  0.8611111111111112
ROC AUC:   0.9209104938271605
Confusion Matrix:
 [[62 10]
 [10 62]]

 ------- 


🔸 SVM
Accuracy:  0.8055555555555556
Precision: 0.8055555555555556
Recall:    0.8055555555555556
F1 Score:  0.8055555555555556
ROC AUC:   0.9224537037037037
Confusion Matrix:
 [[58 14]
 [14 58]]

 ------- 


🔸 xgb
Accuracy:  0.7916666666666666
Precision: 0.7837837837837838
Recall:    0.8055555555555556
F1 Score:  0.7945205479452054
ROC AUC:   0.875
Confusion Matrix:
 [[56 16]
 [14 58]]

 ------- 


 
 --------------------------------------- 
 
 

📊 Binary Classification Results for: Overlapped Features

🔸 Logistic Regression
Accuracy:  0.8402777777777778
Precision: 0.8450704225352113
Recall:    0.8333333333333334
F1 Score:  0.8391608391608392
ROC AUC:   0.9031635802469136
Confusion Matrix:
 [[61 11]
 [12 60]]

 ------- 


🔸 Decision Tree
Accuracy:  0.7638888888888888
Preci

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



🔸 Random Forest
Accuracy:  0.8541666666666666
Precision: 0.8493150684931506
Recall:    0.8611111111111112
F1 Score:  0.8551724137931035
ROC AUC:   0.9070216049382716
Confusion Matrix:
 [[61 11]
 [10 62]]

 ------- 


🔸 SVM
Accuracy:  0.8333333333333334
Precision: 0.8333333333333334
Recall:    0.8333333333333334
F1 Score:  0.8333333333333334
ROC AUC:   0.9054783950617283
Confusion Matrix:
 [[60 12]
 [12 60]]

 ------- 


🔸 xgb
Accuracy:  0.7777777777777778
Precision: 0.7777777777777778
Recall:    0.7777777777777778
F1 Score:  0.7777777777777778
ROC AUC:   0.8661265432098766
Confusion Matrix:
 [[56 16]
 [16 56]]

 ------- 


 
 ------------------------------------- 
 
 

📊 Binary Classification Results for: Union Features

🔸 Logistic Regression
Accuracy:  0.8402777777777778
Precision: 0.8266666666666667
Recall:    0.8611111111111112
F1 Score:  0.8435374149659864
ROC AUC:   0.9164737654320987
Confusion Matrix:
 [[59 13]
 [10 62]]

 ------- 


🔸 Decision Tree
Accuracy:  0.7291666666666666

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



🔸 Random Forest
Accuracy:  0.8680555555555556
Precision: 0.8732394366197183
Recall:    0.8611111111111112
F1 Score:  0.8671328671328671
ROC AUC:   0.9201388888888891
Confusion Matrix:
 [[63  9]
 [10 62]]

 ------- 


🔸 SVM
Accuracy:  0.8055555555555556
Precision: 0.8055555555555556
Recall:    0.8055555555555556
F1 Score:  0.8055555555555556
ROC AUC:   0.9224537037037037
Confusion Matrix:
 [[58 14]
 [14 58]]

 ------- 


🔸 xgb
Accuracy:  0.7916666666666666
Precision: 0.7837837837837838
Recall:    0.8055555555555556
F1 Score:  0.7945205479452054
ROC AUC:   0.875
Confusion Matrix:
 [[56 16]
 [14 58]]

 ------- 



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [41]:
# Hyperparameter Tuning for SVM, RF, XGB on the union Selected Features 
# Note: Using the train/test splits created above (X_union_train, X_union_test)

print("Starting hyperparameter tuning...")
print(f"Training on: {X_union_train.shape}")
print(f"Testing on: {X_union_test.shape}")
print(f"Features used: {list(X_union_train.columns)}")

Starting hyperparameter tuning...
Training on: (576, 20)
Testing on: (144, 20)
Features used: ['sex', 'slope_1.7017543859649122', 'age', 'exang', 'oldpeak', 'fbs', 'ca_2.0', 'thalach', 'trestbps', 'ca_0.6807817589576547', 'ca_1.0', 'ca_3.0', 'thal_7.0', 'chol', 'thal_4.985', 'cp_4.0', 'cp_2.0', 'slope_2.0', 'restecg_0.5682451253481894', 'cp_3.0']


In [42]:
logreg_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2'],  # saga supports l1/l2; lbfgs only l2
    'solver': ['saga', 'lbfgs'],
    'multi_class': ['ovr']
}

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

logreg = LogisticRegression(class_weight='balanced', max_iter=1000)

logreg_search = GridSearchCV(logreg, logreg_grid, scoring=make_scorer(f1_score), cv=5, verbose=1, n_jobs=-1)
logreg_search.fit(X_union_train, y_union_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




0,1,2
,estimator,LogisticRegre...max_iter=1000)
,param_grid,"{'C': [0.001, 0.01, ...], 'multi_class': ['ovr'], 'penalty': ['l2'], 'solver': ['saga', 'lbfgs']}"
,scoring,make_scorer(f...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,1000


In [43]:
svm = SVC(class_weight='balanced', probability=True)

svm_grid = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

svm_search = GridSearchCV(svm, svm_grid, scoring=make_scorer(f1_score), cv=5, verbose=1, n_jobs=-1)
svm_search.fit(X_union_train, y_union_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


0,1,2
,estimator,SVC(class_wei...bability=True)
,param_grid,"{'C': [0.01, 0.1, ...], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf', ...]}"
,scoring,make_scorer(f...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [47]:
rf = RandomForestClassifier(class_weight='balanced')

rf_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 3, 4]
}

rf_search = GridSearchCV(rf, rf_grid, scoring=make_scorer(f1_score), cv=5, verbose=1, n_jobs=-1)
# rf_search.fit(X_rf_train, y_rf_train)
rf_search.fit(X_union_train, y_union_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


0,1,2
,estimator,RandomForestC...ht='balanced')
,param_grid,"{'max_depth': [None, 5, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5], 'n_estimators': [50, 100, ...]}"
,scoring,make_scorer(f...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [48]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'scale_pos_weight': [1]  # can test >1 if class 1 is rare
}

grid = GridSearchCV(
    xgb,
    param_grid,
    scoring=make_scorer(f1_score),
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_union_train, y_union_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'colsample_bytree': [0.8, 1], 'learning_rate': [0.01, 0.1, ...], 'max_depth': [3, 5, ...], 'n_estimators': [50, 100, ...], ...}"
,scoring,make_scorer(f...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [49]:
from sklearn.metrics import classification_report, roc_auc_score

# Logistic Regression
print("🔍 Best Logistic Regression:", logreg_search.best_params_)
best_logreg = logreg_search.best_estimator_
y_pred = best_logreg.predict(X_union_test)
y_prob = best_logreg.predict_proba(X_union_test)[:, 1]
print("\n📊 Logistic Regression Classification Report:\n", classification_report(y_union_test, y_pred))
print("Logistic Regression ROC AUC:", roc_auc_score(y_union_test, y_prob))

# SVM
print("🔍 Best SVM:", svm_search.best_params_)
best_svm = svm_search.best_estimator_
y_pred_svm = best_svm.predict(X_union_test)
y_prob_svm = best_svm.predict_proba(X_union_test)[:, 1]
print("\n📊 SVM Classification Report:\n", classification_report(y_union_test, y_pred_svm))
print("🎯 SVM ROC AUC:", roc_auc_score(y_union_test, y_prob_svm))

# Random Forest
print("🔍 Best Random Forest:", rf_search.best_params_)
best_rf = rf_search.best_estimator_
y_pred_rf = best_rf.predict(X_union_test)
y_prob_rf = best_rf.predict_proba(X_union_test)[:, 1]
print("\n📊 RF Classification Report:\n", classification_report(y_union_test, y_pred_rf))
print("🎯 RF ROC AUC:", roc_auc_score(y_union_test, y_prob_rf))


# xgb
print("🔍 Best XGBoost Parameters:", grid.best_params_)
best_xgb = grid.best_estimator_
y_pred = best_xgb.predict(X_union_test)
y_prob = best_xgb.predict_proba(X_union_test)[:, 1]
print("\n📊 XGBoost Classification Report:\n", classification_report(y_union_test, y_pred))
print("🎯 XGBoost ROC AUC:", roc_auc_score(y_union_test, y_prob))




🔍 Best Logistic Regression: {'C': 1, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'saga'}

📊 Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.82      0.84        72
           1       0.83      0.86      0.84        72

    accuracy                           0.84       144
   macro avg       0.84      0.84      0.84       144
weighted avg       0.84      0.84      0.84       144

Logistic Regression ROC AUC: 0.9162808641975309
🔍 Best SVM: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

📊 SVM Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82        72
           1       0.82      0.82      0.82        72

    accuracy                           0.82       144
   macro avg       0.82      0.82      0.82       144
weighted avg       0.82      0.82      0.82       144

🎯 SVM ROC AUC: 0.9239969135802468
🔍 Best Random Forest: {'max_depth': 1

In [54]:
from sklearn.ensemble import VotingClassifier

# Use your tuned models
voting_clf = VotingClassifier(
    estimators=[
        ('logreg', LogisticRegression(
            C=1, penalty='l2', solver='saga', multi_class='ovr',
            class_weight='balanced', max_iter=1000)),
        
        # ('svm', SVC(
        #     C=10, gamma='scale', kernel='linear',
        #     class_weight='balanced', probability=True)),
        
        ('rf', RandomForestClassifier(
            n_estimators=100, max_depth=10, min_samples_split=2,
            min_samples_leaf=4, class_weight='balanced')),
        
        # ('xgb', XGBClassifier(
        #    colsample_bytree=0.8, learning_rate=0.2, max_depth=3, 
        #    n_estimators=50, scale_pos_weight=1, subsample=0.8))
    ],
    voting='soft'
)


In [70]:
# Fit on the same train data (RF-selected features)
voting_clf.fit(X_union_train, y_union_train)

# Predictions
from sklearn.metrics import classification_report, roc_auc_score

y_pred_vote = voting_clf.predict(X_union_test)
y_prob_vote = voting_clf.predict_proba(X_union_test)[:, 1]

print("\n📊 Voting Classifier Classification Report:\n", classification_report(y_union_test, y_pred_vote))
print("🎯 Voting Classifier ROC AUC:", roc_auc_score(y_union_test, y_prob_vote))
print("\n confusion matrix: \n", confusion_matrix(y_union_test, y_pred_vote))

import joblib

# Save model using relative path
model_path = MODELS_DIR / "final_voting_model9.pkl"
joblib.dump(voting_clf, model_path)
print(f"✅ Model saved as {model_path}")


📊 Voting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.83      0.85        72
           1       0.84      0.86      0.85        72

    accuracy                           0.85       144
   macro avg       0.85      0.85      0.85       144
weighted avg       0.85      0.85      0.85       144

🎯 Voting Classifier ROC AUC: 0.919945987654321

 confusion matrix: 
 [[60 12]
 [10 62]]
✅ Model saved as G:\Courses\Sprints_ML\Project\models\final_voting_model9.pkl




In [71]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(voting_clf, X_union, y_union, cv=5, scoring='accuracy')
print("Cross-val accuracy:", scores.mean())




Cross-val accuracy: 0.8166666666666667




In [72]:
# Save expected features to multiple locations
features_list = X_union_train.columns.tolist()

# Save to notebooks directory (for local use)
notebooks_features_path = Path().resolve() / "expected_features.txt"
with open(notebooks_features_path, "w") as f:
    f.write("\n".join(features_list))

# Save to UI directory (for the app)
ui_features_path = Path().resolve().parent / "ui" / "expected_features.txt"
with open(ui_features_path, "w") as f:
    f.write("\n".join(features_list))

print(f"✅ Expected features saved to {notebooks_features_path}")
print(f"✅ Expected features saved to {ui_features_path}")

✅ Expected features saved to G:\Courses\Sprints_ML\Project\notebooks\expected_features.txt
✅ Expected features saved to G:\Courses\Sprints_ML\Project\ui\expected_features.txt
