In [None]:
# Import necessary libraries for data manipulation, visualization, and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import statistics

# Scikit-learn libraries for machine learning algorithms and model evaluation
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# XGBoost and optimization tools
import xgboost as xgb
from skopt import gbrt_minimize # Bayesian Optimization with GBM as surrogate
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import forest_minimize # Bayesian Optimization with RF as surrogate
from skopt import gp_minimize
import scipy.stats as stats

In [None]:
# Importing ROC-related libraries for calculating and plotting ROC curves
from sklearn.preprocessing import label_binarize
from itertools import cycle
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from scipy import interp

In [None]:
# Silence every warning category globally so library messages do not clutter
# the cell output.
# NOTE(review): this also hides warnings that can signal real problems
# (e.g. failed cross-validation fits) — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Reading and Preparing Data

In [None]:
# Load the training table from CSV, using the "Unnamed: 0" column as the index,
# and transpose it. Later cells expect a "disease" column in the result.
# NOTE(review): presumably rows are samples and columns are microRNAs after the
# transpose — confirm against the file.

data_all = pd.read_csv("common_mirs_exp.csv", index_col="Unnamed: 0").T
data_all

In [None]:
# Class balance of the training labels (number of rows per "disease" value)
data_all["disease"].value_counts()

In [None]:
# Read the external test set from a tab-delimited file and transpose it.
# Later cells expect a "status" column in the transposed frame.
# NOTE(review): no index_col is passed, so the column labels after the transpose
# depend on how pandas infers the index for this file — verify that "status"
# is present.

test_set = pd.read_table('GSE29532_mirs_expression.txt').T
test_set

In [None]:
# Scoring metrics passed to every cross_validate call below: accuracy and ROC AUC

scoring = ['accuracy', 'roc_auc']

In [None]:
# Separate features (X) and target (y) in the training dataset:
# X is every column except "disease", cast to float; y is the "disease" column.

X = data_all.drop(["disease"], axis=1).astype(float)
y = data_all["disease"]

In [None]:
# Sanity check: feature-matrix shape and per-class sample counts
X.shape, y.value_counts()

In [None]:
# Prepare the test set: X_test is every column except "status", cast to float;
# y_test is the "status" column. The last line displays both shapes.

X_test = test_set.drop(['status'], axis = 1).astype(float)
y_test = test_set['status']
X_test.shape, y_test.shape

# Voters

## Preset Hyperparameters

In [None]:
# Baseline Gradient Boosting (default hyperparameters): 10-fold stratified
# cross-validation on the training data, reporting mean train/test metrics.

GB = GradientBoostingClassifier(random_state=1)
cv = StratifiedKFold(n_splits=10)
scores = cross_validate(GB, X, y, scoring=scoring, cv=cv, n_jobs=-1,
                        return_train_score=True, return_estimator=True)
scores1 = pd.DataFrame(scores)

# `scores1` also holds the fitted estimators (object dtype). Average only the
# numeric columns so this works on pandas >= 2, and drop the timing columns by
# name rather than by position to keep the four accuracy / ROC-AUC means.
metric_means = scores1.mean(axis=0, numeric_only=True).drop(['fit_time', 'score_time'])
result = metric_means.to_frame('Gradiant Boosting').T
result

In [None]:
# Fit a fresh default Gradient Boosting model on the whole training data and
# display its accuracy on the training set and on the external test set.
GB = GradientBoostingClassifier(random_state=1)
GB.fit(X, y)
GB.score(X, y), GB.score(X_test, y_test)

In [None]:
# Confusion matrix of the baseline Gradient Boosting predictions on the test
# set, drawn as an annotated heatmap and saved to 'Raw_GB_10.pdf'.

class_labels = ['Healthy', "MI"]
y_pred = GB.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
df_cm = pd.DataFrame(
    conf_mat,
    index=class_labels,
    columns=['Predicted Healthy', 'Predicted MI'],
)

plt.rcParams["font.family"] = "Times New Roman"
plt.figure(figsize=(7, 5))
plt.tick_params(axis='both', which='major', labelsize=17)
sns.heatmap(df_cm, annot=True, cmap="RdBu", cbar=False, square=True, annot_kws={'size': 22});
plt.savefig('Raw_GB_10.pdf', bbox_inches="tight")

In [None]:
# Print the per-class precision / recall / F1 report for the Gradient Boosting
# test-set predictions held in `y_pred`.

print(classification_report(y_test, y_pred))

In [None]:
# Baseline SVC (default RBF kernel): 10-fold stratified cross-validation.
# The estimator is bound to `svc_base`, not `SVC`, so the imported class is not
# shadowed — later cells still call `SVC(...)` as a constructor.

svc_base = SVC(random_state=1)
cv = StratifiedKFold(n_splits=10)
scores = cross_validate(svc_base, X, y, scoring=scoring, cv=cv, n_jobs=-1,
                        return_train_score=True, return_estimator=True)
scores1 = pd.DataFrame(scores)

# Average only numeric columns (the frame also holds the fitted estimators) and
# drop the timing columns by name, leaving the four accuracy / ROC-AUC means.
metric_means = scores1.mean(axis=0, numeric_only=True).drop(['fit_time', 'score_time'])
result = metric_means.to_frame('SVC').T
result

In [None]:
# Baseline XGBoost: 10-fold stratified cross-validation.
# Only `xgboost as xgb` is imported, so the class is referenced as xgb.XGBClassifier.

XGB = xgb.XGBClassifier(random_state=1)
cv = StratifiedKFold(n_splits=10)

# Recent xgboost releases require integer class labels. LabelEncoder maps the
# sorted class names to 0..n-1 and is a no-op on labels that are already 0/1.
y_xgb = LabelEncoder().fit_transform(y)

scores = cross_validate(XGB, X, y_xgb, scoring=scoring, cv=cv, n_jobs=-1,
                        return_train_score=True, return_estimator=True)
scores1 = pd.DataFrame(scores)

# Average only numeric columns (the frame also holds the fitted estimators) and
# drop the timing columns by name, leaving the four accuracy / ROC-AUC means.
metric_means = scores1.mean(axis=0, numeric_only=True).drop(['fit_time', 'score_time'])
result = metric_means.to_frame('XGBoost').T
result

In [None]:
# Print classification report for XGBoost predictions on the test set.
# The model is fitted and used for prediction here; previously this cell
# re-printed the Gradient Boosting `y_pred` left over from an earlier cell.

xgb_label_encoder = LabelEncoder().fit(y)
xgb_baseline = xgb.XGBClassifier(random_state=1)
xgb_baseline.fit(X, xgb_label_encoder.transform(y))

# Map the integer predictions back to the original class labels so they are
# comparable with y_test.
y_pred = xgb_label_encoder.inverse_transform(xgb_baseline.predict(X_test))
print(classification_report(y_test, y_pred))

### ROC

In [None]:
# Binarize the target labels: LabelEncoder assigns integer codes in sorted
# label order, so 'Healthy' -> 0 and 'MI' -> 1.
# From here on `y` is a 1-D integer NumPy array rather than a labelled Series.
# The encoder output is used directly; a pd.get_dummies round trip would yield
# uint8 or bool depending on the pandas version.

label = LabelEncoder()
y = label.fit_transform(y)

In [None]:
# 5-fold stratified splitter shared by the ROC-analysis cells below

cv = StratifiedKFold(n_splits=5)

In [None]:
# ROC analysis using Gradient Boosting classifier
# For each stratified fold: fit on the training indices, build the held-out ROC
# curve, and interpolate its TPR onto the shared `mean_fpr` grid so the folds
# can be averaged point-wise.

classifier = GradientBoostingClassifier(random_state=1)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10, 9))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X.iloc[train, ], y[train])
    # alpha=0 / lw=0 make the per-fold curve invisible; the display object is
    # only used for its fpr / tpr / roc_auc attributes.
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X.iloc[test, ],
        y[test],
        name="_",
        alpha=0,
        lw=0,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)


# Mean ROC across folds; the last point is pinned to (1, 1) before computing
# the area under the averaged curve.
mean_tpr_gb = np.mean(tprs, axis=0)
mean_tpr_gb[-1] = 1.0
mean_auc_gb = auc(mean_fpr, mean_tpr_gb)

In [None]:
# ROC analysis using XGBoost classifier
# For each stratified fold: fit on the training indices, build the held-out ROC
# curve, and interpolate its TPR onto the shared `mean_fpr` grid so the folds
# can be averaged point-wise.

# Only `xgboost as xgb` is imported, so the class is referenced through the module.
classifier = xgb.XGBClassifier(random_state=1)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10, 9))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X.iloc[train, ], y[train])
    # alpha=0 / lw=0 make the per-fold curve invisible; the display object is
    # only used for its fpr / tpr / roc_auc attributes.
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X.iloc[test, ],
        y[test],
        name="_",
        alpha=0,
        lw=0,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)


# Mean ROC across folds; the last point is pinned to (1, 1) before computing
# the area under the averaged curve.
mean_tpr_xgb = np.mean(tprs, axis=0)
mean_tpr_xgb[-1] = 1.0
mean_auc_xgb = auc(mean_fpr, mean_tpr_xgb)

In [None]:
# ROC analysis using SVC classifier
# Same per-fold procedure as for the other models: fit, build the held-out ROC
# curve, interpolate its TPR onto the shared `mean_fpr` grid, then average.
# NOTE: `SVC` must still refer to the sklearn class here (it is called as a
# constructor); it must not have been rebound to an estimator instance.

classifier = SVC(random_state=1)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10, 9))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X.iloc[train, ], y[train])
    # alpha=0 / lw=0 make the per-fold curve invisible; the display object is
    # only used for its fpr / tpr / roc_auc attributes.
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X.iloc[test, ],
        y[test],
        name="_",
        alpha=0,
        lw=0,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)


# Mean ROC across folds; the last point is pinned to (1, 1) before computing
# the area under the averaged curve.
mean_tpr_svc = np.mean(tprs, axis=0)
mean_tpr_svc[-1] = 1.0
mean_auc_svc = auc(mean_fpr, mean_tpr_svc)

In [None]:
# Compare the cross-validated mean ROC curves of the untuned models
# (GB, XGB, SVM) in one figure and save it to 'base_models.pdf'.

plt.figure(figsize=(10,9))
plt.rcParams["font.family"] = "Times"

# One entry per model: (legend name, mean TPR curve, AUC of that curve, colour)
roc_curves = [
    ("GB", mean_tpr_gb, mean_auc_gb, "red"),
    ("XGB", mean_tpr_xgb, mean_auc_xgb, "green"),
    ("SVM", mean_tpr_svc, mean_auc_svc, "blue"),
]
for model_name, model_tpr, model_auc, curve_color in roc_curves:
    plt.plot(
        mean_fpr,
        model_tpr,
        color=curve_color,
        label="%s (AUC=%.2f)" % (model_name, round(model_auc, 2)),
        lw=3,
        alpha=0.8,
    )

# Reference diagonal representing random chance (AUC=0.5)
plt.plot([0, 1], [0, 1], 'k--', lw=2.3)

# Axes limits, labels, legend and tick styling
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize = 24)
plt.ylabel('True Positive Rate', fontsize = 24)
plt.legend(loc="lower right", prop={'size':24})
plt.tick_params(axis='both', which='major', labelsize=20)
plt.savefig("base_models.pdf", bbox_inches="tight")

## Hypertuning Voter

In [None]:
# Split the training data into training and validation sets (70/30), stratified
# on y to keep class proportions; random_state=0 makes the split reproducible.
# The hyperparameter searches below use only X_train / y_train.

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_train.shape, X_val.shape

### SVM

In [None]:
# Defining the hyperparameter space for the SVM model.
# Each dimension's `name` must match an SVC constructor parameter, because the
# objective passes the sampled values straight to set_params.

param_grid = [
    Real(0.1, 1000, prior='log-uniform', name='C'),
    Real(0.0001, 1, prior='log-uniform', name="gamma"),
    Categorical(['linear', 'rbf', 'poly'], name="kernel"),
    Integer(2, 5, name="degree"),

]

# Scikit-optimize parameter grid is a list
type(param_grid)

In [None]:
# Initialize the SVM model with class weight 'balanced' to handle class imbalance.
# The variable is called `gbm` (a name reused for every tuned model in this
# notebook), but here it holds an SVC.
# NOTE: `SVC` must still refer to the sklearn class when this cell runs.

gbm = SVC(random_state=1, class_weight="balanced")

In [None]:
# Objective for the SVM search: mean cross-validated recall of the estimator
# held in `gbm` (an SVC at this point), negated so it can be minimized.

# the decorator allows our objective function to receive the parameters as
# keyword arguments. This is a requirement for scikit-optimize.

@use_named_args(param_grid)
def objective(**params):
    """Return the negated mean 5-fold recall for one hyperparameter setting.

    `params` holds one sampled value per dimension of `param_grid`, keyed by
    dimension name. The values are applied in place to the shared global
    estimator `gbm`, which is then cross-validated on X_train / y_train.
    """
    
    # model with new parameters
    gbm.set_params(**params)

    # optimization function (hyperparam response function)
    value = np.mean(
        cross_val_score(
            gbm, 
            X_train,
            y_train,
            cv=StratifiedKFold(n_splits=5),
            n_jobs=6,
            scoring='recall'))

    # negate because we need to minimize
    return -value

In [None]:
# Sequential model-based optimization of the SVM hyperparameters,
# using GBMs as surrogate for f(x)
# NOTE(review): in scikit-optimize `n_calls` is the total evaluation budget and
# presumably includes the `n_initial_points` initial evaluations — confirm
# against the installed version.

gbm_ = gbrt_minimize(
    objective, # the objective function to minimize
    param_grid, # the hyperparameter space
    n_initial_points=10, # the number of points to evaluate f(x) to start of
    acq_func='EI', # the acquisition function
    n_calls=100, # evaluation budget for f(x)
    random_state=0, 
    n_jobs=6,
)

In [None]:
# Show the objective value at the optimum found by the search.
# The objective returns the negated mean recall, so this number is negative;
# the recall itself is -gbm_.fun.

"Best score=%.4f" % gbm_.fun

In [None]:
# Extract best parameters from the optimization result and set them to the SVM model

best_params = dict(zip([dim.name for dim in param_grid], gbm_.x))
# NOTE: `SVC` is rebound here from the sklearn class to the tuned estimator
# instance (the same object as `gbm`); later cells rely on this name, so after
# this point `SVC(...)` can no longer be used as a constructor.
SVC = gbm.set_params(**best_params)

# Cross-validate the model with the best parameters using 5-fold stratified sampling
cv = StratifiedKFold(n_splits=5)

scores = cross_validate(SVC, X, y, scoring=scoring, cv=cv, n_jobs=-1,
                        return_train_score=True, return_estimator=True)
scores1 = pd.DataFrame(scores)

# Average only numeric columns (the frame also holds the fitted estimators) and
# drop the timing columns by name, leaving the four accuracy / ROC-AUC means.
metric_means = scores1.mean(axis=0, numeric_only=True).drop(['fit_time', 'score_time'])
result = metric_means.to_frame('SVC').T
result

In [None]:
# Fit the SVM model with optimized parameters on the 70% training split and
# display its accuracy on the training and validation splits.

SVC.fit(X_train, y_train)
SVC.score(X_train, y_train), SVC.score(X_val, y_val)

In [None]:
# Generate predictions on the validation set and print the per-class report

y_pred=SVC.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
# Fit the SVC model to the entire dataset and test it on the test set

SVC.fit(X, y)

# The model is trained on 0/1 labels ('Healthy' -> 0, 'MI' -> 1), so the test
# labels are encoded the same way before scoring. LabelEncoder sorts the class
# names and is a no-op if y_test is already 0/1.
SVC.score(X, y), SVC.score(X_test, LabelEncoder().fit_transform(y_test))

In [None]:
# Generate predictions on the test set and create a confusion matrix

# The model predicts the 0/1 codes it was trained on; map them back to the
# class names ('Healthy' = 0, 'MI' = 1) so they are comparable with the string
# labels in y_test and with the `labels=` argument below.
class_labels = np.array(['Healthy', 'MI'])
y_pred = class_labels[np.asarray(SVC.predict(X_test)).astype(int)]

conf_mat = confusion_matrix(y_test, y_pred, labels=['Healthy', "MI"])
df_cm = pd.DataFrame(conf_mat, index=['Healthy', "MI"],
                     columns=['Predicted Healthy', 'Predicted MI'])
plt.rcParams["font.family"] = "Times New Roman"
plt.figure(figsize=(7,5))
plt.tick_params(axis='both', which='major', labelsize=17)
sns.heatmap(df_cm, annot=True, cmap="RdBu", cbar=False, square=True, annot_kws={'size': 22});
plt.savefig('Hyper_SVC-poly_10.pdf', bbox_inches="tight")

In [None]:
# Per-class report for the tuned SVC on the test set; uses the `y_pred`
# produced by the preceding confusion-matrix cell.
print(classification_report(y_test, y_pred))

### Gradient Boosting

In [None]:
# determine the hyperparameter space for the Gradient Boosting model

import re
import sklearn

# scikit-learn renamed the binomial loss from 'deviance' to 'log_loss' in 1.1
# and removed the old name in 1.3, so pick the spelling the installed version
# accepts. If the version string cannot be parsed, assume a modern release.
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
_sk_major_minor = tuple(int(part) for part in _sk_version.groups()) if _sk_version else (1, 1)
gb_log_loss_name = 'log_loss' if _sk_major_minor >= (1, 1) else 'deviance'

param_grid = [
    Integer(10, 120, name="n_estimators"),
    Integer(2, 5, name="max_depth"),
    Real(0.0001, 0.1, prior='log-uniform', name='learning_rate'),
    Categorical([gb_log_loss_name, 'exponential'], name="loss"),
]

# Scikit-optimize parameter grid is a list
type(param_grid)

In [None]:
# set up the gradient boosting classifier that the objective below mutates via
# set_params (this rebinds the shared name `gbm`)

gbm = GradientBoostingClassifier(random_state=0)

In [None]:
# Objective for the Gradient Boosting search: mean cross-validated recall of
# the estimator held in `gbm`, negated so it can be minimized.

# the decorator allows our objective function to receive the parameters as
# keyword arguments. This is a requirement for scikit-optimize.

@use_named_args(param_grid)
def objective(**params):
    """Return the negated mean 5-fold recall for one hyperparameter setting.

    `params` holds one sampled value per dimension of `param_grid`, keyed by
    dimension name. The values are applied in place to the shared global
    estimator `gbm`, which is then cross-validated on X_train / y_train.
    """
    
    # model with new parameters
    gbm.set_params(**params)

    # optimization function (hyperparam response function)
    # NOTE(review): n_jobs=-4 differs from the n_jobs=6 used by the other
    # objectives; with joblib a negative value presumably means
    # (n_cpus + 1 - 4) workers — confirm this is intended.
    value = np.mean(
        cross_val_score(
            gbm, 
            X_train,
            y_train,
            cv=5,
            n_jobs=-4,
            scoring='recall')
    )

    # negate because we need to minimize
    return -value

In [None]:
# gbrt_minimize performs sequential model-based optimization
# using GBMs as surrogate for f(x)
# NOTE(review): in scikit-optimize `n_calls` is the total evaluation budget and
# presumably includes the `n_initial_points` initial evaluations — confirm
# against the installed version.

gbm_ = gbrt_minimize(
    objective, # the objective function to minimize
    param_grid, # the hyperparameter space
    n_initial_points=10, # the number of points to evaluate f(x) to start of
    acq_func='EI', # the acquisition function
    n_calls=100, # evaluation budget for f(x)
    random_state=0, 
    n_jobs=6,
)

In [None]:
# function value at the minimum.
# note that it is the negative of the mean cross-validated recall
# (the objective is scored with scoring='recall', not accuracy)

"Best score=%.4f" % gbm_.fun

In [None]:
# Apply the best hyperparameters found by the search to the Gradient Boosting
# model and bind the tuned estimator to `GB`, the name every later cell uses.
best_params = dict(zip([dim.name for dim in param_grid], gbm_.x))
GB = gbm.set_params(**best_params)

# Cross-validate the tuned model with 5-fold stratified sampling
cv = StratifiedKFold(n_splits=5)
scores = cross_validate(
    GB,
    X,
    y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)
scores1 = pd.DataFrame(scores)

# Average only numeric columns (the frame also holds the fitted estimators) and
# drop the timing columns by name, leaving the four accuracy / ROC-AUC means.
metric_means = scores1.mean(axis=0, numeric_only=True).drop(["fit_time", "score_time"])
result = metric_means.to_frame("GB").T
result

In [None]:
# Fit the tuned Gradient Boosting model on the 70% training split and display
# its accuracy on the training and validation splits.
GB.fit(X_train, y_train)
GB.score(X_train, y_train), GB.score(X_val, y_val)

In [None]:
# Per-class report for the tuned Gradient Boosting model on the validation split
y_pred=GB.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
# Refit the tuned Gradient Boosting model on the entire training data and
# evaluate it on the external test set.
GB.fit(X, y)

# The model is trained on 0/1 labels ('Healthy' -> 0, 'MI' -> 1), so the test
# labels are encoded the same way before scoring. LabelEncoder sorts the class
# names and is a no-op if y_test is already 0/1.
GB.score(X, y), GB.score(X_test, LabelEncoder().fit_transform(y_test))

In [None]:
# Confusion matrix of the tuned Gradient Boosting model on the test set,
# saved to 'Hyper_GB_10.pdf'.

# The model predicts the 0/1 codes it was trained on; map them back to the
# class names ('Healthy' = 0, 'MI' = 1) so they are comparable with the string
# labels in y_test and with the `labels=` argument below.
class_labels = np.array(['Healthy', 'MI'])
y_pred = class_labels[np.asarray(GB.predict(X_test)).astype(int)]

conf_mat = confusion_matrix(y_test, y_pred, labels=['Healthy', "MI"])
df_cm = pd.DataFrame(conf_mat, index=['Healthy', "MI"],
                     columns=['Predicted Healthy', 'Predicted MI'])

plt.rcParams["font.family"] = "Times New Roman"
plt.figure(figsize=(7,5))
plt.tick_params(axis='both', which='major', labelsize=17)
sns.heatmap(df_cm, annot=True, cmap="RdBu", cbar=False, square=True, annot_kws={'size': 22});
plt.savefig('Hyper_GB_10.pdf', bbox_inches="tight")

In [None]:
# Per-class report for the tuned Gradient Boosting model on the test set; uses
# the `y_pred` produced by the preceding confusion-matrix cell.
print(classification_report(y_test, y_pred))

### XGBoost

In [None]:
# determine the hyperparameter space for the XGBoost model.
# Each dimension's `name` is passed as a keyword to XGBClassifier.set_params
# by the objective below.

param_grid = [
    Integer(10, 200, name='n_estimators'),
    Integer(2, 10, name='max_depth'),
    Real(0.01, 0.99, name='learning_rate'),
    Categorical(['gbtree', 'dart'], name='booster')
]

# Scikit-optimize parameter grid is a list
type(param_grid)

In [None]:
# set up the XGBoost classifier that the objective below mutates via
# set_params (this rebinds the shared name `gbm`)

gbm = xgb.XGBClassifier(random_state=1)

In [None]:
# Objective for the XGBoost search: mean cross-validated recall of the
# estimator held in `gbm`, negated so it can be minimized.

# the decorator allows our objective function to receive the parameters as
# keyword arguments. This is a requirement of Scikit-Optimize.
@use_named_args(param_grid)
def objective(**params):
    """Return the negated mean 5-fold recall for one hyperparameter setting.

    `params` holds one sampled value per dimension of `param_grid`, keyed by
    dimension name. The values are applied in place to the shared global
    estimator `gbm`, which is then cross-validated on X_train / y_train.
    """
    
    # model with new parameters
    gbm.set_params(**params)

    # optimization function (hyperparam response function)
    value = np.mean(
        cross_val_score(
            gbm, 
            X_train,
            y_train,
            cv=5,
            n_jobs=6,
            scoring='recall')
    )

    # negate because we need to minimize
    return -value

In [None]:
# Bayesian optimization with a Gaussian-process surrogate
# (gp_minimize; presumably with its default Matérn kernel — confirm).
# The result is stored in `gp_`, not `gbm_`.
# NOTE(review): in scikit-optimize `n_calls` is the total evaluation budget and
# presumably includes the `n_initial_points` initial evaluations — confirm
# against the installed version.

gp_ = gp_minimize(
    objective, # the objective function to minimize
    param_grid, # the hyperparameter space
    n_initial_points=10, # the number of points to evaluate f(x) to start of
    acq_func='EI', # the acquisition function
    n_calls=200, # evaluation budget for f(x)
    random_state=0, 
)

In [None]:
# function value at the minimum.
# note that it is the negative of the mean cross-validated recall
# (the objective is scored with scoring='recall', not accuracy)

"Best score=%.4f" % gp_.fun

In [None]:
# Apply the best hyperparameters found by the XGBoost search.
# The XGBoost search result is `gp_` (from gp_minimize); `gbm_` holds the
# Gradient Boosting search, whose 4th value is a loss name, not a booster.
best_params = dict(zip([dim.name for dim in param_grid], gp_.x))
XGB = gbm.set_params(**best_params)

# Cross-validate the tuned model with 5-fold stratified sampling
cv = StratifiedKFold(n_splits=5)
scores = cross_validate(XGB, X, y, scoring=scoring, cv=cv, n_jobs=-1,
                        return_train_score=True, return_estimator=True)
scores1 = pd.DataFrame(scores)

# Average only numeric columns (the frame also holds the fitted estimators) and
# drop the timing columns by name, leaving the four accuracy / ROC-AUC means.
metric_means = scores1.mean(axis=0, numeric_only=True).drop(['fit_time', 'score_time'])
result = metric_means.to_frame('XGB').T
result

In [None]:
# Fit the tuned XGBoost model on the 70% training split and display its
# accuracy on the training and validation splits.
XGB.fit(X_train, y_train)
XGB.score(X_train, y_train), XGB.score(X_val, y_val)

In [None]:
# Per-class report for the tuned XGBoost model on the validation split
y_pred=XGB.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
# Refit the tuned XGBoost model on the entire training data and evaluate it on
# the external test set.
XGB.fit(X, y)

# The model is trained on 0/1 labels ('Healthy' -> 0, 'MI' -> 1), so the test
# labels are encoded the same way before scoring. LabelEncoder sorts the class
# names and is a no-op if y_test is already 0/1.
XGB.score(X, y), XGB.score(X_test, LabelEncoder().fit_transform(y_test))

In [None]:
# Confusion matrix of the tuned XGBoost model on the test set, saved to
# 'Hyper_XGB_10.pdf'.

# The model predicts the 0/1 codes it was trained on; map them back to the
# class names ('Healthy' = 0, 'MI' = 1) so they are comparable with the string
# labels in y_test and with the `labels=` argument below.
class_labels = np.array(['Healthy', 'MI'])
y_pred = class_labels[np.asarray(XGB.predict(X_test)).astype(int)]

conf_mat = confusion_matrix(y_test, y_pred, labels=['Healthy', "MI"])
df_cm = pd.DataFrame(conf_mat, index=['Healthy', "MI"],
                     columns=['Predicted Healthy', 'Predicted MI'])
plt.rcParams["font.family"] = "Times New Roman"
plt.figure(figsize=(7,5))
plt.tick_params(axis='both', which='major', labelsize=17)
sns.heatmap(df_cm, annot=True, cmap="RdBu", cbar=False, square=True, annot_kws={'size': 22});
plt.savefig('Hyper_XGB_10.pdf', bbox_inches="tight")

In [None]:
# Per-class report for the tuned XGBoost model on the test set; uses the
# `y_pred` produced by the preceding confusion-matrix cell.
print(classification_report(y_test, y_pred))

# Voting with Hypertuned Models

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Create a list of (name, estimator) pairs for the VotingClassifier.
# The list must be initialised on its own line before the tuned models are
# appended to it.
estimator = []
estimator.append(('GB', GB))
estimator.append(('XGP', XGB))
estimator.append(('SVC', SVC))

In [None]:
# Initialize VotingClassifier with hard (majority-label) voting, fit it on the
# 70% training split, and display accuracy on the training and validation splits.

hard_voting = VotingClassifier(estimators = estimator, voting ='hard') 
hard_voting.fit(X_train, y_train)
hard_voting.score(X_train, y_train), hard_voting.score(X_val, y_val)

In [None]:
# Predict on the validation split with the voting ensemble and print the
# per-class report

y_pred=hard_voting.predict(X_val)
print(classification_report(y_val, y_pred))

In [None]:
# Re-train the VotingClassifier on the entire training set and evaluate on test set

hard_voting = VotingClassifier(estimators = estimator, voting ='hard')
hard_voting.fit(X, y)

# The ensemble is trained on 0/1 labels ('Healthy' -> 0, 'MI' -> 1), so the
# test labels are encoded the same way before scoring. LabelEncoder sorts the
# class names and is a no-op if y_test is already 0/1.
hard_voting.score(X, y), hard_voting.score(X_test, LabelEncoder().fit_transform(y_test))

In [None]:
# Predict and generate confusion matrix for the test set

# The ensemble predicts the 0/1 codes it was trained on; map them back to the
# class names ('Healthy' = 0, 'MI' = 1) so they are comparable with the string
# labels in y_test and with the `labels=` argument below.
class_labels = np.array(['Healthy', 'MI'])
y_pred = class_labels[np.asarray(hard_voting.predict(X_test)).astype(int)]

conf_mat = confusion_matrix(y_test, y_pred, labels=['Healthy', "MI"])
df_cm = pd.DataFrame(conf_mat, index=['Healthy', "MI"],
                     columns=['Predicted Healthy', 'Predicted MI'])
plt.rcParams["font.family"] = "Times New Roman"
plt.figure(figsize=(7,5))
plt.tick_params(axis='both', which='major', labelsize=17)
sns.heatmap(df_cm, annot=True, cmap="RdBu", cbar=False, square=True, annot_kws={'size': 22});
plt.savefig('Hyper_vote_10.pdf', bbox_inches="tight")

In [None]:
# Per-class report for the voting ensemble on the test set; uses the `y_pred`
# produced by the preceding confusion-matrix cell.
print(classification_report(y_test, y_pred))

## ROC for Hypertuned Models

In [None]:
# Encode target variable and prepare it for ROC curve plotting.
# LabelEncoder assigns integer codes in sorted label order and leaves labels
# that are already 0/1 unchanged, so re-running this cell is safe. The encoder
# output is used directly; a pd.get_dummies round trip would yield uint8 or
# bool depending on the pandas version.

label = LabelEncoder()
y = label.fit_transform(y)

In [None]:
# 5-fold stratified splitter shared by the tuned-model ROC cells below
cv = StratifiedKFold(n_splits=5)

In [None]:
# ROC Curve for Gradient Boosting (tuned model)
# For each stratified fold: refit the tuned estimator on the training indices,
# build the held-out ROC curve, and interpolate its TPR onto the shared
# `mean_fpr` grid so the folds can be averaged point-wise.
# `classifier` is the same object as `GB`, so `GB` is left fitted on the last
# fold's training data.

classifier = GB
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10, 9))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X.iloc[train, ], y[train])
    # alpha=0 / lw=0 make the per-fold curve invisible; the display object is
    # only used for its fpr / tpr / roc_auc attributes.
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X.iloc[test, ],
        y[test],
        name="_",
        alpha=0,
        lw=0,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)


# Mean ROC across folds; the last point is pinned to (1, 1) before computing
# the area under the averaged curve.
mean_tpr_gb = np.mean(tprs, axis=0)
mean_tpr_gb[-1] = 1.0
mean_auc_gb = auc(mean_fpr, mean_tpr_gb)

In [None]:
# ROC Curve for XGBoost (tuned model)
# For each stratified fold: refit the tuned estimator on the training indices,
# build the held-out ROC curve, and interpolate its TPR onto the shared
# `mean_fpr` grid so the folds can be averaged point-wise.
# `classifier` is the same object as `XGB`, so `XGB` is left fitted on the last
# fold's training data.

classifier = XGB
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10, 9))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X.iloc[train, ], y[train])
    # alpha=0 / lw=0 make the per-fold curve invisible; the display object is
    # only used for its fpr / tpr / roc_auc attributes.
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X.iloc[test, ],
        y[test],
        name="_",
        alpha=0,
        lw=0,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)


# Mean ROC across folds; the last point is pinned to (1, 1) before computing
# the area under the averaged curve.
mean_tpr_xgb = np.mean(tprs, axis=0)
mean_tpr_xgb[-1] = 1.0
mean_auc_xgb = auc(mean_fpr, mean_tpr_xgb)

In [None]:
# ROC Curve for Support Vector Classification (tuned model)
# For each stratified fold: refit the tuned estimator on the training indices,
# build the held-out ROC curve, and interpolate its TPR onto the shared
# `mean_fpr` grid so the folds can be averaged point-wise.
# `classifier` is the same object as `SVC` (the tuned instance), so it is left
# fitted on the last fold's training data.

classifier = SVC
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10, 9))
for fold, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X.iloc[train, ], y[train])
    # No separate predict() call is needed: the ROC display derives its scores
    # from the fitted estimator, and an extra predict would only overwrite the
    # notebook-level `y_pred`.
    # alpha=0 / lw=0 make the per-fold curve invisible; the display object is
    # only used for its fpr / tpr / roc_auc attributes.
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X.iloc[test, ],
        y[test],
        name="_",
        alpha=0,
        lw=0,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)


# Mean ROC across folds; the last point is pinned to (1, 1) before computing
# the area under the averaged curve.
mean_tpr_svc = np.mean(tprs, axis=0)
mean_tpr_svc[-1] = 1.0
mean_auc_svc = auc(mean_fpr, mean_tpr_svc)

In [None]:
# "ROC curve" reported for the Hard Voting Ensemble.
# NOTE(review): this is the point-wise average of the three individual models'
# mean TPR curves, not a ROC curve measured from the VotingClassifier's own
# predictions — confirm this approximation is what should be reported as HVE.

mean_tpr_vote = np.mean([mean_tpr_gb, mean_tpr_svc, mean_tpr_xgb], axis=0)
mean_tpr_vote[-1] = 1.0
mean_auc_vote = auc(mean_fpr, mean_tpr_vote)

In [None]:
# Plot the cross-validated mean ROC curves of the tuned models and of the
# ensemble curve in one figure, saved to 'tuned_models_val2.pdf'.

plt.figure(figsize=(10,9))
plt.rcParams["font.family"] = "Times"

# One entry per curve: (legend name, mean TPR curve, AUC of that curve, colour)
roc_curves = [
    ("GB", mean_tpr_gb, mean_auc_gb, "red"),
    ("XGB", mean_tpr_xgb, mean_auc_xgb, "green"),
    ("SVM", mean_tpr_svc, mean_auc_svc, "blue"),
    ("HVE", mean_tpr_vote, mean_auc_vote, "gold"),
]
for model_name, model_tpr, model_auc, curve_color in roc_curves:
    plt.plot(
        mean_fpr,
        model_tpr,
        color=curve_color,
        label="%s (AUC=%.2f)" % (model_name, round(model_auc, 2)),
        lw=3,
        alpha=0.8,
    )

# Chance diagonal, axes limits, labels, legend and tick styling
plt.plot([0, 1], [0, 1], 'k--', lw=2.3)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize = 24)
plt.ylabel('True Positive Rate', fontsize = 24)
plt.legend(loc="lower right", prop={'size':24})
plt.tick_params(axis='both', which='major', labelsize=20)
plt.savefig("tuned_models_val2.pdf", bbox_inches="tight")

## ROC on REAL TEST SET

In [None]:
# Encode test target variable: LabelEncoder assigns integer codes in sorted
# label order, so 'Healthy' -> 0 and 'MI' -> 1, matching the training labels.
# From here on `y_test` is a 1-D integer NumPy array. The encoder output is
# used directly; a pd.get_dummies round trip would yield uint8 or bool
# depending on the pandas version.

label = LabelEncoder()
y_test = label.fit_transform(y_test)

In [None]:
# Common false-positive-rate grid (100 points on [0, 1]) onto which every
# test-set ROC curve below is interpolated
mean_fpr = np.linspace(0, 1, 100)

In [None]:
# ROC Curve for Gradient Boosting on test set
# Refit on the full training data, then build the ROC from the test predictions.
# NOTE(review): `predict` returns hard class labels, so this ROC has a single
# operating point between (0, 0) and (1, 1); the earlier cross-validated curves
# use RocCurveDisplay.from_estimator instead — confirm hard labels are intended.

classifier = GB
classifier.fit(X, y)
y_pred=classifier.predict(X_test)
viz=RocCurveDisplay.from_predictions(y_test, y_pred)
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
tpr_gb=interp_tpr
auc_gb=viz.roc_auc

In [None]:
# ROC Curve for Support Vector Classification on test set
# Refit on the full training data, then build the ROC from the test predictions.
# NOTE(review): `predict` returns hard class labels, so this ROC has a single
# operating point between (0, 0) and (1, 1); the earlier cross-validated curves
# use RocCurveDisplay.from_estimator instead — confirm hard labels are intended.

classifier = SVC
classifier.fit(X, y)
y_pred=classifier.predict(X_test)
viz=RocCurveDisplay.from_predictions(y_test, y_pred)
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
tpr_svc=interp_tpr
auc_svc=viz.roc_auc

In [None]:
# ROC Curve for XGBoost on test set
# Refit on the full training data, then build the ROC from the test predictions.
# NOTE(review): `predict` returns hard class labels, so this ROC has a single
# operating point between (0, 0) and (1, 1); the earlier cross-validated curves
# use RocCurveDisplay.from_estimator instead — confirm hard labels are intended.

classifier = XGB
classifier.fit(X, y)
y_pred=classifier.predict(X_test)
viz=RocCurveDisplay.from_predictions(y_test, y_pred)
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
tpr_xgb=interp_tpr
auc_xgb=viz.roc_auc

In [None]:
# ROC Curve for Voting Classifier on test set
# A fresh hard-voting ensemble is fitted on the full training data and the ROC
# is built from its hard class predictions on the test set, so the curve has a
# single operating point between (0, 0) and (1, 1).

classifier = VotingClassifier(estimators = estimator, voting ='hard') 
classifier.fit(X, y)
y_pred=classifier.predict(X_test)
viz=RocCurveDisplay.from_predictions(y_test, y_pred)
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0  # pin the curve start to (0, 0)
tpr_vote=interp_tpr
auc_vote=viz.roc_auc

In [None]:
# Plot the test-set ROC curves of the tuned models and the voting ensemble in
# one figure, saved to 'tuned_models_test2.pdf'.
plt.figure(figsize=(10,9))
plt.rcParams["font.family"] = "Times"

# One entry per curve:
# (legend name, interpolated TPR, AUC, colour, line width, opacity)
roc_curves = [
    ("GB", tpr_gb, auc_gb, "red", 3.5, 0.8),
    ("XGB", tpr_xgb, auc_xgb, "green", 3, 0.8),
    ("SVM", tpr_svc, auc_svc, "blue", 3, 0.5),
    ("HVE", tpr_vote, auc_vote, "gold", 2.5, 0.8),
]
for model_name, model_tpr, model_auc, curve_color, line_width, opacity in roc_curves:
    plt.plot(
        mean_fpr,
        model_tpr,
        color=curve_color,
        label="%s (AUC=%.2f)" % (model_name, round(model_auc, 2)),
        lw=line_width,
        alpha=opacity,
    )

# Chance diagonal, axes limits, labels, legend and tick styling
plt.plot([0, 1], [0, 1], 'k--', lw=2.3)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize = 24)
plt.ylabel('True Positive Rate', fontsize = 24)
plt.legend(loc="lower right", prop={'size':24})
plt.tick_params(axis='both', which='major', labelsize=20)
plt.savefig("tuned_models_test2.pdf", bbox_inches="tight")