scale and smote before 10-fold cross-validation for comparison

In [89]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

In [90]:
# Import pandas libraries
import pandas as pd

# Load the dataset
data = pd.read_csv("Data/activemq_result.csv", delimiter=',')
# data = pd.read_csv("Data/avro_result.csv", delimiter=',')


# Explore the dataset
print(data.head()) 



                                         Method name  C20  C3  C4  C1  C5  C6  \
0  org.apache.activemq.transport.amqp.AmqpFramePa...    4  10   1   9   2   5   
1  org.apache.activemq.transport.amqp.AmqpHeader....    5   6   0   6   1   3   
2  org.apache.activemq.transport.amqp.AmqpHeader....    1  13   0  13   3   9   
3  org.apache.activemq.transport.amqp.AmqpInactiv...    1   5   0   5   1   3   
4  org.apache.activemq.transport.amqp.AmqpInactiv...    6   9   0   9   1   5   

   C2  C21  C18  ...  H4  H1  H2  H3       H12       H13       H14   H15  H5  \
0   0    2    4  ...   1   2   2   4  0.200000  0.200000  1.000000   4.0   1   
1   0    5    2  ...   1   1   3   4  0.333333  1.000000  0.333333   4.0   1   
2   0    9    3  ...   1  10   0  10  0.769231  0.000000  0.000000  10.0   1   
3   0    3    1  ...   1   2   0   2  0.400000  0.000000  0.000000   2.0   1   
4   0    4    3  ...   1   3   2   5  0.333333  0.222222  1.500000   5.0   1   

       bug-prone  
0  not bug-pr

In [91]:
# copy data
data_transform = data.copy()

In [92]:

# Convert 'bug-prone' column to 0 and 1
data_transform['bug-prone'] = data_transform['bug-prone'].apply(lambda x: 1 if x.strip() == 'bug-prone' else 0)

# Display the updated DataFrame
print(data_transform.head()) 


                                         Method name  C20  C3  C4  C1  C5  C6  \
0  org.apache.activemq.transport.amqp.AmqpFramePa...    4  10   1   9   2   5   
1  org.apache.activemq.transport.amqp.AmqpHeader....    5   6   0   6   1   3   
2  org.apache.activemq.transport.amqp.AmqpHeader....    1  13   0  13   3   9   
3  org.apache.activemq.transport.amqp.AmqpInactiv...    1   5   0   5   1   3   
4  org.apache.activemq.transport.amqp.AmqpInactiv...    6   9   0   9   1   5   

   C2  C21  C18  ...  H4  H1  H2  H3       H12       H13       H14   H15  H5  \
0   0    2    4  ...   1   2   2   4  0.200000  0.200000  1.000000   4.0   1   
1   0    5    2  ...   1   1   3   4  0.333333  1.000000  0.333333   4.0   1   
2   0    9    3  ...   1  10   0  10  0.769231  0.000000  0.000000  10.0   1   
3   0    3    1  ...   1   2   0   2  0.400000  0.000000  0.000000   2.0   1   
4   0    4    3  ...   1   3   2   5  0.333333  0.222222  1.500000   5.0   1   

   bug-prone  
0          0  
1 

In [93]:
print(data_transform.shape)

(4016, 42)


In [94]:

# split feature data and target data
feature_X = data_transform.drop(columns=['Method name','bug-prone'])
y = data_transform['bug-prone']



In [95]:
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler

# Apply SMOTETomek first (on original feature space)
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(feature_X, y)

# Apply StandardScaler after resampling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

X = X_scaled
y = y_resampled

In [96]:
print(X.shape)

(4218, 40)


In [97]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
# from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

models = {
    
    "RandomForest" :  RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=5, 
                                       min_samples_leaf=2, class_weight='balanced', random_state=42),
    "AdaBoostClassifier" : AdaBoostClassifier(n_estimators=100, learning_rate=0.8, random_state=42),
    "BaggingClassifier" : BaggingClassifier(n_estimators=100, max_samples=0.8, random_state=42),
    "KNeighborsClassifier" : KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan'),
    "MLPClassifier" : MLPClassifier(activation='relu', hidden_layer_sizes=(200,100), max_iter=2000, 
                               learning_rate='adaptive', random_state=42),
    # "GradientBoostingClassifier" : GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42),
    "HistGradientBoostingClassifier" : HistGradientBoostingClassifier(random_state=42),   
    "DecisionTreeClassifier" : DecisionTreeClassifier(random_state=42),  # max_depth=10, min_samples_split=10, min_samples_leaf=5, 
    "SVC" : SVC(random_state=42, probability=True, C=10, kernel='poly', gamma='scale'),
    # "GaussianNB" : GaussianNB(var_smoothing=1e-9),
    # "LogisticRegression" : LogisticRegression(class_weight='balanced', random_state=42),
    "XGBoost": XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42),
    # "LightGBM": LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42),
    "CatBoost": CatBoostClassifier(iterations=200, learning_rate=0.05, depth=4, verbose=0, random_state=42)
}




In [98]:
# Initialize cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics
accuracy_scores, precision_scores, recall_scores, f1_scores, mcc_scores, auc_scores = [], [], [], [], [], []

# Perform 10-Fold Cross-Validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    probabilities = []
    max_scores = []
    
    # Train all models and get probabilities
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability for class 1
        probabilities.append(y_pred_proba)
        max_scores.append(roc_auc_score(y_test, y_pred_proba))  # Use AUC as weight
    
    # Average probabilities for small group
    avg_proba = np.mean(probabilities, axis=0)

    from sklearn.metrics import (
        roc_curve, roc_auc_score, accuracy_score, precision_score,
        recall_score, f1_score, matthews_corrcoef
    )
    

    def get_optimal_threshold(y_true, y_proba):
        """
        Compute the optimal threshold using Youden's Index.
        """
        fpr, tpr, thresholds = roc_curve(y_true, y_proba)
        youden_index = tpr - fpr
        optimal_idx = np.argmax(youden_index)
        optimal_threshold = thresholds[optimal_idx]
        return optimal_threshold

    best_threshold_pr = get_optimal_threshold(y_test, avg_proba)

    # Convert probabilities to binary predictions using threshold
    predictions = (avg_proba > best_threshold_pr).astype(int)

    # Compute performance metrics
    accuracy_scores.append(accuracy_score(y_test, predictions))
    precision_scores.append(precision_score(y_test, predictions))
    recall_scores.append(recall_score(y_test, predictions))
    f1_scores.append(f1_score(y_test, predictions))
    mcc_scores.append(matthews_corrcoef(y_test, predictions))
    auc_scores.append(roc_auc_score(y_test, avg_proba))

# Print final results (average over 10 folds)
print(f"Final 10-Fold CV Results:")
print(f"Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
print(f"MCC: {np.mean(mcc_scores):.4f} ± {np.std(mcc_scores):.4f}")
print(f"AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

# Accuracy: 0.8569 ± 0.0339
# Precision: 0.8623 ± 0.0439
# Recall: 0.8527 ± 0.0641
# F1-Score: 0.8555 ± 0.0372
# MCC: 0.7172 ± 0.0670
# AUC: 0.9186 ± 0.0274

Final 10-Fold CV Results:
Accuracy: 0.8400 ± 0.0152
Precision: 0.8330 ± 0.0328
Recall: 0.8539 ± 0.0344
F1-Score: 0.8422 ± 0.0133
MCC: 0.6823 ± 0.0291
AUC: 0.9182 ± 0.0104


In [99]:
# Initialize cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics
accuracy_scores, precision_scores, recall_scores, f1_scores, mcc_scores, auc_scores = [], [], [], [], [], []

# Perform 10-Fold Cross-Validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    probabilities = []
    max_scores = []
    
    # Train all models and get probabilities
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability for class 1
        probabilities.append(y_pred_proba)
        max_scores.append(roc_auc_score(y_test, y_pred_proba))  # Use AUC as weight
    
    # Average probabilities for small group
    avg_proba = np.mean(probabilities, axis=0)

    from sklearn.metrics import (
        roc_curve, roc_auc_score, accuracy_score, precision_score,
        recall_score, f1_score, matthews_corrcoef
    )
    

    # Determine best threshold using precision-recall
    from sklearn.metrics import precision_recall_curve

    def get_best_threshold_precision_recall(y_true, y_proba):
        precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
        # f1_scores = 2 * (precision * recall) / (precision + recall)
        # f1_scores = np.nan_to_num(f1_scores)
        f1_scores = np.where((precision + recall) == 0, 0, 2 * (precision * recall) / (precision + recall))
        best_threshold = thresholds[np.argmax(f1_scores)]
        return best_threshold

    best_threshold_pr = get_best_threshold_precision_recall(y_test, avg_proba)

    # Convert probabilities to binary predictions using threshold
    predictions = (avg_proba > best_threshold_pr).astype(int)

    # Compute performance metrics
    accuracy_scores.append(accuracy_score(y_test, predictions))
    precision_scores.append(precision_score(y_test, predictions))
    recall_scores.append(recall_score(y_test, predictions))
    f1_scores.append(f1_score(y_test, predictions))
    mcc_scores.append(matthews_corrcoef(y_test, predictions))
    auc_scores.append(roc_auc_score(y_test, avg_proba))

# Print final results (average over 10 folds)
print(f"Final 10-Fold CV Results:")
print(f"Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
print(f"MCC: {np.mean(mcc_scores):.4f} ± {np.std(mcc_scores):.4f}")
print(f"AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

# Accuracy: 0.8555 ± 0.0366
# Precision: 0.8422 ± 0.0603
# Recall: 0.8847 ± 0.0624
# F1-Score: 0.8596 ± 0.0330
# MCC: 0.7183 ± 0.0677
# AUC: 0.9186 ± 0.0274

Final 10-Fold CV Results:
Accuracy: 0.8352 ± 0.0170
Precision: 0.8008 ± 0.0369
Recall: 0.8981 ± 0.0393
F1-Score: 0.8451 ± 0.0134
MCC: 0.6788 ± 0.0293
AUC: 0.9182 ± 0.0104


In [100]:
# Initialize cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics
accuracy_scores, precision_scores, recall_scores, f1_scores, mcc_scores, auc_scores = [], [], [], [], [], []

# Perform 10-Fold Cross-Validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    probabilities = []
    max_scores = []
    
    # Train all models and get probabilities
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability for class 1
        probabilities.append(y_pred_proba)
        max_scores.append(roc_auc_score(y_test, y_pred_proba))  # Use AUC as weight
    
    # Normalize AUC scores to get weights
    weights = [score / sum(max_scores) for score in max_scores]

    combined_probs = np.sum([weight * probs for weight, probs in zip(weights, probabilities)], axis=0)

    # Determine best threshold using precision-recall
    from sklearn.metrics import precision_recall_curve

    def get_best_threshold_precision_recall(y_true, y_proba):
        precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
        # f1_scores = 2 * (precision * recall) / (precision + recall)
        f1_scores = np.where((precision + recall) == 0, 0, 2 * (precision * recall) / (precision + recall))
        # f1_scores = np.nan_to_num(f1_scores)
        best_threshold = thresholds[np.argmax(f1_scores)]
        return best_threshold

    best_threshold_pr = get_best_threshold_precision_recall(y_test, combined_probs)
    # adjusted_threshold = best_threshold_pr + 0.05

    # Convert probabilities to binary predictions using threshold
    predictions = (combined_probs > best_threshold_pr).astype(int)

    # Compute performance metrics
    accuracy_scores.append(accuracy_score(y_test, predictions))
    precision_scores.append(precision_score(y_test, predictions))
    recall_scores.append(recall_score(y_test, predictions))
    f1_scores.append(f1_score(y_test, predictions))
    mcc_scores.append(matthews_corrcoef(y_test, predictions))
    auc_scores.append(roc_auc_score(y_test, combined_probs))

# Print final results (average over 10 folds)
print(f"Final 10-Fold CV Results:")
print(f"Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
print(f"MCC: {np.mean(mcc_scores):.4f} ± {np.std(mcc_scores):.4f}")
print(f"AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")


# Accuracy: 0.8540 ± 0.0396
# Precision: 0.8398 ± 0.0656
# Recall: 0.8876 ± 0.0656
# F1-Score: 0.8591 ± 0.0338
# MCC: 0.7173 ± 0.0694
# AUC: 0.9189 ± 0.0279

Final 10-Fold CV Results:
Accuracy: 0.8369 ± 0.0171
Precision: 0.8045 ± 0.0372
Recall: 0.8952 ± 0.0358
F1-Score: 0.8461 ± 0.0131
MCC: 0.6811 ± 0.0302
AUC: 0.9184 ± 0.0104


In [101]:
# Initialize cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics
accuracy_scores, precision_scores, recall_scores, f1_scores, mcc_scores, auc_scores = [], [], [], [], [], []

# Perform 10-Fold Cross-Validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    probabilities = []
    max_scores = []
    
    # Train all models and get probabilities
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability for class 1
        probabilities.append(y_pred_proba)
        max_scores.append(roc_auc_score(y_test, y_pred_proba))  # Use AUC as weight
    
    # Normalize AUC scores to get weights
    weights = [score / sum(max_scores) for score in max_scores]
    
    # Compute weighted average probabilities
    sigma_d = np.std(np.concatenate(probabilities))  # Compute standard deviation of probabilities
    combined_probs = np.zeros_like(probabilities[0])

    for weight, probs in zip(weights, probabilities):
        max_proba = np.max(probs)  # Maximum probability from this model
        if sigma_d < 0.25 and max_proba < 0.5:
            combined_probs += weight * (1 - probs)  # Invert probabilities
        else:
            combined_probs += weight * probs

    

    # Determine best threshold using precision-recall
    from sklearn.metrics import precision_recall_curve

    def get_best_threshold_precision_recall(y_true, y_proba):
        precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
        # f1_scores = 2 * (precision * recall) / (precision + recall)
        # f1_scores = np.nan_to_num(f1_scores)
        f1_scores = np.where((precision + recall) == 0, 0, 2 * (precision * recall) / (precision + recall))
        best_threshold = thresholds[np.argmax(f1_scores)]
        return best_threshold

    best_threshold_pr = get_best_threshold_precision_recall(y_test, combined_probs)

    # Convert probabilities to binary predictions using threshold
    predictions = (combined_probs > best_threshold_pr).astype(int)

    # Compute performance metrics
    accuracy_scores.append(accuracy_score(y_test, predictions))
    precision_scores.append(precision_score(y_test, predictions))
    recall_scores.append(recall_score(y_test, predictions))
    f1_scores.append(f1_score(y_test, predictions))
    mcc_scores.append(matthews_corrcoef(y_test, predictions))
    auc_scores.append(roc_auc_score(y_test, combined_probs))

# Print final results (average over 10 folds)
print(f"Final 10-Fold CV Results:")
print(f"Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
print(f"MCC: {np.mean(mcc_scores):.4f} ± {np.std(mcc_scores):.4f}")
print(f"AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")


# Accuracy: 0.8540 ± 0.0396
# Precision: 0.8398 ± 0.0656
# Recall: 0.8876 ± 0.0656
# F1-Score: 0.8591 ± 0.0338
# MCC: 0.7173 ± 0.0694
# AUC: 0.9190 ± 0.0277

Final 10-Fold CV Results:
Accuracy: 0.8369 ± 0.0171
Precision: 0.8045 ± 0.0372
Recall: 0.8952 ± 0.0358
F1-Score: 0.8461 ± 0.0131
MCC: 0.6811 ± 0.0302
AUC: 0.9184 ± 0.0104


In [102]:
# Initialize cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics
accuracy_scores, precision_scores, recall_scores, f1_scores, mcc_scores, auc_scores = [], [], [], [], [], []

# Perform 10-Fold Cross-Validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    probabilities = []
    max_scores = []
    
    # Train all models and get probabilities
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability for class 1
        probabilities.append(y_pred_proba)
        max_scores.append(roc_auc_score(y_test, y_pred_proba))  # Use AUC as weight
    
    # Normalize AUC scores to get weights
    weights = [score / sum(max_scores) for score in max_scores]

    combined_probs = np.sum([weight * probs for weight, probs in zip(weights, probabilities)], axis=0)


    thresholds = np.arange(0.4, 0.6, 0.01)
    best_f1 = 0
    best_threshold = 0.5
    for t in thresholds:
        preds = (combined_probs > t).astype(int)
        f1 = f1_score(y_test, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t
    print(f"Best threshold: {best_threshold}, Best F1: {best_f1:.4f}")

    # Convert probabilities to binary predictions using threshold
    predictions = (combined_probs > best_threshold).astype(int)

    # Compute performance metrics
    accuracy_scores.append(accuracy_score(y_test, predictions))
    precision_scores.append(precision_score(y_test, predictions))
    recall_scores.append(recall_score(y_test, predictions))
    f1_scores.append(f1_score(y_test, predictions))
    mcc_scores.append(matthews_corrcoef(y_test, predictions))
    auc_scores.append(roc_auc_score(y_test, combined_probs))

# Print final results (average over 10 folds)
print(f"Final 10-Fold CV Results:")
print(f"Accuracy: {np.mean(accuracy_scores):.4f} ± {np.std(accuracy_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f} ± {np.std(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f} ± {np.std(recall_scores):.4f}")
print(f"F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
print(f"MCC: {np.mean(mcc_scores):.4f} ± {np.std(mcc_scores):.4f}")
print(f"AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")


# Accuracy: 0.8670 ± 0.0388
# Precision: 0.8552 ± 0.0491
# Recall: 0.8873 ± 0.0595
# F1-Score: 0.8692 ± 0.0380
# MCC: 0.7379 ± 0.0782
# AUC: 0.9190 ± 0.0277

Best threshold: 0.4, Best F1: 0.8344
Best threshold: 0.5100000000000001, Best F1: 0.8389
Best threshold: 0.41000000000000003, Best F1: 0.8450
Best threshold: 0.5000000000000001, Best F1: 0.8186
Best threshold: 0.4800000000000001, Best F1: 0.8479
Best threshold: 0.4, Best F1: 0.8486
Best threshold: 0.5000000000000001, Best F1: 0.8645
Best threshold: 0.5100000000000001, Best F1: 0.8659
Best threshold: 0.5000000000000001, Best F1: 0.8585
Best threshold: 0.4700000000000001, Best F1: 0.8453
Final 10-Fold CV Results:
Accuracy: 0.8386 ± 0.0176
Precision: 0.8098 ± 0.0389
Recall: 0.8909 ± 0.0405
F1-Score: 0.8468 ± 0.0135
MCC: 0.6841 ± 0.0304
AUC: 0.9184 ± 0.0104


In [104]:
from sklearn.ensemble import VotingClassifier

# Create a Hard Voting Classifier
voting_clf_hard = VotingClassifier(
    estimators=[
        
        ("RandomForest", RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=5, 
                                       min_samples_leaf=2, class_weight='balanced', random_state=42)),
        ("AdaBoostClassifier", AdaBoostClassifier(n_estimators=100, learning_rate=0.8, random_state=42)),  
        ("BaggingClassifier" , BaggingClassifier(n_estimators=100, max_samples=0.8, random_state=42)),
        ("KNeighborsClassifier" , KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan')),
        ("MLPClassifier" , MLPClassifier(activation='relu', hidden_layer_sizes=(200,100), max_iter=2000, 
                                learning_rate='adaptive', random_state=42)),
        # ("GradientBoostingClassifier" , GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)),
        ("HistGradientBoostingClassifier" , HistGradientBoostingClassifier(random_state=42)),   
        ("DecisionTreeClassifier" , DecisionTreeClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=5, random_state=42)),
        ("SVC" , SVC(random_state=42, probability=True, C=10, kernel='poly', gamma='scale')),
        # ("GaussianNB" , GaussianNB(var_smoothing=1e-9)),
        # ("LogisticRegression" , LogisticRegression(class_weight='balanced', random_state=42)),
        ("XGBoost", XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)),
        ("CatBoost", CatBoostClassifier(iterations=200, learning_rate=0.05, depth=4, verbose=0, random_state=42))
    ],
    voting='hard'  # Specify hard voting, where the majority class prediction is chosen
)

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, f1_score, 
    matthews_corrcoef, roc_auc_score
)

# Define custom scorers for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='binary'),  # For binary classification
    'recall': make_scorer(recall_score, average='binary'),
    'f1': make_scorer(f1_score, average='binary'),
    'mcc': make_scorer(matthews_corrcoef),
    'auc': make_scorer(roc_auc_score) # , needs_proba=True
}

# Helper function to display results
def print_cv_results(results):
    for metric in scoring.keys():
        mean = results[f'test_{metric}'].mean()
        std = results[f'test_{metric}'].std()
        print(f"{metric.capitalize()}: {mean:.4f} ± {std:.4f}")

def crossvalidate_fun(classifier, X_train, y_train):
    cv_results = cross_validate(classifier, X_train, y_train, cv=10, scoring=scoring)
    print_cv_results(cv_results)

crossvalidate_fun(voting_clf_hard,X, y)

Accuracy: 0.7630 ± 0.0435
Precision: 0.7543 ± 0.0302
Recall: 0.7801 ± 0.0946
F1: 0.7645 ± 0.0516
Mcc: 0.5306 ± 0.0913
Auc: 0.7630 ± 0.0435


In [105]:
# Create a Soft Voting Classifier
voting_clf_soft = VotingClassifier(
    estimators=[
        
        ("RandomForest", RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=5, 
                                       min_samples_leaf=2, class_weight='balanced', random_state=42)),
        ("AdaBoostClassifier", AdaBoostClassifier(n_estimators=100, learning_rate=0.8, random_state=42)),  
        ("BaggingClassifier" , BaggingClassifier(n_estimators=100, max_samples=0.8, random_state=42)),
        ("KNeighborsClassifier" , KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan')),
        ("MLPClassifier" , MLPClassifier(activation='relu', hidden_layer_sizes=(200,100), max_iter=2000, 
                                learning_rate='adaptive', random_state=42)),
        # ("GradientBoostingClassifier" , GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)),
        ("HistGradientBoostingClassifier" , HistGradientBoostingClassifier(random_state=42)),   
        ("DecisionTreeClassifier" , DecisionTreeClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=5, random_state=42)),
        ("SVC" , SVC(random_state=42, probability=True, C=10, kernel='poly', gamma='scale')),
        # ("GaussianNB" , GaussianNB(var_smoothing=1e-9)),
        # ("LogisticRegression" , LogisticRegression(class_weight='balanced', random_state=42)),
        ("XGBoost", XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)),
        ("CatBoost", CatBoostClassifier(iterations=200, learning_rate=0.05, depth=4, verbose=0, random_state=42))
    ],
    voting='soft'  # Specify soft voting, where class probabilities are combined
)

crossvalidate_fun(voting_clf_soft,X, y)

Accuracy: 0.7644 ± 0.0459
Precision: 0.7369 ± 0.0356
Recall: 0.8242 ± 0.0976
F1: 0.7755 ± 0.0509
Mcc: 0.5387 ± 0.1001
Auc: 0.7644 ± 0.0459
