In [None]:
############ GBDT ##############


import sys
from contextlib import redirect_stdout

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

# Train and evaluate a GradientBoostingClassifier, writing the whole report
# (10-fold CV on the training split, then held-out test metrics) to GBDT_CV.txt.
#
# redirect_stdout puts back whichever stream was active beforehand, even if an
# exception is raised part-way through. Assigning sys.stdout / sys.__stdout__
# by hand would leave stdout bound to a closed file on error and, in a
# notebook, would bypass the notebook's own output stream afterwards.
with open('GBDT_CV.txt', 'w', encoding='utf-8') as f, redirect_stdout(f):
    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable (the last column is the target)
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Split data into training and testing sets (80:20) BEFORE scaling, so no
    # statistics of the held-out rows leak into the preprocessing. The split
    # depends only on the row count and random_state, so the same rows are
    # selected as when splitting an already-scaled array.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Initialize GradientBoostingClassifier
    clf_gbdt = GradientBoostingClassifier(random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []  # Validation accuracy of each fold
    cv_losses = []  # Validation log-loss of each fold

    # Perform 10-fold cross-validation on the training split only
    for fold, (train_index, val_index) in enumerate(kf.split(X_train_raw), start=1):
        # Fit the scaler on this fold's training rows only; the validation
        # rows are transformed with those statistics, never fitted on.
        fold_scaler = StandardScaler()
        X_fold_train = fold_scaler.fit_transform(X_train_raw.iloc[train_index])
        X_fold_val = fold_scaler.transform(X_train_raw.iloc[val_index])
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # fit() retrains from scratch on every call (warm_start is off by
        # default), so reusing the same estimator object across folds is safe.
        clf_gbdt.fit(X_fold_train, y_fold_train)

        y_fold_pred = clf_gbdt.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)

        # Calculate the validation loss. Passing the fitted class labels keeps
        # log_loss well-defined when a validation fold happens to miss a class.
        y_fold_pred_proba = clf_gbdt.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba, labels=clf_gbdt.classes_)

        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)

        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Scale using statistics of the training split only, then train the final
    # model on the entire training set
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    clf_gbdt.fit(X_train, y_train)

    # Predict on the test set
    y_pred_gbdt = clf_gbdt.predict(X_test)
    # Probabilities for the second entry of clf_gbdt.classes_.
    # NOTE(review): the metrics below assume a binary target whose positive
    # class is labelled 1 (scikit-learn's default pos_label) - confirm.
    y_pred_proba_gbdt = clf_gbdt.predict_proba(X_test)[:, 1]

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_gbdt)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix. Explicit labels keep its shape
    # fixed even if the test split contains a single class.
    conf_matrix = confusion_matrix(y_test, y_pred_gbdt, labels=clf_gbdt.classes_)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate additional metrics
    precision = precision_score(y_test, y_pred_gbdt)
    recall = recall_score(y_test, y_pred_gbdt)
    f1 = f1_score(y_test, y_pred_gbdt)
    mcc = matthews_corrcoef(y_test, y_pred_gbdt)
    # ROC curve points (arrays). The scalar false-positive rate computed below
    # has its own name so these arrays are not overwritten.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_gbdt)
    auc_score = roc_auc_score(y_test, y_pred_proba_gbdt)

    # Unpack the 2x2 confusion matrix (rows = true class, columns = predicted)
    tn, fp, fn, tp = conf_matrix.ravel()
    # Each rate falls back to 0 when its denominator is empty
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {false_positive_rate:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

# stdout is back on its previous stream here, so this goes to the console
print("Results have been saved to GBDT_CV.txt")
