In [1]:
########## RF ########

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('RF_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Initialize RandomForestClassifier
    clf_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Train the RandomForest model
        clf_rf.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_rf.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_rf.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_rf.fit(X_train, y_train)

    # Predict on the test set
    y_pred_rf = clf_rf.predict(X_test)
    y_pred_proba_rf = clf_rf.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_rf)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_rf)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_rf)
    recall = recall_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf)
    mcc = matthews_corrcoef(y_test, y_pred_rf)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_rf)
    auc_score = roc_auc_score(y_test, y_pred_proba_rf)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to RF_CV.txt")


In [None]:
########## SVM ########


import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('SVC_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize SVC with probability estimates
    clf_svc = SVC(probability=True, random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Train the SVC model
        clf_svc.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_svc.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_svc.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_svc.fit(X_train, y_train)

    # Predict on the test set
    y_pred_svc = clf_svc.predict(X_test)
    y_pred_proba_svc = clf_svc.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_svc)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_svc)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_svc)
    recall = recall_score(y_test, y_pred_svc)
    f1 = f1_score(y_test, y_pred_svc)
    mcc = matthews_corrcoef(y_test, y_pred_svc)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_svc)
    auc_score = roc_auc_score(y_test, y_pred_proba_svc)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr_rate:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to SVC_CV.txt")


In [None]:
########## LR ###########

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('LR_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize LogisticRegression with probability estimates
    clf_lr = LogisticRegression(max_iter=1000, random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_lr.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_lr.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_lr.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_lr.fit(X_train, y_train)

    # Predict on the test set
    y_pred_lr = clf_lr.predict(X_test)
    y_pred_proba_lr = clf_lr.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_lr)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_lr)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_lr)
    recall = recall_score(y_test, y_pred_lr)
    f1 = f1_score(y_test, y_pred_lr)
    mcc = matthews_corrcoef(y_test, y_pred_lr)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_lr)
    auc_score = roc_auc_score(y_test, y_pred_proba_lr)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to LR_CV.txt")


In [None]:
########## KNN ###########

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('KNN_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize KNeighborsClassifier with a specific number of neighbors
    clf_knn = KNeighborsClassifier(n_neighbors=5)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_knn.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_knn.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_knn.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred_knn = clf_knn.predict(X_test)
    y_pred_proba_knn = clf_knn.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_knn)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_knn)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_knn)
    recall = recall_score(y_test, y_pred_knn)
    f1 = f1_score(y_test, y_pred_knn)
    mcc = matthews_corrcoef(y_test, y_pred_knn)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_knn)
    auc_score = roc_auc_score(y_test, y_pred_proba_knn)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to KNN_CV.txt")


In [None]:
############### NB ################


import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('NB_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize GaussianNB
    clf_nb = GaussianNB()

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_nb.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_nb.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_nb.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_nb.fit(X_train, y_train)

    # Predict on the test set
    y_pred_nb = clf_nb.predict(X_test)
    y_pred_proba_nb = clf_nb.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_nb)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_nb)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_nb)
    recall = recall_score(y_test, y_pred_nb)
    f1 = f1_score(y_test, y_pred_nb)
    mcc = matthews_corrcoef(y_test, y_pred_nb)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_nb)
    auc_score = roc_auc_score(y_test, y_pred_proba_nb)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to NB_CV.txt")


In [None]:
################# DT #############


import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('DT_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize DecisionTreeClassifier
    clf_dt = DecisionTreeClassifier(random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_dt.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_dt.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_dt.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_dt.fit(X_train, y_train)

    # Predict on the test set
    y_pred_dt = clf_dt.predict(X_test)
    y_pred_proba_dt = clf_dt.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_dt)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_dt)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_dt)
    recall = recall_score(y_test, y_pred_dt)
    f1 = f1_score(y_test, y_pred_dt)
    mcc = matthews_corrcoef(y_test, y_pred_dt)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_dt)
    auc_score = roc_auc_score(y_test, y_pred_proba_dt)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to DT_CV.txt")


In [None]:
############ GBDT ##############


import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('GBDT_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize GradientBoostingClassifier
    clf_gbdt = GradientBoostingClassifier(random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_gbdt.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_gbdt.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_gbdt.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_gbdt.fit(X_train, y_train)

    # Predict on the test set
    y_pred_gbdt = clf_gbdt.predict(X_test)
    y_pred_proba_gbdt = clf_gbdt.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_gbdt)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_gbdt)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_gbdt)
    recall = recall_score(y_test, y_pred_gbdt)
    f1 = f1_score(y_test, y_pred_gbdt)
    mcc = matthews_corrcoef(y_test, y_pred_gbdt)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_gbdt)
    auc_score = roc_auc_score(y_test, y_pred_proba_gbdt)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to GBDT_CV.txt")


In [1]:
###################### LGBM ###############
##### pip install lightgbm (python package)

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import numpy as np
import sys

# Open a file to write the output
with open('LGBM_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize LGBMClassifier
    clf_lgbm = lgb.LGBMClassifier(random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_lgbm.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_lgbm.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_lgbm.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_lgbm.fit(X_train, y_train)

    # Predict on the test set
    y_pred_lgbm = clf_lgbm.predict(X_test)
    y_pred_proba_lgbm = clf_lgbm.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_lgbm)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_lgbm)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_lgbm)
    recall = recall_score(y_test, y_pred_lgbm)
    f1 = f1_score(y_test, y_pred_lgbm)
    mcc = matthews_corrcoef(y_test, y_pred_lgbm)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_lgbm)
    auc_score = roc_auc_score(y_test, y_pred_proba_lgbm)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to LGBM_CV.txt")


In [2]:
############## XGBoost ############
##### pip install xgboost (python package)


import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import numpy as np
import sys

# Open a file to write the output
with open('XGB_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize XGBClassifier
    clf_xgb = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_xgb.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_xgb.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_xgb.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_xgb.fit(X_train, y_train)

    # Predict on the test set
    y_pred_xgb = clf_xgb.predict(X_test)
    y_pred_proba_xgb = clf_xgb.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_xgb)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_xgb)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_xgb)
    recall = recall_score(y_test, y_pred_xgb)
    f1 = f1_score(y_test, y_pred_xgb)
    mcc = matthews_corrcoef(y_test, y_pred_xgb)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_xgb)
    auc_score = roc_auc_score(y_test, y_pred_proba_xgb)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to XGV_CV.txt")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [3]:
############## CatBoost ###########
#### pip install catboost (python package)


import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
import numpy as np
import sys

# Open a file to write the output
with open('CAT_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize CatBoostClassifier
    clf_cat = CatBoostClassifier(random_state=42, verbose=0)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_cat.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_cat.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_cat.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_cat.fit(X_train, y_train)

    # Predict on the test set
    y_pred_cat = clf_cat.predict(X_test)
    y_pred_proba_cat = clf_cat.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_cat)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_cat)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_cat)
    recall = recall_score(y_test, y_pred_cat)
    f1 = f1_score(y_test, y_pred_cat)
    mcc = matthews_corrcoef(y_test, y_pred_cat)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_cat)
    auc_score = roc_auc_score(y_test, y_pred_proba_cat)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to CAT_CV.txt")


In [None]:
########## AdaBoost ###########

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys

# Open a file to write the output
with open('ADA_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize AdaBoostClassifier
    clf_ada = AdaBoostClassifier(random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        clf_ada.fit(X_fold_train, y_fold_train)
        
        y_fold_pred = clf_ada.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        y_fold_pred_proba = clf_ada.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    clf_ada.fit(X_train, y_train)

    # Predict on the test set
    y_pred_ada = clf_ada.predict(X_test)
    y_pred_proba_ada = clf_ada.predict_proba(X_test)[:, 1]  # Get the probabilities for the positive class

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_ada)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_ada)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_ada)
    recall = recall_score(y_test, y_pred_ada)
    f1 = f1_score(y_test, y_pred_ada)
    mcc = matthews_corrcoef(y_test, y_pred_ada)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_ada)
    auc_score = roc_auc_score(y_test, y_pred_proba_ada)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to ADA_CV.txt")




In [None]:
########## ANN ##########

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# Load dataset
df = pd.read_csv('Merge_all_features_1639_with_class.csv')
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simplified ANN model with an additional Dense layer of 64 units
def create_ann_model(input_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))  # Added Dense layer with 64 units
    model.add(Dense(32, activation='relu'))  # Original 32 units layer
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Initialize the ANN model
ann_model = create_ann_model(X_train.shape[1])

# KFold cross-validation with 10 folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = []
cv_losses = []  # To store validation losses
fold = 1

for train_index, val_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Fit the model and store the history to get loss
    history = ann_model.fit(X_fold_train, y_fold_train, epochs=5, batch_size=16, verbose=0, validation_data=(X_fold_val, y_fold_val))
    
    y_fold_pred_proba = ann_model.predict(X_fold_val)
    y_fold_pred = (y_fold_pred_proba > 0.5).astype(int).flatten()
    fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
    
    # Get the validation loss from the history
    fold_loss = history.history['val_loss'][-1]  # Get the last recorded validation loss
    
    cv_scores.append(fold_accuracy)
    cv_losses.append(fold_loss)

    print(f"Fold {fold} - Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
    fold += 1

# Calculate average cross-validation accuracy and loss
mean_cv_accuracy = np.mean(cv_scores)
mean_cv_loss = np.mean(cv_losses)

print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

# Train final model on entire training set
ann_model.fit(X_train, y_train, epochs=5, batch_size=16, verbose=0)

# Predict on the test set
y_pred_proba_ann = ann_model.predict(X_test)
y_pred_ann = (y_pred_proba_ann > 0.5).astype(int).flatten()

# Test set performance metrics
test_accuracy = accuracy_score(y_test, y_pred_ann)
conf_matrix = confusion_matrix(y_test, y_pred_ann)
precision = precision_score(y_test, y_pred_ann)
recall = recall_score(y_test, y_pred_ann)
f1 = f1_score(y_test, y_pred_ann)
mcc = matthews_corrcoef(y_test, y_pred_ann)
auc_score = roc_auc_score(y_test, y_pred_proba_ann)

tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print(f"\nTest Accuracy: {test_accuracy*100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"FNR: {fnr:.4f}")
print(f"FPR: {fpr:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"MCC: {mcc:.4f}")
print(f"AUC Score: {auc_score:.4f}")

##result
# 121/121 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
# Fold 1 - Validation Accuracy: 97.84%, Validation Loss: 0.0632
# 121/121 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 2 - Validation Accuracy: 98.02%, Validation Loss: 0.0480
# 121/121 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 3 - Validation Accuracy: 98.39%, Validation Loss: 0.0428
# 121/121 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 4 - Validation Accuracy: 98.72%, Validation Loss: 0.0370
# 120/120 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 5 - Validation Accuracy: 98.59%, Validation Loss: 0.0403
# 120/120 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 6 - Validation Accuracy: 98.65%, Validation Loss: 0.0308
# 120/120 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 7 - Validation Accuracy: 98.96%, Validation Loss: 0.0368
# 120/120 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 8 - Validation Accuracy: 99.06%, Validation Loss: 0.0282
# 120/120 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 9 - Validation Accuracy: 99.01%, Validation Loss: 0.0363
# 120/120 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
# Fold 10 - Validation Accuracy: 99.40%, Validation Loss: 0.0147

# Average Cross-Validation Accuracy: 98.66%
# Average Cross-Validation Loss: 0.0378
# 301/301 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step

# Test Accuracy: 97.39%
# Precision: 0.9759
# Recall: 0.9716
# Specificity: 0.9761
# FNR: 0.0284
# FPR: 0.0239
# F1 Score: 0.9738
# MCC: 0.9477
# AUC Score: 0.9938

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import StandardScaler
import numpy as np
import sys
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# Open a file to write the output
with open('ANN_CV.txt', 'w') as f:
    # Redirect stdout to the file
    sys.stdout = f

    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Scale the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define a simple ANN model with two hidden layers
    def create_ann_model(input_dim):
        model = Sequential()
        model.add(Input(shape=(input_dim,)))  # Input layer
        model.add(Dense(64, activation='relu'))  # First hidden layer with 64 units
        model.add(Dense(32, activation='relu'))  # Second hidden layer with 32 units
        model.add(Dense(1, activation='sigmoid'))  # Output layer
        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
        return model

    # Initialize the ANN model
    ann_model = create_ann_model(X_train.shape[1])

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # List to store losses for each fold
    fold = 1

    # Perform 10-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        ann_model.fit(X_fold_train, y_fold_train, epochs=10, batch_size=32, verbose=0)
        
        y_fold_pred_proba = ann_model.predict(X_fold_val)
        y_fold_pred = (y_fold_pred_proba > 0.5).astype(int).flatten()
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
        
        # Calculate the validation loss
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba)
        
        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)
        
        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}")
        fold += 1

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = np.mean(cv_scores)
    mean_cv_loss = np.mean(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the model on the entire training set
    ann_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Predict on the test set
    y_pred_proba_ann = ann_model.predict(X_test)
    y_pred_ann = (y_pred_proba_ann > 0.5).astype(int).flatten()

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_ann)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_ann)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate and print additional metrics
    precision = precision_score(y_test, y_pred_ann)
    recall = recall_score(y_test, y_pred_ann)
    f1 = f1_score(y_test, y_pred_ann)
    mcc = matthews_corrcoef(y_test, y_pred_ann)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba_ann)
    auc_score = roc_auc_score(y_test, y_pred_proba_ann)

    # Calculate specificity
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Calculate false negative rate (FNR) and false positive rate (FPR)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

    # Reset stdout to default
    sys.stdout = sys.__stdout__

# Notify that results have been saved
print("Results have been saved to ANN_CV.txt")
