In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Absolute location of the pre-processed training set
input_json_path = r'D:\FOLDER FROM THESIS\THESIS\Processed data\Training ML\train_after_correlationandPCA.json'

# Parse the file into a DataFrame; lines=True means one JSON record per line
data = pd.read_json(
    input_json_path,
    orient='records',
    lines=True,
)

# Lift the column/row display limits for this preview only, so nothing is elided
unlimited_display = ('display.max_columns', None, 'display.max_rows', None)
with pd.option_context(*unlimited_display):
    print(data.head())


In [None]:
# Categorical columns that still have to be expanded into indicator columns
additional_categorical_columns = [
    'byg056Varmeinstallation',
    'eta006BygningensEtagebetegnelse',
    'landscape',
    'TSYM',
    'byg021BygningensAnvendelse_grouped',
]

# One-hot encode only the columns listed above; the result is a new DataFrame
data_encoded = pd.get_dummies(data, columns=additional_categorical_columns)

# Preview the encoded frame without pandas truncating columns or rows
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(data_encoded.head())


In [None]:
# Non-encoded columns to keep, taken from the importance list.
# 'Damage' is included because it is split off as the target in a later cell.
original_features = [
    'Damage',
    'areasqm_2',
    'middelvind',
    'b_div_c',
    'eta020SamletArealAfEtage',
    'height_mea_2',
    'frostdoegn',
    'dagligmaxt',
    'doegn2aars',
    'byg026Opførelsesår',
    'coast',
    'clay_depth',
    'dtm20',
    'bluespot',
    'slope20',
    'waterbodies',
    'groundwate',
    'landmovelandmove_idw25',
    'sand_depth',
]

# Individual indicator columns (produced by the one-hot encoding) to keep
encoded_categorical_features = [
    'byg056Varmeinstallation_9',
    'byg021BygningensAnvendelse_grouped_99',
    'byg056Varmeinstallation_1',
    'byg021BygningensAnvendelse_grouped_14',
    'eta006BygningensEtagebetegnelse_1',
    'landscape_1',
]

# Full column selection: plain features first, indicator columns afterwards
all_relevant_features = [*original_features, *encoded_categorical_features]

# Restrict the encoded frame to exactly these columns (a KeyError is raised if one is missing)
data_encoded_filtered = data_encoded.loc[:, all_relevant_features]

# Preview the reduced frame without pandas truncating columns or rows
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    print(data_encoded_filtered.head())


In [None]:
# Separate the target column ('Damage') from the predictor columns
y = data_encoded_filtered['Damage']
X = data_encoded_filtered.drop('Damage', axis=1)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# X_train, X_test, y_train and y_test are now available for model training and evaluation

In [None]:
# The cells below each contain the cross-validation script for one model.

In [None]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

# Specified parameters for Random Forest
n_estimators = 200
max_depth = 60
min_samples_split = 4
min_samples_leaf = 1

# Normalize the features.
# NOTE: the scaler is fitted on the whole training split before the folds are
# formed, so every validation fold contributes to the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Convert y_train to a numpy array so it can be indexed with the positional
# indices produced by the splitter
y_train_array = y_train.to_numpy()

# Initialize and configure the Random Forest model (no seed is set)
rf_model = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=None
)

# Define the k-fold cross-validation procedure (5 stratified folds, no shuffling)
cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Per-fold metrics for each class, plus overall accuracy and curve areas
metrics_class_0 = {'precision': [], 'recall': [], 'f1-score': [], 'support': []}
metrics_class_1 = {'precision': [], 'recall': [], 'f1-score': [], 'support': []}
overall_accuracy = []
pr_aucs = []
roc_aucs = []

# Iterate over each fold
for i, (train_index, test_index) in enumerate(cv.split(X_train_scaled, y_train_array)):
    # Split data into this fold's training and validation parts
    X_train_fold, X_test_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_test_fold = y_train_array[train_index], y_train_array[test_index]

    # Fit the model
    rf_model.fit(X_train_fold, y_train_fold)

    # Hard predictions and the predicted probability of class 1
    y_pred = rf_model.predict(X_test_fold)
    y_probs = rf_model.predict_proba(X_test_fold)[:, 1]

    # Compute classification report as a dict so individual values can be read
    report = classification_report(y_test_fold, y_pred, output_dict=True)

    # Store each class's metrics in that class's own dictionary
    for class_label, class_metrics in (('0', metrics_class_0), ('1', metrics_class_1)):
        for metric_name in class_metrics:
            class_metrics[metric_name].append(report[class_label][metric_name])

    # Record the fold accuracy exactly once (it used to be appended a second
    # time after plotting, doubling the list length)
    overall_accuracy.append(report['accuracy'])

    # Print results for each fold
    print(f"Fold {i+1} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

    # PR and ROC curves with their areas
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    pr_auc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    roc_auc = auc(fpr, tpr)
    pr_aucs.append(pr_auc)
    roc_aucs.append(roc_auc)

    # Plot the Precision-Recall curve (left panel)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for Fold {i+1}')
    plt.legend(loc="lower left")

    # Plot the ROC curve (right panel) with the diagonal as reference
    plt.subplot(1, 2, 2)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for Fold {i+1}')
    plt.legend(loc="lower right")
    plt.show()

# Calculate average metrics for each class
avg_metrics_class_0 = {metric: np.mean(values) for metric, values in metrics_class_0.items()}
avg_metrics_class_1 = {metric: np.mean(values) for metric, values in metrics_class_1.items()}
avg_accuracy = np.mean(overall_accuracy)

# Print average metrics
print("Average Metrics Across All Folds:")
print(f"Class 0 - Precision: {avg_metrics_class_0['precision']}, Recall: {avg_metrics_class_0['recall']}, F1-score: {avg_metrics_class_0['f1-score']}")
print(f"Class 1 - Precision: {avg_metrics_class_1['precision']}, Recall: {avg_metrics_class_1['recall']}, F1-score: {avg_metrics_class_1['f1-score']}")
print(f"Overall Accuracy: {avg_accuracy}")
# Print average PR AUC and ROC AUC
print(f"Average PR AUC Across All Folds: {np.mean(pr_aucs)}")
print(f"Average ROC AUC Across All Folds: {np.mean(roc_aucs)}")


In [None]:
#Stacked Generalization

from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier

# Base learners with specific configurations
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=60, min_samples_split=4, min_samples_leaf=1)),
    ('lgbm', lgb.LGBMClassifier(n_estimators=250, learning_rate=0.15, max_depth=9, num_leaves=100)),
    ('gnb', GaussianNB())
]

# Final estimator that combines the base learners' outputs
final_estimator = RidgeClassifier()

# Normalize the features.
# NOTE: the scaler is fitted on the whole training split before the folds are
# formed, so every validation fold contributes to the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_array = y_train.to_numpy()

# Define the outer k-fold cross-validation procedure (5 stratified folds, no shuffling)
cv = StratifiedKFold(n_splits=5, shuffle=False)

# Initialize and configure the Stacking model; cv=10 is the stacker's own
# internal cross-validation setting, separate from the outer `cv` above
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=final_estimator, cv=10)

# Lists to store per-fold metrics for each class and the overall accuracy
metrics_class_0 = {'precision': [], 'recall': [], 'f1-score': [], 'support': []}
metrics_class_1 = {'precision': [], 'recall': [], 'f1-score': [], 'support': []}
overall_accuracy = []

# Iterate over each fold
for i, (train_index, test_index) in enumerate(cv.split(X_train_scaled, y_train_array)):
    X_train_fold, X_test_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_test_fold = y_train_array[train_index], y_train_array[test_index]

    # Fit the model
    stacking_model.fit(X_train_fold, y_train_fold)

    # Make predictions
    y_pred = stacking_model.predict(X_test_fold)

    # Compute classification report as a dict so individual values can be read
    report = classification_report(y_test_fold, y_pred, output_dict=True)

    # Store each class's metrics in that class's own dictionary. Previously both
    # classes were appended to metrics_class_0, which left metrics_class_1 empty
    # and mixed the two classes in the "Class 0" averages.
    for class_label, class_metrics in (('0', metrics_class_0), ('1', metrics_class_1)):
        for metric_name in class_metrics:
            class_metrics[metric_name].append(report[class_label][metric_name])

    overall_accuracy.append(report['accuracy'])

    # Print results for each fold
    print(f"Fold {i+1} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

# Calculate average metrics for each class
avg_metrics_class_0 = {metric: np.mean(values) for metric, values in metrics_class_0.items()}
avg_metrics_class_1 = {metric: np.mean(values) for metric, values in metrics_class_1.items()}
avg_accuracy = np.mean(overall_accuracy)

# Print average metrics
print("Average Metrics Across All Folds:")
print(f"Class 0 - Precision: {avg_metrics_class_0['precision']}, Recall: {avg_metrics_class_0['recall']}, F1-score: {avg_metrics_class_0['f1-score']}")
print(f"Class 1 - Precision: {avg_metrics_class_1['precision']}, Recall: {avg_metrics_class_1['recall']}, F1-score: {avg_metrics_class_1['f1-score']}")
print(f"Overall Accuracy: {avg_accuracy}")


In [None]:
#Adaboost

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import numpy as np

# Specified parameters for AdaBoost
base_depth = 6
n_estimators = 200
learning_rate = 0.05

# Normalize the features.
# NOTE: the scaler is fitted on the whole training split before the folds are
# formed, so every validation fold contributes to the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_array = y_train.to_numpy()

# Initialize the base estimator for AdaBoost (a depth-limited decision tree)
base_estimator = DecisionTreeClassifier(max_depth=base_depth)

# Define the k-fold cross-validation procedure (5 stratified folds, no shuffling)
cv = StratifiedKFold(n_splits=5, shuffle=False)

# Initialize lists to store per-fold metrics for each class and the overall accuracy
metrics_class_0 = {'precision': [], 'recall': [], 'f1-score': []}
metrics_class_1 = {'precision': [], 'recall': [], 'f1-score': []}
overall_accuracy = []

# Iterate over each fold
for i, (train_index, test_index) in enumerate(cv.split(X_train_scaled, y_train_array)):
    X_train_fold, X_test_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_test_fold = y_train_array[train_index], y_train_array[test_index]

    # Initialize and configure a fresh AdaBoost model for this fold.
    # scikit-learn renamed the `base_estimator` keyword to `estimator`; try the
    # new name first and fall back to the old one on releases that predate it.
    try:
        ada_model = AdaBoostClassifier(estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate)
    except TypeError:
        ada_model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate)

    # Fit the model
    ada_model.fit(X_train_fold, y_train_fold)

    # Make predictions
    y_pred = ada_model.predict(X_test_fold)

    # Compute classification report; zero_division=0 reports 0 instead of
    # warning when a metric's denominator is zero
    report = classification_report(y_test_fold, y_pred, output_dict=True, zero_division=0)

    # Store each class's metrics in that class's own dictionary (NaN when the
    # class is absent from the report). Previously both classes were appended
    # to metrics_class_0, leaving metrics_class_1 empty.
    for class_label, class_metrics in (('0', metrics_class_0), ('1', metrics_class_1)):
        class_report = report.get(class_label)
        for metric_name in class_metrics:
            value = class_report[metric_name] if class_report is not None else np.nan
            class_metrics[metric_name].append(value)

    overall_accuracy.append(report['accuracy'])

    # Print results for each fold
    print(f"Fold {i+1} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

# Calculate average metrics for each class, ignoring NaN placeholders
avg_metrics_class_0 = {metric: np.nanmean(values) for metric, values in metrics_class_0.items()}
avg_metrics_class_1 = {metric: np.nanmean(values) for metric, values in metrics_class_1.items()}
avg_accuracy = np.nanmean(overall_accuracy)

# Print average metrics
print("Average Metrics Across All Folds:")
print(f"Class 0 - Precision: {avg_metrics_class_0['precision']}, Recall: {avg_metrics_class_0['recall']}, F1-score: {avg_metrics_class_0['f1-score']}")
print(f"Class 1 - Precision: {avg_metrics_class_1['precision']}, Recall: {avg_metrics_class_1['recall']}, F1-score: {avg_metrics_class_1['f1-score']}")
print(f"Overall Accuracy: {avg_accuracy}")


In [None]:
#Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import numpy as np

# Hyper-parameters used for the Gradient Boosting model
n_estimators = 900
learning_rate = 0.07333333333333333
max_depth = 8
min_samples_split = 10
min_samples_leaf = 5

# Standardise the predictors and expose the target as a plain numpy array
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_array = y_train.to_numpy()

# A single unseeded estimator instance; it is fitted again in every fold
gb_model = GradientBoostingClassifier(
    random_state=None,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    max_depth=max_depth,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
)

# Five stratified folds taken in row order (no shuffling)
cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Per-fold scores, one list per metric and class, plus the fold accuracies
metrics_class_0 = {name: [] for name in ('precision', 'recall', 'f1-score', 'support')}
metrics_class_1 = {name: [] for name in ('precision', 'recall', 'f1-score', 'support')}
overall_accuracy = []

for fold_no, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled, y_train_array), start=1):
    # Carve this fold's fitting and validation parts out of the scaled data
    X_train_fold = X_train_scaled[train_idx]
    X_test_fold = X_train_scaled[val_idx]
    y_train_fold = y_train_array[train_idx]
    y_test_fold = y_train_array[val_idx]

    # Fit on the fold's training part and predict its held-out part
    gb_model.fit(X_train_fold, y_train_fold)
    y_pred = gb_model.predict(X_test_fold)

    # Dict form of the report, so individual numbers can be looked up
    report = classification_report(y_test_fold, y_pred, output_dict=True)

    # File every metric under the class it belongs to
    for class_label, store in (('0', metrics_class_0), ('1', metrics_class_1)):
        for metric_name in ('precision', 'recall', 'f1-score', 'support'):
            store[metric_name].append(report[class_label][metric_name])

    overall_accuracy.append(report['accuracy'])

    # Human-readable summary of the fold
    print(f"Fold {fold_no} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

# Mean of every metric over the folds
avg_metrics_class_0 = {name: np.mean(scores) for name, scores in metrics_class_0.items()}
avg_metrics_class_1 = {name: np.mean(scores) for name, scores in metrics_class_1.items()}
avg_accuracy = np.mean(overall_accuracy)

# Report the averages
print("Average Metrics Across All Folds:")
print(f"Class 0 - Precision: {avg_metrics_class_0['precision']}, Recall: {avg_metrics_class_0['recall']}, F1-score: {avg_metrics_class_0['f1-score']}")
print(f"Class 1 - Precision: {avg_metrics_class_1['precision']}, Recall: {avg_metrics_class_1['recall']}, F1-score: {avg_metrics_class_1['f1-score']}")
print(f"Overall Accuracy: {avg_accuracy}")


In [None]:
#ANN

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Network and training settings for the ANN
layers = (128,)       # sizes of the hidden Dense layers, in order
batch_size = 128
epochs = 20
dropout_rate = 0.1    # dropout applied after every hidden layer

# Standardise the predictors; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # not used further in this cell

# Target as a plain numpy array so it can be indexed by fold positions
y_train_array = y_train.to_numpy()

# Five stratified folds taken in row order (no shuffling)
cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Per-fold accuracy and curve areas
overall_accuracy = []
pr_aucs = []
roc_aucs = []

for fold_no, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled, y_train_array), start=1):
    # Carve this fold's fitting and validation parts out of the scaled data
    X_train_fold = X_train_scaled[train_idx]
    X_test_fold = X_train_scaled[val_idx]
    y_train_fold = y_train_array[train_idx]
    y_test_fold = y_train_array[val_idx]

    # Build a fresh network per fold: hidden layers with dropout, then a
    # single sigmoid output unit
    model = Sequential()
    for layer_size in layers:
        model.add(Dense(layer_size, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train silently on the fold's training part
    model.fit(X_train_fold, y_train_fold, epochs=epochs, batch_size=batch_size, verbose=0)

    # Flatten the network output and threshold it at 0.5 for hard labels
    y_probs = model.predict(X_test_fold).ravel()
    y_pred = (y_probs > 0.5).astype(int)

    report = classification_report(y_test_fold, y_pred, output_dict=True)
    overall_accuracy.append(report['accuracy'])

    # Human-readable summary of the fold
    print(f"Fold {fold_no} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

    # Curve points and the areas under them
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    pr_auc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    roc_auc = auc(fpr, tpr)
    pr_aucs.append(pr_auc)
    roc_aucs.append(roc_auc)

    # Side-by-side panels: Precision-Recall on the left, ROC on the right
    fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(10, 5))

    ax_pr.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.2f})')
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.set_title(f'Precision-Recall Curve for Fold {fold_no}')
    ax_pr.legend(loc="lower left")

    ax_roc.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax_roc.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'ROC Curve for Fold {fold_no}')
    ax_roc.legend(loc="lower right")
    plt.show()

# Means over the folds
avg_accuracy = np.mean(overall_accuracy)
avg_pr_auc = np.mean(pr_aucs)
avg_roc_auc = np.mean(roc_aucs)

# Report the averages
print("Average Metrics Across All Folds:")
print(f"Overall Accuracy: {avg_accuracy}")
print(f"Average PR AUC Across All Folds: {avg_pr_auc}")
print(f"Average ROC AUC Across All Folds: {avg_roc_auc}")


In [None]:
#XGBoost

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

# XGBoost parameters
params = {
    'n_estimators': 800,
    'learning_rate': 0.2,
    'max_depth': 2,
    'subsample': 0.6,
    'colsample_bytree': 0.1,
    'gamma': 0.2,
    'min_child_weight': 6,
    'reg_alpha': 0.1,
    'reg_lambda': 0.2
}

# Normalize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # not used further in this cell

# Convert y_train to a numpy array so it can be indexed with the fold positions
y_train_array = y_train.to_numpy()

# Define the k-fold cross-validation procedure (5 stratified folds, no shuffling)
cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Initialize lists to store metrics and AUCs
overall_accuracy = []
pr_aucs = []
roc_aucs = []

# Iterate over each fold
for i, (train_index, test_index) in enumerate(cv.split(X_train_scaled, y_train_array)):
    # Split data into this fold's training and validation parts
    X_train_fold, X_test_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_test_fold = y_train_array[train_index], y_train_array[test_index]

    # Create and train the XGBoost model. The evaluation metric is passed to
    # the constructor: the `eval_metric` argument of fit() is deprecated and
    # rejected by recent XGBoost releases.
    xgb_model = xgb.XGBClassifier(**params, eval_metric='logloss')
    xgb_model.fit(X_train_fold, y_train_fold)

    # Hard predictions and the predicted probability of class 1
    y_pred = xgb_model.predict(X_test_fold)
    y_probs = xgb_model.predict_proba(X_test_fold)[:, 1]

    # Compute classification report and update metrics
    report = classification_report(y_test_fold, y_pred, output_dict=True)
    overall_accuracy.append(report['accuracy'])

    # Print results for each fold
    print(f"Fold {i+1} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

    # PR and ROC curves with their areas
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    pr_auc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    roc_auc = auc(fpr, tpr)
    pr_aucs.append(pr_auc)
    roc_aucs.append(roc_auc)

    # Plot the Precision-Recall curve (left panel)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for Fold {i+1}')
    plt.legend(loc="lower left")

    # Plot the ROC curve (right panel) with the diagonal as reference
    plt.subplot(1, 2, 2)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for Fold {i+1}')
    plt.legend(loc="lower right")
    plt.show()

# Calculate average accuracy and AUC
avg_accuracy = np.mean(overall_accuracy)
avg_pr_auc = np.mean(pr_aucs)
avg_roc_auc = np.mean(roc_aucs)

# Print average metrics
print("Average Metrics Across All Folds:")
print(f"Overall Accuracy: {avg_accuracy}")
print(f"Average PR AUC Across All Folds: {avg_pr_auc}")
print(f"Average ROC AUC Across All Folds: {avg_roc_auc}")


In [None]:
#LightGBM 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

# Hyper-parameters handed to every LightGBM model
params = {
    'n_estimators': 250,
    'learning_rate': 0.15,
    'max_depth': 9,
    'num_leaves': 100
}

# Standardise the predictors; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # not used further in this cell

# Target as a plain numpy array so it can be indexed by fold positions
y_train_array = y_train.to_numpy()

# Five stratified folds taken in row order (no shuffling)
cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Per-fold accuracy and curve areas
overall_accuracy = []
pr_aucs = []
roc_aucs = []

for fold_no, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled, y_train_array), start=1):
    # Carve this fold's fitting and validation parts out of the scaled data
    X_train_fold = X_train_scaled[train_idx]
    X_test_fold = X_train_scaled[val_idx]
    y_train_fold = y_train_array[train_idx]
    y_test_fold = y_train_array[val_idx]

    # A fresh model per fold
    lgb_model = lgb.LGBMClassifier(**params)
    lgb_model.fit(X_train_fold, y_train_fold, eval_metric='logloss')

    # Hard labels and the predicted probability of class 1
    y_pred = lgb_model.predict(X_test_fold)
    y_probs = lgb_model.predict_proba(X_test_fold)[:, 1]

    report = classification_report(y_test_fold, y_pred, output_dict=True)
    overall_accuracy.append(report['accuracy'])

    # Human-readable summary of the fold
    print(f"Fold {fold_no} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

    # Curve points and the areas under them
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    pr_auc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    roc_auc = auc(fpr, tpr)
    pr_aucs.append(pr_auc)
    roc_aucs.append(roc_auc)

    # Side-by-side panels: Precision-Recall on the left, ROC on the right
    fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(10, 5))

    ax_pr.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.2f})')
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.set_title(f'Precision-Recall Curve for Fold {fold_no}')
    ax_pr.legend(loc="lower left")

    ax_roc.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax_roc.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'ROC Curve for Fold {fold_no}')
    ax_roc.legend(loc="lower right")
    plt.show()

# Means over the folds
avg_accuracy = np.mean(overall_accuracy)
avg_pr_auc = np.mean(pr_aucs)
avg_roc_auc = np.mean(roc_aucs)

# Report the averages
print("Average Metrics Across All Folds:")
print(f"Overall Accuracy: {avg_accuracy}")
print(f"Average PR AUC Across All Folds: {avg_pr_auc}")
print(f"Average ROC AUC Across All Folds: {avg_roc_auc}")


In [None]:
#Gaussian Naive Bayes

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import numpy as np
import matplotlib.pyplot as plt

# Best Gaussian Naive Bayes parameter
var_smoothing = 5.455594781168514e-06

# Normalize the features using StandardScaler.
# The scaler is created here so the cell no longer relies on a `scaler`
# object left behind by a previously executed cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # not used further in this cell

# Convert y_train to a numpy array so it can be indexed with the fold positions
y_train_array = y_train.to_numpy()

# Define the k-fold cross-validation procedure (5 stratified folds, no shuffling)
cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Initialize lists to store metrics and AUCs
overall_accuracy = []
pr_aucs = []
roc_aucs = []

# Iterate over each fold
for i, (train_index, test_index) in enumerate(cv.split(X_train_scaled, y_train_array)):
    # Split data into this fold's training and validation parts
    X_train_fold, X_test_fold = X_train_scaled[train_index], X_train_scaled[test_index]
    y_train_fold, y_test_fold = y_train_array[train_index], y_train_array[test_index]

    # Create and train the Gaussian Naive Bayes model
    gnb_model = GaussianNB(var_smoothing=var_smoothing)
    gnb_model.fit(X_train_fold, y_train_fold)

    # Hard predictions and the predicted probability of class 1
    y_pred = gnb_model.predict(X_test_fold)
    y_probs = gnb_model.predict_proba(X_test_fold)[:, 1]

    # Compute classification report and update metrics
    report = classification_report(y_test_fold, y_pred, output_dict=True)
    overall_accuracy.append(report['accuracy'])

    # Print results for each fold
    print(f"Fold {i+1} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

    # PR and ROC curves with their areas
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    pr_auc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    roc_auc = auc(fpr, tpr)
    pr_aucs.append(pr_auc)
    roc_aucs.append(roc_auc)

    # Plot the Precision-Recall curve (left panel)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for Fold {i+1}')
    plt.legend(loc="lower left")

    # Plot the ROC curve (right panel) with the diagonal as reference
    plt.subplot(1, 2, 2)
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for Fold {i+1}')
    plt.legend(loc="lower right")
    plt.show()

# Calculate average accuracy and AUC
avg_accuracy = np.mean(overall_accuracy)
avg_pr_auc = np.mean(pr_aucs)
avg_roc_auc = np.mean(roc_aucs)

# Print average metrics
print("Average Metrics Across All Folds:")
print(f"Overall Accuracy: {avg_accuracy}")
print(f"Average PR AUC Across All Folds: {avg_pr_auc}")
print(f"Average ROC AUC Across All Folds: {avg_roc_auc}")


In [None]:
#KNN

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt

# Settings for the k-nearest-neighbours classifier
n_neighbors = 7
weights = 'distance'
algorithm = 'ball_tree'

# Standardise the predictors; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # not used further in this cell

# Target as a plain numpy array so it can be indexed by fold positions
y_train_array = y_train.to_numpy()

# Five stratified folds taken in row order (no shuffling)
cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Per-fold accuracy and curve areas
overall_accuracy = []
pr_aucs = []
roc_aucs = []

for fold_no, (train_idx, val_idx) in enumerate(cv.split(X_train_scaled, y_train_array), start=1):
    # Carve this fold's fitting and validation parts out of the scaled data
    X_train_fold = X_train_scaled[train_idx]
    X_test_fold = X_train_scaled[val_idx]
    y_train_fold = y_train_array[train_idx]
    y_test_fold = y_train_array[val_idx]

    # A fresh classifier per fold
    knn_model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm=algorithm,
    )
    knn_model.fit(X_train_fold, y_train_fold)

    # Hard labels and the predicted probability of class 1
    y_pred = knn_model.predict(X_test_fold)
    y_probs = knn_model.predict_proba(X_test_fold)[:, 1]

    report = classification_report(y_test_fold, y_pred, output_dict=True)
    overall_accuracy.append(report['accuracy'])

    # Human-readable summary of the fold
    print(f"Fold {fold_no} Classification Report:")
    print(classification_report(y_test_fold, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_fold, y_pred)}\n")

    # Curve points and the areas under them
    precision, recall, _ = precision_recall_curve(y_test_fold, y_probs)
    pr_auc = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_test_fold, y_probs)
    roc_auc = auc(fpr, tpr)
    pr_aucs.append(pr_auc)
    roc_aucs.append(roc_auc)

    # Side-by-side panels: Precision-Recall on the left, ROC on the right
    fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(10, 5))

    ax_pr.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.2f})')
    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.set_title(f'Precision-Recall Curve for Fold {fold_no}')
    ax_pr.legend(loc="lower left")

    ax_roc.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax_roc.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'ROC Curve for Fold {fold_no}')
    ax_roc.legend(loc="lower right")
    plt.show()

# Means over the folds
avg_accuracy = np.mean(overall_accuracy)
avg_pr_auc = np.mean(pr_aucs)
avg_roc_auc = np.mean(roc_aucs)

# Report the averages
print("Average Metrics Across All Folds:")
print(f"Overall Accuracy: {avg_accuracy}")
print(f"Average PR AUC Across All Folds: {avg_pr_auc}")
print(f"Average ROC AUC Across All Folds: {avg_roc_auc}")
