In [1]:
########## RF ########

import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the feature table; the last column is used as the class label
df = pd.read_csv('Merge_all_features_1639_with_class.csv')

# Separate the predictors from the label column
label_column = df.columns[-1]
X = df.drop(columns=label_column)
y = df[label_column]

# Hold out 20% of the rows as the final test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with a fixed seed; fit() rebuilds the forest from scratch on
# every call, so the same object can be reused across folds
clf_rf = RandomForestClassifier(random_state=42)

# Shuffled 10-fold splitter applied to the training portion only
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = []

# Fit on nine folds, score on the remaining one, ten times over
for train_index, val_index in kf.split(X_train):
    fold_X_train = X_train.iloc[train_index]
    fold_y_train = y_train.iloc[train_index]
    fold_X_val = X_train.iloc[val_index]
    fold_y_val = y_train.iloc[val_index]

    clf_rf.fit(fold_X_train, fold_y_train)
    fold_accuracy = accuracy_score(fold_y_val, clf_rf.predict(fold_X_val))
    cv_scores.append(fold_accuracy)

    # The number of scores collected so far is the 1-based fold number
    print(f"Fold {len(cv_scores)} Validation Accuracy: {fold_accuracy*100:.2f}%")

# Fold counter ends one past the last fold
fold = len(cv_scores) + 1

# Arithmetic mean of the per-fold accuracies
mean_cv_accuracy = sum(cv_scores) / len(cv_scores)
print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")

# Refit on the complete training portion, then score the held-out test set
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}%")

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:")
print(conf_matrix)


Fold 1 Validation Accuracy: 97.50%
Fold 2 Validation Accuracy: 97.19%
Fold 3 Validation Accuracy: 97.58%
Fold 4 Validation Accuracy: 97.50%
Fold 5 Validation Accuracy: 97.50%
Fold 6 Validation Accuracy: 97.34%
Fold 7 Validation Accuracy: 97.47%
Fold 8 Validation Accuracy: 97.63%
Fold 9 Validation Accuracy: 97.55%
Fold 10 Validation Accuracy: 97.97%

Average Cross-Validation Accuracy: 97.52%

Test Set Accuracy: 97.40%

Confusion Matrix:
[[4632  175]
 [  75 4720]]


In [None]:
import sys
import time
from contextlib import redirect_stdout

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    log_loss,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split, KFold

# Write the whole report to a file.
# redirect_stdout restores the previous sys.stdout when the block exits, even
# if an exception is raised part-way through. Assigning sys.__stdout__ by hand
# would instead point stdout at the process-level stream, which inside a
# Jupyter kernel is not the stream the notebook displays.
with open('output_results.txt', 'w') as f, redirect_stdout(f):
    # Load the dataset
    df = pd.read_csv('Merge_all_features_1639_with_class.csv')

    # Split into features and target variable (the last column is the target)
    X = df.drop(columns=df.columns[-1])
    y = df[df.columns[-1]]

    # Split data into training and testing sets (80:20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize RandomForestClassifier
    clf_rf = RandomForestClassifier(random_state=42)

    # Set up KFold cross-validation with 10 folds
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    cv_scores = []
    cv_losses = []  # Log-loss of each validation fold

    # Perform 10-fold cross-validation on the training portion only
    for fold, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
        X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Time only the fit call
        start_time = time.time()
        clf_rf.fit(X_fold_train, y_fold_train)
        training_time = time.time() - start_time

        y_fold_pred = clf_rf.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)

        # Validation loss from the predicted class probabilities. Passing
        # labels keeps log_loss well-defined even if a validation fold happens
        # to contain a single class.
        y_fold_pred_proba = clf_rf.predict_proba(X_fold_val)
        fold_loss = log_loss(y_fold_val, y_fold_pred_proba, labels=clf_rf.classes_)

        cv_scores.append(fold_accuracy)
        cv_losses.append(fold_loss)

        print(f"Fold {fold} Validation Accuracy: {fold_accuracy*100:.2f}%, Validation Loss: {fold_loss:.4f}, Training Time: {training_time:.2f} seconds")

    # Calculate the mean cross-validation accuracy and loss
    mean_cv_accuracy = sum(cv_scores) / len(cv_scores)
    mean_cv_loss = sum(cv_losses) / len(cv_losses)
    print(f"\nAverage Cross-Validation Accuracy: {mean_cv_accuracy*100:.2f}%")
    print(f"Average Cross-Validation Loss: {mean_cv_loss:.4f}")

    # Train the final model on the entire training set, timing the fit
    start_time = time.time()
    clf_rf.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Predict on the test set
    y_pred_rf = clf_rf.predict(X_test)
    y_pred_proba_rf = clf_rf.predict_proba(X_test)[:, 1]  # Probability of the second class in clf_rf.classes_

    # Calculate the test set accuracy
    test_accuracy = accuracy_score(y_test, y_pred_rf)
    print(f"\nTest Set Accuracy: {test_accuracy*100:.2f}% - Final Training Time: {training_time:.2f} seconds")

    # Generate and print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred_rf)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Calculate additional metrics (binary classification is assumed by the
    # default averaging of these scorers and by the 4-way unpacking below)
    precision = precision_score(y_test, y_pred_rf)
    recall = recall_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf)
    mcc = matthews_corrcoef(y_test, y_pred_rf)
    # The ROC curve's per-threshold false-positive rates get their own name so
    # they are not overwritten by the scalar FPR computed below.
    fpr_curve, tpr, _ = roc_curve(y_test, y_pred_proba_rf)
    auc_score = roc_auc_score(y_test, y_pred_proba_rf)

    # Rates derived from the confusion matrix, guarded against empty classes
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    print(f"\nAccuracy: {test_accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
    print(f"AUC Score: {auc_score:.4f}")

# stdout is back to normal here, so this message is shown to the user
print("Results have been saved to output_results.txt")


In [None]:
########## SVM ###########

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Split the data into training and testing sets (80:20) BEFORE any
# preprocessing, so no statistic of the test rows can influence training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling is important for SVM. Putting the scaler in a pipeline means it is
# re-fitted on exactly the rows passed to fit(): the training folds during
# cross-validation and the full training set for the final model. Validation
# and test rows are only ever transformed.
clf_svm = make_pipeline(StandardScaler(), SVC())

# 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

val_accuracies = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit scaler + classifier on the training folds
    clf_svm.fit(X_train_fold, y_train_fold)

    # Predict on the validation fold
    y_val_pred = clf_svm.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred)

    # Store the validation accuracy
    val_accuracies.append(val_accuracy)

    # Print fold results
    print(f"Fold {len(val_accuracies)}:")
    print(f"Validation Accuracy: {val_accuracy*100:.2f}%")
    print()

# Average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)
print(f"Average Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train scaler + classifier on the entire training set
clf_svm.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = clf_svm.predict(X_test)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred_svm)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_svm)

# Print test results
print(f"Test Set Accuracy: {test_accuracy*100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
########## LR ###########

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Split the data into training and testing sets (80:20) BEFORE any
# preprocessing, so no statistic of the test rows can influence training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling can help Logistic Regression converge more efficiently. The scaler
# lives in a pipeline so it is fitted only on the rows passed to fit() and
# merely applied to validation / test rows.
clf_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear', max_iter=1000),
)

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# List of per-fold validation accuracies
val_accuracies = []

# Perform cross-validation on the training portion
for train_index, val_index in kf.split(X_train):
    X_train_cv, X_val_cv = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit scaler + classifier on the training folds
    clf_lr.fit(X_train_cv, y_train_cv)

    # Predict on validation set
    y_val_pred = clf_lr.predict(X_val_cv)
    val_accuracy = accuracy_score(y_val_cv, y_val_pred)

    # Store validation accuracy
    val_accuracies.append(val_accuracy)

    # Print validation accuracy for this fold
    print(f"Fold Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)
print(f"Average Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train the final model on the entire training set
clf_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = clf_lr.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

# Compute and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
########## KNN ###########

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Split the data into training and testing sets (80:20) BEFORE any
# preprocessing, so no statistic of the test rows can influence training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling is crucial for KNN as it's a distance-based algorithm. The scaler
# lives in a pipeline so it is fitted only on the rows passed to fit() and
# merely applied to validation / test rows.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# List of per-fold validation accuracies
val_accuracies = []

# Perform cross-validation on the training portion
for train_index, val_index in kf.split(X_train):
    X_train_cv, X_val_cv = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit scaler + classifier on the training folds
    knn.fit(X_train_cv, y_train_cv)

    # Predict on validation set
    y_val_pred = knn.predict(X_val_cv)
    val_accuracy = accuracy_score(y_val_cv, y_val_pred)

    # Store validation accuracy
    val_accuracies.append(val_accuracy)

    # Print validation accuracy for this fold
    print(f"Fold Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)
print(f"Average Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train the final model on the entire training set
knn.fit(X_train, y_train)

# Predict on the test set
y_pred_knn = knn.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

# Compute and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
########## NB ###########

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Split the data into training and testing sets (80:20) BEFORE any
# preprocessing, so no statistic of the test rows can influence training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes on standardized features. The scaler lives in a pipeline so it
# is fitted only on the rows passed to fit() and merely applied to
# validation / test rows.
nb = make_pipeline(StandardScaler(), GaussianNB())

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# List of per-fold validation accuracies
val_accuracies = []

# Perform cross-validation on the training portion
for train_index, val_index in kf.split(X_train):
    X_train_cv, X_val_cv = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit scaler + classifier on the training folds
    nb.fit(X_train_cv, y_train_cv)

    # Predict on validation set
    y_val_pred = nb.predict(X_val_cv)
    val_accuracy = accuracy_score(y_val_cv, y_val_pred)

    # Store validation accuracy
    val_accuracies.append(val_accuracy)

    # Print validation accuracy for this fold
    print(f"Fold Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)
print(f"Average Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train the final model on the entire training set
nb.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

# Compute and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
########## DT ###########

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Standardize the features. Decision trees split on per-feature thresholds, so
# scaling is not actually required for them; it is kept here so the model
# receives the same array input as before.
# NOTE: fit_transform returns a NumPy array, so X (and X_train / X_test below)
# are ndarrays and must be indexed positionally, not with .iloc.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree classifier (seeded so reruns give the same tree)
dt = DecisionTreeClassifier(random_state=42)

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# List of per-fold validation accuracies
val_accuracies = []

# Perform cross-validation on the training portion
for train_index, val_index in kf.split(X_train):
    # X_train is an ndarray (plain indexing); y_train is still a Series (.iloc)
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the classifier
    dt.fit(X_train_cv, y_train_cv)

    # Predict on validation set
    y_val_pred = dt.predict(X_val_cv)
    val_accuracy = accuracy_score(y_val_cv, y_val_pred)

    # Store validation accuracy
    val_accuracies.append(val_accuracy)

    # Print validation accuracy for this fold
    print(f"Fold Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)
print(f"Average Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train the final model on the entire training set
dt.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

# Compute and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_dt)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
########## GBDT ###########

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler  # needed below; the cell must not rely on an earlier cell's import

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Standardize the features. Tree ensembles split on per-feature thresholds, so
# scaling is not actually required for them; it is kept here so the model
# receives the same array input as before.
# NOTE: fit_transform returns a NumPy array, so X (and X_train / X_test below)
# are ndarrays and must be indexed positionally, not with .iloc.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting classifier (seeded so reruns are repeatable)
gbdt = GradientBoostingClassifier(random_state=42)

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# List of per-fold validation accuracies
val_accuracies = []

# Perform cross-validation on the training portion
for train_index, val_index in kf.split(X_train):
    # X_train is an ndarray (plain indexing); y_train is still a Series (.iloc)
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the classifier
    gbdt.fit(X_train_cv, y_train_cv)

    # Predict on validation set
    y_val_pred = gbdt.predict(X_val_cv)

    # Calculate validation accuracy
    val_accuracy = accuracy_score(y_val_cv, y_val_pred)
    val_accuracies.append(val_accuracy)

    # Print accuracy for this fold
    print(f"Fold Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)
print(f"Average Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train the final model on the entire training set
gbdt.fit(X_train, y_train)

# Predict on the test set
y_pred_gbdt = gbdt.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_gbdt)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

# Compute and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_gbdt)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
########## LGBM ###########

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Standardize the features. Tree ensembles split on per-feature thresholds, so
# scaling is not actually required for them; it is kept here so the model
# receives the same array input as before.
# NOTE: fit_transform returns a NumPy array, so X (and X_train / X_test below)
# are ndarrays and must be indexed positionally, not with .iloc.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LightGBM classifier
lgbm = lgb.LGBMClassifier()

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# List of per-fold validation accuracies
val_accuracies = []

# Perform cross-validation on the training portion
for train_index, val_index in kf.split(X_train):
    # X_train is an ndarray (plain indexing); y_train is still a Series (.iloc)
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the classifier
    lgbm.fit(X_train_cv, y_train_cv)

    # Predict on validation set
    y_val_pred = lgbm.predict(X_val_cv)
    val_accuracy = accuracy_score(y_val_cv, y_val_pred)

    # Store validation accuracy
    val_accuracies.append(val_accuracy)

    # Print validation accuracy for this fold
    print(f"Fold Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)
print(f"Average Cross-Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train the final model on the entire training set
lgbm.fit(X_train, y_train)

# Predict on the test set
y_pred_lgbm = lgbm.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_lgbm)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

# Compute and print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_lgbm)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
########## XGBoost ###########

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Standardize the features. Tree ensembles split on per-feature thresholds, so
# scaling is not actually required for them; it is kept here so the model
# receives the same array input as before.
# NOTE: fit_transform returns a NumPy array, so X (and X_train / X_test below)
# are ndarrays and must be indexed positionally, not with .iloc.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb = XGBClassifier(use_label_encoder=False)  # Added parameter to suppress warnings

# Set up KFold cross-validation (10 folds, not stratified)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# List to store validation accuracies for each fold
validation_accuracies = []

# Perform 10-fold cross-validation on the training portion
for train_index, val_index in kf.split(X_train):
    # X_train is an ndarray (plain indexing); y_train is still a Series (.iloc)
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the model on the training fold
    xgb.fit(X_train_fold, y_train_fold)

    # Predict on the validation fold
    y_val_pred = xgb.predict(X_val_fold)

    # Calculate validation accuracy
    val_accuracy = accuracy_score(y_val_fold, y_val_pred)
    validation_accuracies.append(val_accuracy)
    print(f"Fold Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print the average cross-validation accuracy
avg_val_accuracy = np.mean(validation_accuracies)
print(f"\nAverage Cross-Validation Accuracy: {avg_val_accuracy*100:.2f}%")

# Train the model on the entire training set
xgb.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"\nTest Accuracy: {test_accuracy*100:.2f}%")

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
print(f"\nConfusion Matrix:\n{conf_matrix}")


In [None]:
########## CatBoost ###########

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Standardize the features. Tree ensembles split on per-feature thresholds, so
# scaling is not actually required for them; it is kept here so the model
# receives the same array input as before.
# NOTE: fit_transform returns a NumPy array, so X (and X_train / X_test below)
# are ndarrays and must be indexed positionally, not with .iloc.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CatBoost classifier
catboost = CatBoostClassifier(iterations=500, learning_rate=0.05, depth=7, verbose=0)

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Store validation accuracies
validation_accuracies = []

# Perform 10-fold cross-validation on the training portion
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    # Features are an ndarray (plain positional indexing); the labels are
    # still a pandas Series, so they use .iloc
    X_train_fold = X_train[train_index]
    X_val_fold = X_train[val_index]
    y_train_fold = y_train.iloc[train_index]
    y_val_fold = y_train.iloc[val_index]

    # Train the model on the current fold
    catboost.fit(X_train_fold, y_train_fold)

    # Predict on the validation set
    y_val_pred = catboost.predict(X_val_fold)

    # Calculate validation accuracy
    val_accuracy = accuracy_score(y_val_fold, y_val_pred)
    validation_accuracies.append(val_accuracy)

    print(f"Fold {fold+1} Validation Accuracy: {val_accuracy*100:.2f}%")

# Calculate and print the average cross-validation accuracy
avg_cv_accuracy = np.mean(validation_accuracies)
print(f"\nAverage Cross-Validation Accuracy: {avg_cv_accuracy*100:.2f}%")

# Train on the entire training set and evaluate on the test set
catboost.fit(X_train, y_train)
y_test_pred = catboost.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy*100:.2f}%")

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


In [None]:
########## AdaBoost ###########

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Standardize the features. The base learners are decision trees, which split
# on per-feature thresholds, so scaling is not actually required; it is kept
# here so the model receives the same array input as before.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize AdaBoost classifier with a DecisionTree base estimator.
# The base estimator is passed positionally because the keyword was renamed
# from `base_estimator` to `estimator` between scikit-learn releases; the
# first positional slot is the base learner under either name. Without this
# argument AdaBoost silently falls back to its default depth-1 stumps.
base_estimator = DecisionTreeClassifier(max_depth=7)  # Base estimator
adaboost = AdaBoostClassifier(
    base_estimator,
    n_estimators=50,
    learning_rate=0.05,
    algorithm='SAMME',  # Use the SAMME algorithm
    random_state=42
)

# Perform 10-fold cross-validation on the training set
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(adaboost, X_train, y_train, cv=kf, scoring='accuracy')

# Print the validation accuracy for each fold
for i, score in enumerate(cv_scores):
    print(f"Fold {i+1} Validation Accuracy: {score*100:.2f}%")

# Calculate the average cross-validation accuracy
average_cv_accuracy = np.mean(cv_scores)
print(f"\nAverage Cross-Validation Accuracy: {average_cv_accuracy*100:.2f}%")

# Train the classifier on the entire training set
adaboost.fit(X_train, y_train)

# Predict on the test set
y_pred_ada = adaboost.predict(X_test)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_ada)
print(f"\nTest Accuracy: {test_accuracy*100:.2f}%")

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_ada)
print("\nConfusion Matrix:")
print(conf_matrix)


In [None]:
########## ANN ##########

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Load the dataset
df = pd.read_csv('48006x45_RF_MI.csv')

# Split into features and target variable (the last column is the target)
X = df.drop(columns=df.columns[-1])
y = df[df.columns[-1]]

# Hold out the test set FIRST (80:20). Cross-validation and scaling below only
# ever see the training portion, so the test accuracy reported at the end is
# measured on rows that influenced neither the model nor the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize KFold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

cv_accuracies = []


# Define a function to create the ANN model
def create_model():
    """Build and compile a fresh, untrained binary classifier.

    The network has two hidden ReLU layers (64 and 32 units) and a single
    sigmoid output unit. The input width is taken from the module-level
    feature table ``X``.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and binary accuracy as its metric.
    """
    model = Sequential()
    model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' for multi-class classification
    model.compile(optimizer=Adam(), loss=BinaryCrossentropy(), metrics=[BinaryAccuracy()])
    return model


# Perform 10-fold cross-validation on the training portion
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
    # Fit the scaler on this fold's training rows only, then apply it to the
    # validation rows. Fold-local names are used so the outer X_train/y_train
    # are not overwritten inside the loop.
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X_train.iloc[train_index])
    X_fold_val = fold_scaler.transform(X_train.iloc[val_index])
    y_fold_train = y_train.iloc[train_index].to_numpy()
    y_fold_val = y_train.iloc[val_index].to_numpy()

    # Create a new model instance for each fold so no weights carry over
    model = create_model()

    # Train the model (the fold is scored explicitly below, so no per-epoch
    # validation pass is requested here)
    model.fit(X_fold_train, y_fold_train, epochs=20, batch_size=32, verbose=0)

    # Evaluate the model on the validation set
    val_loss, val_accuracy = model.evaluate(X_fold_val, y_fold_val, verbose=0)
    cv_accuracies.append(val_accuracy)

    print(f"Fold {fold+1} Validation Accuracy: {val_accuracy*100:.2f}%")

# Print average cross-validation accuracy
average_cv_accuracy = np.mean(cv_accuracies)
print(f"\nAverage Cross-Validation Accuracy: {average_cv_accuracy*100:.2f}%")

# Train final model on the entire training set (80% of data), scaling with
# statistics computed from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = create_model()
model.fit(X_train_scaled, y_train.to_numpy(), epochs=20, batch_size=32, verbose=1)

# Predict on the test set; the network emits an (n, 1) column of
# probabilities, flattened to 1-D class labels for the sklearn metrics
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba > 0.5).astype(int).ravel()

# Calculate the accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {test_accuracy*100:.2f}%")

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)
