In [None]:
#Importing all necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import time

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.decomposition import PCA
from openpyxl import load_workbook

In [None]:
#Loading the raw Excel sheet into a dataframe
df = pd.read_excel('dataset_name.xlsx')

#Removing the columns that are not needed further on; drop() returns a new frame, df stays untouched
#NOTE(review): the PCA cell further down selects columns with these same names from a frame derived from df2 - confirm the names refer to different columns
unwanted_columns = ["Column 1", "Column 2"]
df2 = df.drop(columns=unwanted_columns)

In [None]:
#Splitting df into two based on ID
df2["ID"] = df2["ID"].astype(str)  #Making sure its a string

#IDs used for fitting/validating the model and the single ID held back as unseen data
train_ids = ["ID 1", "ID 2", "ID 3"]
unseen_id = "ID 4"

#Taking explicit copies: the PCA cell later adds PC columns to both frames, and writing
#into a boolean-filtered slice of df2 triggers pandas' SettingWithCopyWarning
df_train = df2[df2["ID"].isin(train_ids)].copy()
df_unseen = df2[df2["ID"] == unseen_id].copy()

In [None]:
#Preparing PCA input by defining the columns to include in the transformation
columns = ['Column 1', 'Column 2', 'Column 3', 'Column 4', 'Column 5']

#Number of principal components to keep; defined once so the PCA and the PC column names cannot drift apart
n_pcs = 4
pc_names = [f'PC{i+1}' for i in range(n_pcs)]

#Extracting and copying the selected columns from the training dataframe
data_train = df_train[columns].copy()

#Encoding 'ID' as integer codes; .cat.codes only exists on the pandas 'category' dtype
#NOTE(review): 'ID' has to be one of the entries in `columns` for this lookup to work - confirm
data_train['ID'] = data_train['ID'].astype('category').cat.codes

#Initializing a standard scaler to normalize the data
scaler_pca = StandardScaler()

#Fitting the scaler on training data and apply the transformation
data_train_scaled = scaler_pca.fit_transform(data_train)

#Initializing PCA to reduce dimensionality to n_pcs PCs
pca = PCA(n_components=n_pcs)

#Fitting PCA on the scaled training data and transforming it
pca_train = pca.fit_transform(data_train_scaled)

#Adding the principal components (PC1 to PC4) as new columns to df_train
#assign() builds a new frame, so nothing is written into a filtered slice of df2
df_train = df_train.assign(**dict(zip(pc_names, pca_train.T)))


#Applying the same PCA transformation to the unseen dataset

#Extracting and copying the same columns from the unseen dataset
data_unseen = df_unseen[columns].copy()

#Encode 'ID'
#NOTE(review): the codes are derived separately for each frame, so the same code can stand for
#different IDs in the training and the unseen data - confirm this is intended
data_unseen['ID'] = data_unseen['ID'].astype('category').cat.codes

#Applying the same scaling (scaler fitted on the training data only)
data_unseen_scaled = scaler_pca.transform(data_unseen)

#Applying the same PCA transformation (PCA fitted on the training data only)
pca_unseen = pca.transform(data_unseen_scaled)

#Adding PC1 to PC4 as new columns to df_unseen
df_unseen = df_unseen.assign(**dict(zip(pc_names, pca_unseen.T)))

In [None]:
#Naming the feature columns (principal components) and the target column once
pc_features = ['PC1', 'PC2', 'PC3', 'PC4']
target_column = 'Category_column'

#Features and target of the training data
X, y = df_train[pc_features], df_train[target_column]

#Features and target of the unseen data
X_unseen, y_unseen = df_unseen[pc_features], df_unseen[target_column]

In [None]:
#Configuring shuffled 10-fold cross-validation with a fixed seed
k_fold = KFold(n_splits=10, shuffle=True, random_state=66)

#Baseline Random Forest with fixed hyperparameters
RF = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=66, class_weight='balanced')

#One list per metric; each evaluated fold contributes one value to every list
accuracies, precisions, recalls, f1_scores = [], [], [], []

#Evaluating the model fold by fold
for train_index, test_index in k_fold.split(X, y):
    #Selecting the rows of the current fold
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    #A test set holding a single class is not evaluated
    if np.unique(y_test).size == 1:
        print("Skipping this fold due to only one class in test set.")
        continue

    #Fitting the scaler on the fold's training rows only, then applying it to both parts
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    #Fitting on the training part and predicting the test part
    RF.fit(X_train_scaled, y_train)
    pred_test = RF.predict(X_test_scaled)

    #Computing the four scores of this fold (weighted averages over the classes)
    fold_scores = (
        accuracy_score(y_test, pred_test),
        precision_score(y_test, pred_test, average='weighted', zero_division=0.0),
        recall_score(y_test, pred_test, average='weighted', zero_division=0.0),
        f1_score(y_test, pred_test, average='weighted'),
    )

    #Storing every score in its metric list
    for metric_list, fold_score in zip((accuracies, precisions, recalls, f1_scores), fold_scores):
        metric_list.append(fold_score)

#Reporting the averages over all evaluated folds (plus the spread of the accuracy)
print(f"Mean Accuracy: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Mean Precision: {np.mean(precisions):.4f}")
print(f"Mean Recall: {np.mean(recalls):.4f}")
print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")


In [None]:
#Hyperparameter values to try during the grid search
#random_state and class_weight hold a single value each, so they are the same for every candidate
#NOTE(review): every min_samples_leaf value is >= 10, so a node needs at least 20 samples before it can be split;
#the min_samples_split values (5, 10, 15) probably never change the fitted trees - confirm before spending grid time on them
parameters_grid = dict(
    n_estimators=[75, 100, 125],
    max_features=['sqrt', 'log2'],
    max_depth=[15, 25, 50],
    min_samples_split=[5, 10, 15],
    min_samples_leaf=[10, 15, 20],
    criterion=['gini', 'entropy'],
    random_state=[66],
    class_weight=['balanced'],
)

#Standardizing the full training feature set
scaler = StandardScaler()
X_scaled_full = scaler.fit_transform(X)

#Fresh Random Forest with default settings; the grid supplies the hyperparameters
RF = RandomForestClassifier()

#Grid search over all combinations, scored by accuracy on the k_fold splits (n_jobs=-1 runs the fits in parallel)
CV_RF = GridSearchCV(RF, parameters_grid, scoring='accuracy', cv=k_fold, n_jobs=-1)

#Fitting every candidate on the scaled training data
CV_RF.fit(X_scaled_full, y)

#Showing the combination with the best cross-validated score
print('Best parameters: ', CV_RF.best_params_)


In [None]:
#Getting the best hyperparameter combination from the GridSearchCV results
best_params = CV_RF.best_params_

#Initializing a new RandomForestClassifier using hyperparameters found
RF_best = RandomForestClassifier(**best_params)

#Re-deriving the scaled matrices from X_train / X_test (the split left by the last cross-validation fold)
#instead of reusing X_train_scaled / X_test_scaled from that loop: when the last fold was skipped via
#`continue`, those arrays still belong to an earlier fold and no longer match y_train / y_test.
#The scaler is refitted on X_train and kept in `scaler`, because the unseen-data cell transforms
#X_unseen with `scaler`; this way train, test and unseen data all share one scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Training the model on the scaled training data
RF_best.fit(X_train_scaled, y_train)

#Predicting target values from test data
pred_test = RF_best.predict(X_test_scaled)

#Predicting target values from train data
pred_train = RF_best.predict(X_train_scaled)

In [None]:
#Scoring the predictions on the training rows; precision, recall and F1 are weighted averages over the classes
acc_train = accuracy_score(y_train, pred_train)
precision_train, recall_train, f1_train = (
    metric(y_train, pred_train, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

#Reporting the scores rounded to 4 decimals
print(f'Train Accuracy: {acc_train:.4f}')
print(f'Train Precision: {precision_train:.4f}')
print(f'Train Recall: {recall_train:.4f}')
print(f'Train F1 Score: {f1_train:.4f}')


#Counting true vs predicted labels
cm = confusion_matrix(y_train, pred_train)

#Tick labels derived from the classes present in y_train
class_labels = [f"Class {i}" for i in np.unique(y_train)]

#Drawing the counts as an annotated heatmap; heatmap() returns the Axes it drew on
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                 xticklabels=class_labels, yticklabels=class_labels)

#Labelling the axes: columns are predictions, rows are true labels
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
#Scoring the predictions on the test rows; precision, recall and F1 are weighted averages over the classes
#NOTE(review): only recall passes zero_division here; precision and F1 use the library default - confirm this is intended
weighted = dict(average='weighted')
acc_test = accuracy_score(y_test, pred_test)
precision_test = precision_score(y_test, pred_test, **weighted)
recall_test = recall_score(y_test, pred_test, zero_division=0.0, **weighted)
f1_test = f1_score(y_test, pred_test, **weighted)

#Reporting the scores rounded to 4 decimals
print(f'Test Accuracy: {acc_test:.4f}')
print(f'Test Precision: {precision_test:.4f}')
print(f'Test Recall: {recall_test:.4f}')
print(f'Test F1 Score: {f1_test:.4f}')

#Using every class of the full training target, so classes missing from this test split still get a row/column
all_labels = np.unique(y)

#Counting true vs predicted labels over that fixed label set
cm = confusion_matrix(y_test, pred_test, labels=all_labels)

#Drawing the counts as an annotated heatmap; heatmap() returns the Axes it drew on
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                 xticklabels=all_labels, yticklabels=all_labels)

#Labelling the axes: columns are predictions, rows are true labels
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
#Scaling the unseen input data with the scaler fitted earlier on the training features
X_unseen_scaled = scaler.transform(X_unseen)

#Using the best-trained Random Forest model to predict on the unseen data
y_pred_unseen = RF_best.predict(X_unseen_scaled)

#Calculating the performance metrics for the unseen data (weighted averages over the classes)
acc_unseen = accuracy_score(y_unseen, y_pred_unseen)
precision_unseen = precision_score(y_unseen, y_pred_unseen, average='weighted', zero_division=0)
recall_unseen = recall_score(y_unseen, y_pred_unseen, average='weighted', zero_division=0)
f1_unseen = f1_score(y_unseen, y_pred_unseen, average='weighted', zero_division=0)

#Printing evaluation results
print(f'Unseen Accuracy: {acc_unseen:.4f}')
print(f'Unseen Precision: {precision_unseen:.4f}')
print(f'Unseen Recall: {recall_unseen:.4f}')
print(f'Unseen F1 Score: {f1_unseen:.4f}')

#Fixing the label set explicitly: every class the model can predict plus every class present in the unseen target.
#Without `labels`, the matrix only covers classes that occur in y_unseen / y_pred_unseen, so its size can differ
#from the number of tick labels and the heatmap would be mislabelled or fail.
unseen_labels = np.union1d(RF_best.classes_, y_unseen)

#Generating the confusion matrix for the unseen data
cm_unseen = confusion_matrix(y_unseen, y_pred_unseen, labels=unseen_labels)

#Defining class labels in the same order as the matrix rows/columns
class_labels = [f"Class {i}" for i in unseen_labels]

#Visualizing the confusion matrix as a heatmap
sns.heatmap(
    cm_unseen,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
#Defining a function to log model evaluation metrics and hyperparameters to an Excel-file of chosen name
def log_results(model_name, params, accuracy_train, precision_train, recall_train, f1_train,
                accuracy_test, precision_test, recall_test, f1_test,
                accuracy_unseen, precision_unseen, recall_unseen, f1_unseen,
                filename="Results.xlsx"):
    """Append one row of hyperparameters and metrics to the sheet `model_name` of an Excel file.

    The row holds the entries of `params` followed by the accuracy, precision, recall and F1
    values for the train, test and unseen data. If `filename` does not exist it is created;
    otherwise the row is appended to the rows already stored in the sheet (the sheet is
    created when it is missing).

    Args:
        model_name: Name of the sheet to write to; also used in the printed messages.
        params: Mapping of hyperparameter names to values; each key becomes a column.
        accuracy_train, precision_train, recall_train, f1_train: Scores on the training data.
        accuracy_test, precision_test, recall_test, f1_test: Scores on the test data.
        accuracy_unseen, precision_unseen, recall_unseen, f1_unseen: Scores on the unseen data.
        filename: Path of the Excel workbook.

    Returns:
        None. A confirmation is printed only when the row was written. When appending
        fails with a PermissionError, an error message is printed instead and nothing is logged.
    """

    #Creating DataFrame for this model run
    result = pd.DataFrame([{
        **params,
        "Accuracy_Train": accuracy_train,
        "Precision_Train": precision_train,
        "Recall_Train": recall_train,
        "F1_Train": f1_train,
        "Accuracy_Test": accuracy_test,
        "Precision_Test": precision_test,
        "Recall_Test": recall_test,
        "F1_Test": f1_test,
        "Accuracy_Unseen": accuracy_unseen,
        "Precision_Unseen": precision_unseen,
        "Recall_Unseen": recall_unseen,
        "F1_Unseen": f1_unseen
    }])

    #Introducing a short delay to avoid file conflicts if running in multiple notebooks
    time.sleep(1)

    if not os.path.exists(filename):
        #If file doesn't exist, creating a new one
        with pd.ExcelWriter(filename, engine="openpyxl", mode="w") as writer:
            result.to_excel(writer, sheet_name=model_name, index=False)
    else:
        #If file exists, appending to it
        try:
            with pd.ExcelWriter(filename, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
                #Reading existing sheet
                try:
                    existing_df = pd.read_excel(filename, sheet_name=model_name, engine="openpyxl")
                    df_combined = pd.concat([existing_df, result], ignore_index=True)
                except (FileNotFoundError, ValueError):
                    df_combined = result  #If sheet does not exist, creating it

                #Writing old rows plus the new row over the sheet, starting at the top
                df_combined.to_excel(writer, sheet_name=model_name, index=False)

        except PermissionError:
            #Warning the user and stopping here: nothing was written, so no confirmation must follow
            print(f"Error: Close the Excel file ({filename}) before running the script again.")
            return

    #Printing a confirmation to ensure user results are logged
    print(f"Logged results for {model_name}: Train Accuracy={accuracy_train:.4f}, Test Accuracy={accuracy_test:.4f}, Unseen Accuracy={accuracy_unseen:.4f}")



In [None]:
#Collecting the train, test and unseen scores of the RFC model under the keyword names log_results expects
rfc_pca_scores = dict(
    accuracy_train=acc_train,
    precision_train=precision_train,
    recall_train=recall_train,
    f1_train=f1_train,
    accuracy_test=acc_test,
    precision_test=precision_test,
    recall_test=recall_test,
    f1_test=f1_test,
    accuracy_unseen=acc_unseen,
    precision_unseen=precision_unseen,
    recall_unseen=recall_unseen,
    f1_unseen=f1_unseen,
)

#Logging the scores together with the tuned hyperparameters to the sheet "RFC_PCA"
log_results(model_name="RFC_PCA", params=best_params, **rfc_pca_scores)
