In [1]:
#Improting all necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import time

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from openpyxl import load_workbook

In [2]:
#Reading the excel-document into a df
df = pd.read_excel('dataset_name.xlsx')

#dropping unwanted columns from the df
df2=df.drop(columns = ["Column 1", "Column 2"])


In [3]:
#Splitting df into two based on ID
df2["ID"] = df2["ID"].astype(str)  #Making sure its a string
df_train = df2[df2["ID"].isin(["ID 1", "ID 2", "ID 3"])]
df_unseen = df2[df2["ID"] == "ID 4"]

In [4]:
#defining which columns to keep av model input
X = df_train.drop(columns = ['Column 1', 'Column 2', 'Column 3'])

#defining which columns are the model outputs
y = df_train['Column 4']


#defining inputs and outputs for unseen dataset
X_unseen = df_unseen.drop(columns = ['Column 1', 'Column 2', 'Column 3')
y_unseen = df_unseen['Column 4']


In [5]:
#Initializing a StandardScaler to normalize the features
scaler = StandardScaler()

#Fitting the scaler on the training data only to avoid data leakage
scaler.fit(X)

#Transforming both training data and unseen data using the fitted scaler
X_scaled = scaler.transform(X)
X_unseen_scaled = scaler.transform(X_unseen)

#Converting the scaled arrays back to DataFrames for easier column access later
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_unseen_scaled = pd.DataFrame(X_unseen_scaled, columns=X.columns)

#Setting up K-Fold cross-validation (with 10 splits, shuffling, and a fixed random seed for reproducibility)
k_fold = KFold(n_splits=10, random_state=66, shuffle=True)

#Getting the number of splits
k_fold.get_n_splits(X_scaled, y)

#Initializing a logistic regression model with increased max_iter to ensure convergence
LR = LogisticRegression(max_iter=4500)

#Looping through the K-Fold splits with a for-loop
for train_index, test_index in k_fold.split(X_scaled, y):
    #Splitting the scaled data into training and testing sets for this fold
    X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    


In [16]:
#Defining the hyperparameter grid to search over.
#'C' is the regularization strength
#'solver' specifies the algorithm to use in the optimization problem
#'penalty' is the norm used in the penalization
param_grid = [
    {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga', 'lbfgs'], 'penalty': ['l2', 'l1']},
]

#Seting up GridSearchCV to perform hyperparameter tuning on the logistic regression model.
#cv=5: 5-fold cross-validation
#scoring='f1_weighted': use weighted F1-score as evaluation metric
#n_jobs=-1: use all available CPU cores for faster computation
CV_LR = GridSearchCV(LR, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)

#Fitting the model using the training data from one of the cross-validation splits
CV_LR.fit(X_train, y_train)

#Printing the best hyperparameter combination found by the grid search
print('Best parameters: ', CV_LR.best_params_)

Best parameters:  {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}


In [29]:
#Extracting the best hyperparameters found during GridSearchCV
best_params = CV_LR.best_params_

#Defining class weights to handle class imbalance
class_weights = 'balanced'

#Initializing a new Logistic Regression model using the best parameters from the grid search,
#Setting a higher max_iter to ensure convergence and using class_weight='balanced'
LR_best = LogisticRegression(**best_params, max_iter=3000, class_weight=class_weights)

#Fitting the model on the training data
LR_best.fit(X_train, y_train)

#Predicting on the training set to evaluate performance
pred_train = LR_best.predict(X_train)

#Calculating performance metrics on the training data
acc_train = accuracy_score(y_train, pred_train)                           #Overall accuracy
precision_train = precision_score(y_train, pred_train, average='weighted')#Weighted precision
recall_train = recall_score(y_train, pred_train, average='weighted')      #Weighted recall
f1_train = f1_score(y_train, pred_train, average='weighted')              #Weighted F1-score

#Printing metrics and evaluation details
print("Accuracy:", acc_train)
print("Classification Report:\n", classification_report(y_train, pred_train))
print("Confusion Matrix:\n", confusion_matrix(y_train, pred_train))

Accuracy: 0.6535947712418301
Classification Report:
               precision    recall  f1-score   support

           1       0.73      0.92      0.81        38
           2       0.69      0.85      0.76        26
           3       0.54      0.52      0.53        42
           4       0.66      0.45      0.53        47

    accuracy                           0.65       153
   macro avg       0.65      0.68      0.66       153
weighted avg       0.65      0.65      0.64       153

Confusion Matrix:
 [[35  1  1  1]
 [ 0 22  2  2]
 [10  2 22  8]
 [ 3  7 16 21]]


In [30]:
#Making predictions on the test data
pred_test = LR_best.predict(X_test)

#Calculating performance metrics on the test data
acc_test = accuracy_score(y_test, pred_test)                              #Overall accuracy on test set
precision_test = precision_score(y_test, pred_test, average='weighted')   #Weighted precision
recall_test = recall_score(y_test, pred_test, average='weighted')         #Weighted recall
f1_test = f1_score(y_test, pred_test, average='weighted')                 #Weighted F1-score

#Printing evaluation results
print("Accuracy:", accuracy_score(y_test, pred_test))                     
print("Classification Report:\n", classification_report(
    y_test, pred_test, zero_division=1))                                   
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_test))        

Accuracy: 0.375
Classification Report:
               precision    recall  f1-score   support

           1       0.50      1.00      0.67         2
           2       0.40      0.67      0.50         3
           3       0.33      0.17      0.22         6
           4       0.25      0.20      0.22         5

    accuracy                           0.38        16
   macro avg       0.37      0.51      0.40        16
weighted avg       0.34      0.38      0.33        16

Confusion Matrix:
 [[2 0 0 0]
 [0 2 0 1]
 [1 2 1 2]
 [1 1 2 1]]


In [31]:
#Making predictions on the unseen data
pred_unseen = LR_best.predict(X_unseen_scaled)

#Calculating performance metrics on the unseen data
acc_unseen = accuracy_score(y_unseen, pred_unseen)                            #Overall accuracy on the unseen data
precision_unseen = precision_score(y_unseen, pred_unseen, average='weighted') #Weighted precision
recall_unseen = recall_score(y_unseen, pred_unseen, average='weighted')       #Weighted recall
f1_unseen = f1_score(y_unseen, pred_unseen, average='weighted')               #Weighted F1-score

#Printingevaluation results
print("Accuracy:", accuracy_score(y_unseen, pred_unseen))
print("Classification Report:\n", classification_report(y_unseen, pred_unseen, zero_division=1))
print("Confusion Matrix:\n", confusion_matrix(y_unseen, pred_unseen))

Accuracy: 0.21428571428571427
Classification Report:
               precision    recall  f1-score   support

           1       0.21      1.00      0.35        15
           2       1.00      0.00      0.00        26
           3       1.00      0.00      0.00        29

    accuracy                           0.21        70
   macro avg       0.74      0.33      0.12        70
weighted avg       0.83      0.21      0.08        70

Confusion Matrix:
 [[15  0  0]
 [26  0  0]
 [29  0  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
#Defining a function to log model evaluation metrics and hyperparameters to an Excel-file of chosen name
def log_results(model_name, params, accuracy_train, precision_train, recall_train, f1_train,
                accuracy_test, precision_test, recall_test, f1_test,
                accuracy_unseen, precision_unseen, recall_unseen, f1_unseen,
                filename="Excel_name.xlsx"):
    """ Logs classification model results in separate sheets within the same Excel file, ensuring appending works correctly. """

    #Createing DataFrame for this model run
    result = pd.DataFrame([{
        **params,  #Storing hyperparameters
        "Accuracy_Train": accuracy_train,
        "Precision_Train": precision_train,
        "Recall_Train": recall_train,
        "F1_Train": f1_train,
        "Accuracy_Test": accuracy_test,
        "Precision_Test": precision_test,
        "Recall_Test": recall_test,
        "F1_Test": f1_test,
        "Accuracy_Unseen": accuracy_unseen,
        "Precision_Unseen": precision_unseen,
        "Recall_Unseen": recall_unseen,
        "F1_Unseen": f1_unseen
    }])

    #Introducing a short delay to avoid file conflicts if running in multiple notebooks
    time.sleep(1)

    #Checking if the file exists
    file_exists = os.path.exists(filename)

    if not file_exists:
        #If file doesn't exist, creating a new one
        with pd.ExcelWriter(filename, engine="openpyxl", mode="w") as writer:
            result.to_excel(writer, sheet_name=model_name, index=False)
    else:
        #If file exists, loading it before appending
        try:
            with pd.ExcelWriter(filename, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
                #Reading existing sheet
                try:
                    existing_df = pd.read_excel(filename, sheet_name=model_name, engine="openpyxl")
                    df_combined = pd.concat([existing_df, result], ignore_index=True)
                except (FileNotFoundError, ValueError):
                    df_combined = result  #If sheet does not exist, creating it

                #Saving results, ensuring correct appending
                df_combined.to_excel(writer, sheet_name=model_name, index=False)
        #Printing error to warn user
        except PermissionError:
            print(f"Error: Close the Excel file ({filename}) before running the script again.")
    #printing a confirmation to ensure user results are logged
    print(f"Logged results for {model_name}: Train Accuracy={accuracy_train:.4f}, Test Accuracy={accuracy_test:.4f}, Unseen Accuracy={accuracy_unseen:.4f}")



In [33]:
#Using the function to log results from LR model
log_results(
    model_name="LR",
    params=best_params,

    accuracy_train=acc_train,
    precision_train=precision_train,
    recall_train=recall_train,
    f1_train=f1_train,

    accuracy_test=acc_test,
    precision_test=precision_test,
    recall_test=recall_test,
    f1_test=f1_test,

    accuracy_unseen=acc_unseen,
    precision_unseen=precision_unseen,
    recall_unseen=recall_unseen,
    f1_unseen=f1_unseen
)

✅ Logged results for LR-Kfold: Train Accuracy=0.6536, Test Accuracy=0.3750, Unseen Accuracy=0.2143
