In [None]:
#Importing the necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import time

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from openpyxl import load_workbook

In [None]:
#Loading the Excel workbook into a DataFrame
df = pd.read_excel('dataset_name.xlsx')

#Removing the columns that are not needed further on
unwanted_columns = ["Column 1", "Column 2"]
df2 = df.drop(unwanted_columns, axis="columns")

In [None]:
#Casting the ID column to string so the comparisons below are done on text
df2["ID"] = df2["ID"].astype(str)

#Rows belonging to the IDs used for model development
train_ids = ["ID 1", "ID 2", "ID 3"]
is_train_row = df2["ID"].isin(train_ids)
df_train = df2.loc[is_train_row]

#Rows belonging to the ID that is held back as unseen data
is_unseen_row = df2["ID"].eq("ID 4")
df_unseen = df2.loc[is_unseen_row]

In [None]:
#Defining the target column and every column that must not be used as model input
target_column = 'Column 4'
non_feature_columns = ['Column 1', 'Column 2', 'Column 3', 'ID', target_column]

#Defining which columns to keep as model input
#errors='ignore' is required because 'Column 1' and 'Column 2' were already removed when df2 was created.
#The ID column (strings) and the target are excluded so they can neither break the scaler nor leak the label.
X = df_train.drop(columns=non_feature_columns, errors='ignore')

#Defining which column is the model output
y = df_train[target_column]

#Setting up a shuffled, seeded 5-fold splitter
Kfold = KFold(n_splits=5, random_state=66, shuffle=True)

#NOTE: only ONE train/test split is used by the following cells, so this is a single hold-out
#split (4 folds for training, 1 fold for testing) and not a cross-validation.
#The last of the five folds is selected explicitly as the test fold.
train_index, test_index = list(Kfold.split(X, y))[-1]

#Selecting the training and test data using the indices of that split
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]



In [None]:
#Creating the scaler used to standardize the feature values
scaler = StandardScaler()

#Learning the scaling statistics from the training data only
scaler.fit(X_train)

#Applying the fitted scaler to the training data and to the test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
#Defining the logistic regression model
#random_state is fixed because the 'liblinear' and 'saga' solvers shuffle the data internally;
#without it the selected hyperparameters can differ from one run to the next
LR = LogisticRegression(max_iter=5000, class_weight='balanced', random_state=66)

#Regularization strengths that are tried for every solver
C_values = [1, 10, 50, 100]

#Defining a grid of hyperparameters to search over (one entry per solver/penalty combination)
param_grid = [
    {'C': C_values,
        'solver': ['liblinear'],
        'penalty': ['l2'],
        'class_weight': ['balanced']},
    {'C': C_values,
        'solver': ['lbfgs', 'newton-cg'],
        'penalty': ['l2'],
        'class_weight': ['balanced']},
    {'C': C_values,
        'solver': ['saga'],
        'penalty': ['l1'],
        'class_weight': ['balanced']}]

#Setting up GridSearchCV to tune the logistic regression model (3-fold CV inside the training data)
CV_LR = GridSearchCV(LR, param_grid, cv=3)

#Fitting the model on the scaled training data
CV_LR.fit(X_train_scaled, y_train)

#Printing the best hyperparameter combination found
print('Best parameters: ', CV_LR.best_params_)


In [None]:
#Getting the best hyperparameters from the GridSearchCV results (also used later for logging)
best_params = CV_LR.best_params_

#Taking the model that GridSearchCV already refitted on the full scaled training data.
#This guarantees that every setting (including max_iter) is exactly the one used during the search,
#and avoids fitting the same model a second time.
LR_best = CV_LR.best_estimator_

#Making predictions on the training data
y_pred_train = LR_best.predict(X_train_scaled)

#Evaluating model performance on the training set
#zero_division=0 counts a class that is never predicted as 0 instead of emitting a warning
acc_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train, average='weighted', zero_division=0)
recall_train = recall_score(y_train, y_pred_train, average='weighted', zero_division=0)
f1_train = f1_score(y_train, y_pred_train, average='weighted', zero_division=0)

#Printing the evaluation metrics (same zero_division as above so the report matches the stored values)
print("Accuracy:", acc_train)
print("Classification Report:\n", classification_report(y_train, y_pred_train, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))


In [None]:
#Making predictions on the scaled test data using the tuned logistic regression model
y_pred_test = LR_best.predict(X_test_scaled)

#Calculating performance metrics on the test set
#zero_division=0 counts a class that is never predicted as 0 instead of emitting a warning
acc_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test, average='weighted', zero_division=0)
recall_test = recall_score(y_test, y_pred_test, average='weighted', zero_division=0)
f1_test = f1_score(y_test, y_pred_test, average='weighted', zero_division=0)

#Printing the evaluation results for the test set (same zero_division as above so the report matches the stored values)
print("Accuracy:", acc_test)
print("Classification Report:\n", classification_report(y_test, y_pred_test, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))


In [None]:
#Defining inputs and outputs for unseen dataset
#Selecting exactly the feature columns used for training, so the scaler and the model
#receive the same columns in the same order as when they were fitted
X_unseen = df_unseen[X.columns]
y_unseen = df_unseen['Column 4']

#Scaling the unseen features using the same scaler fitted on the training data
X_unseen_scaled = scaler.transform(X_unseen)

#Making predictions on the unseen data
y_pred_unseen = LR_best.predict(X_unseen_scaled)

#Calculating evaluation metrics for the unseen data
#zero_division=0 counts a class that is never predicted as 0 instead of emitting a warning
acc_unseen = accuracy_score(y_unseen, y_pred_unseen)
precision_unseen = precision_score(y_unseen, y_pred_unseen, average='weighted', zero_division=0)
recall_unseen = recall_score(y_unseen, y_pred_unseen, average='weighted', zero_division=0)
f1_unseen = f1_score(y_unseen, y_pred_unseen, average='weighted', zero_division=0)

#Printing performance metrics for the unseen data (same zero_division as above so the report matches the stored values)
print("Accuracy:", acc_unseen)
print("Classification Report:\n", classification_report(y_unseen, y_pred_unseen, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_unseen, y_pred_unseen))

In [None]:
#Defining a function to log model evaluation metrics and hyperparameters to an Excel-file of chosen name
def log_results(model_name, params, accuracy_train, precision_train, recall_train, f1_train,
                accuracy_test, precision_test, recall_test, f1_test,
                accuracy_unseen, precision_unseen, recall_unseen, f1_unseen,
                filename="Results.xlsx"):
    """Append one row with hyperparameters and metrics to the sheet `model_name` of an Excel file.

    The file is created if it does not exist. If it exists, the rows already stored in the
    sheet `model_name` are read, the new row is appended, and the sheet is written back.
    Other sheets in the workbook are left in place.

    Parameters
    ----------
    model_name : str
        Name of the sheet the row is written to.
    params : dict
        Hyperparameters of the run; every key becomes a column.
    accuracy_train, precision_train, recall_train, f1_train : float
        Metrics on the training data.
    accuracy_test, precision_test, recall_test, f1_test : float
        Metrics on the test data.
    accuracy_unseen, precision_unseen, recall_unseen, f1_unseen : float
        Metrics on the unseen data.
    filename : str, default "Results.xlsx"
        Path of the Excel workbook.

    Returns
    -------
    None
        A confirmation is printed when the row was written; if the existing file cannot be
        opened (PermissionError) a warning is printed instead and nothing is logged.
    """

    #Creating DataFrame for this model run
    result = pd.DataFrame([{
        **params,
        "Accuracy_Train": accuracy_train,
        "Precision_Train": precision_train,
        "Recall_Train": recall_train,
        "F1_Train": f1_train,
        "Accuracy_Test": accuracy_test,
        "Precision_Test": precision_test,
        "Recall_Test": recall_test,
        "F1_Test": f1_test,
        "Accuracy_Unseen": accuracy_unseen,
        "Precision_Unseen": precision_unseen,
        "Recall_Unseen": recall_unseen,
        "F1_Unseen": f1_unseen
    }])

    #Introducing a short delay to reduce the chance of file conflicts if running in multiple notebooks
    #(best effort only: this is not a file lock)
    time.sleep(1)

    if not os.path.exists(filename):
        #If file doesn't exist, creating a new one
        with pd.ExcelWriter(filename, engine="openpyxl", mode="w") as writer:
            result.to_excel(writer, sheet_name=model_name, index=False)
    else:
        try:
            #Reading the existing sheet BEFORE the writer opens the file, so the workbook is
            #never read while it is already open for writing
            try:
                existing_df = pd.read_excel(filename, sheet_name=model_name, engine="openpyxl")
                df_combined = pd.concat([existing_df, result], ignore_index=True)
            except (FileNotFoundError, ValueError):
                df_combined = result  #Creating sheet if it does not exist

            #Writing the combined rows back into the sheet; other sheets are kept
            with pd.ExcelWriter(filename, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
                df_combined.to_excel(writer, sheet_name=model_name, index=False)

        except PermissionError:
            #Printing error to warn user, and returning so no false confirmation is printed
            print(f"Error: Close the Excel file ({filename}) before running the script again.")
            return

    #Printing a confirmation so the user knows the results were logged
    print(f"Logged results for {model_name}: Train Accuracy={accuracy_train:.4f}, Test Accuracy={accuracy_test:.4f}, Unseen Accuracy={accuracy_unseen:.4f}")


In [None]:
#Collecting the metrics of the LR model under the keyword names that log_results expects
lr_metrics = {
    "accuracy_train": acc_train,
    "precision_train": precision_train,
    "recall_train": recall_train,
    "f1_train": f1_train,

    "accuracy_test": acc_test,
    "precision_test": precision_test,
    "recall_test": recall_test,
    "f1_test": f1_test,

    "accuracy_unseen": acc_unseen,
    "precision_unseen": precision_unseen,
    "recall_unseen": recall_unseen,
    "f1_unseen": f1_unseen,
}

#Logging the hyperparameters and metrics of the LR model to its own sheet
log_results(model_name="LR_stat", params=best_params, **lr_metrics)
