In [None]:
#Importing all necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from openpyxl import load_workbook

In [None]:
#Loading the Excel workbook into a DataFrame
df = pd.read_excel('dataset_name.xlsx')

#Removing the columns that are not needed; drop() returns a new frame, so df itself is left untouched
unwanted_columns = ["Column 1", "Column 2"]
df2 = df.drop(unwanted_columns, axis=1)


In [None]:
#Splitting df into two based on ID
df2["ID"] = df2["ID"].astype(str)  #Making sure its a string

#Taking explicit copies: both frames get new columns assigned later on, and writing
#into a boolean-mask slice of df2 would trigger pandas' SettingWithCopyWarning
df_train = df2[df2["ID"].isin(["ID 1", "ID 2", "ID 3"])].copy()
df_unseen = df2[df2["ID"] == "ID 4"].copy()

In [None]:
#Preparing PCA input by defining the columns to include in the transformation
#NOTE(review): every name listed here must still exist in df_train - confirm that none of
#them was removed when the unwanted columns were dropped from df
columns = ['Column 1', 'Column 2', 'Column 3', 'Column 4', 'Column 5']

#Number of principal components to keep, and the column names they are stored under
n_components = 4
pc_columns = [f'PC{i+1}' for i in range(n_components)]


def build_pca_input(frame):
    """Return a copy of the PCA feature columns of `frame` with 'ID' added as integer codes.

    'ID' is read from `frame` itself, so the encoding works whether or not 'ID' is
    listed in `columns`.
    """
    data = frame[columns].copy()
    #'category' is the pandas dtype name; .cat.codes maps each distinct ID to an integer
    #NOTE(review): the codes are assigned per frame, so the same integer can stand for
    #different IDs in the training and the unseen data - confirm 'ID' belongs in the PCA input
    data['ID'] = frame['ID'].astype('category').cat.codes
    return data


#Extracting the selected columns from the training dataframe and encoding 'ID'
data_train = build_pca_input(df_train)

#Initializing a standard scaler to normalize the data
scaler_pca = StandardScaler()

#Fitting the scaler on training data and apply the transformation
data_train_scaled = scaler_pca.fit_transform(data_train)

#Initializing PCA to reduce dimensionality to n_components PCs
pca = PCA(n_components=n_components)

#Fitting PCA on the scaled training data and transforming it
pca_train = pca.fit_transform(data_train_scaled)

#Adding the principal components as new columns to df_train
#assign() returns a new frame, so nothing is written into a slice of another frame
df_train = df_train.assign(**{name: pca_train[:, i] for i, name in enumerate(pc_columns)})


#Applying the same PCA transformation to the unseen dataset

#Extracting the same columns from the unseen dataset and encoding 'ID'
data_unseen = build_pca_input(df_unseen)

#Applying the scaler that was fitted on the training data (no re-fitting)
data_unseen_scaled = scaler_pca.transform(data_unseen)

#Applying the PCA that was fitted on the training data (no re-fitting)
pca_unseen = pca.transform(data_unseen_scaled)

#Adding the principal components as new columns to df_unseen
df_unseen = df_unseen.assign(**{name: pca_unseen[:, i] for i, name in enumerate(pc_columns)})

In [None]:
#Selecting the principal-component features and the class label for both datasets
pc_features = ['PC1', 'PC2', 'PC3', 'PC4']
target = 'Category_column'

#Training data
X, y = df_train[pc_features], df_train[target]

#Held-out (unseen) data
X_unseen, y_unseen = df_unseen[pc_features], df_unseen[target]

In [None]:
#Defining a new scaler that has not been fitted
scaler = StandardScaler()

#Fitting the scaler on the full feature set X
#NOTE(review): the scaler is fitted before the fold split below, so the rows that end up
#in X_test also contribute to the scaling statistics - confirm this is acceptable
scaler.fit(X)

#Transforming both training and unseen datasets using scaler
X_scaled = scaler.transform(X)
X_unseen_scaled = scaler.transform(X_unseen)

#Converting the scaled arrays back to DataFrames for readability
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_unseen_scaled = pd.DataFrame(X_unseen_scaled, columns=X.columns)


#Using K-Fold cross-validation to split the data into 10 folds
#shuffle + fixed random_state make the fold assignment reproducible
k_fold = KFold(n_splits=10, random_state=66, shuffle=True)


#Initializing the LR model
LR = LogisticRegression(max_iter=4500)

#Looping over the (train_index, test_index) pairs produced by k_fold
#Each pass overwrites the previous split, so after the loop X_train/X_test/y_train/y_test
#hold the split of the final fold only
for train_index, test_index in k_fold.split(X_scaled, y):
    # Split the data based on the current fold (positional indices, hence .iloc)
    X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
#Hyperparameter grid for logistic regression: each solver group is tried with the same C values
c_values = [0.001, 0.01, 0.05]
param_grid = [
    {'C': c_values, 'solver': ['liblinear'], 'penalty': ['l2']},
    {'C': c_values, 'solver': ['lbfgs', 'newton-cg'], 'penalty': ['l2']},
    {'C': c_values, 'solver': ['saga'], 'penalty': ['l1']},
]

#Setting up the grid search: candidates are ranked by weighted F1 over the k_fold splits,
#using all available cores
CV_LR = GridSearchCV(
    estimator=LR,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=k_fold,
    n_jobs=-1,
)

#Running the search on the training data
CV_LR.fit(X_train, y_train)

#Showing the winning hyperparameter combination
print('Best parameters: ', CV_LR.best_params_)

In [None]:
#Keeping the winning hyperparameter combination from the grid search
best_params = CV_LR.best_params_

#Building a fresh logistic regression from those hyperparameters and fitting it on the training data
#NOTE(review): max_iter and class_weight set here differ from the LR estimator that was
#passed to the grid search - confirm the mismatch is intended
LR_best = LogisticRegression(max_iter=2000, class_weight='balanced', **best_params)
LR_best.fit(X_train, y_train)

#Predicting the training data itself
pred_train = LR_best.predict(X_train)

#Training-set metrics (support-weighted averages across classes)
acc_train = accuracy_score(y_train, pred_train)
precision_train = precision_score(y_train, pred_train, zero_division=1, average='weighted')
recall_train = recall_score(y_train, pred_train, average='weighted')
f1_train = f1_score(y_train, pred_train, average='weighted')

#Reporting accuracy, the per-class report and the confusion matrix
report_train = classification_report(y_train, pred_train, zero_division=1)
matrix_train = confusion_matrix(y_train, pred_train)
print("Accuracy:", acc_train)
print("Classification Report:\n", report_train)
print("Confusion Matrix:\n", matrix_train)

In [None]:
#Predicting the held-out fold with the best logistic regression model
pred_test = LR_best.predict(X_test)

#Test-set metrics (support-weighted averages across classes)
acc_test = accuracy_score(y_test, pred_test)
precision_test = precision_score(y_test, pred_test, zero_division=1, average='weighted')
recall_test = recall_score(y_test, pred_test, zero_division=1, average='weighted')
f1_test = f1_score(y_test, pred_test, average='weighted')

#Reporting accuracy, the per-class report and the true-vs-predicted class counts
report_test = classification_report(y_test, pred_test, zero_division=1)
matrix_test = confusion_matrix(y_test, pred_test)
print("Accuracy:", acc_test)
print("Classification Report:\n", report_test)
print("Confusion Matrix:\n", matrix_test)

In [None]:
#Predicting the unseen dataset with the trained logistic regression model
pred_unseen = LR_best.predict(X_unseen_scaled)

#Unseen-set metrics (support-weighted averages across classes)
acc_unseen = accuracy_score(y_unseen, pred_unseen)
precision_unseen = precision_score(y_unseen, pred_unseen, zero_division=1, average='weighted')
recall_unseen = recall_score(y_unseen, pred_unseen, zero_division=1, average='weighted')
f1_unseen = f1_score(y_unseen, pred_unseen, average='weighted')

#Reporting accuracy, the per-class report and the predicted-vs-true class distribution
report_unseen = classification_report(y_unseen, pred_unseen, zero_division=1)
matrix_unseen = confusion_matrix(y_unseen, pred_unseen)
print("Accuracy:", acc_unseen)
print("Classification Report:\n", report_unseen)
print("Confusion Matrix:\n", matrix_unseen)

In [None]:
#Defining a function to log model evaluation metrics and hyperparameters to an Excel-file of chosen name
def log_results(model_name, params, accuracy_train, precision_train, recall_train, f1_train,
                accuracy_test, precision_test, recall_test, f1_test,
                accuracy_unseen, precision_unseen, recall_unseen, f1_unseen,
                filename="Results.xlsx"):
    """Append one row of hyperparameters and metrics to the sheet `model_name` of `filename`.

    Parameters
    ----------
    model_name : str
        Name of the worksheet the row is stored in (one sheet per model).
    params : dict
        Hyperparameters of the run; each key becomes a column. A key that equals one of
        the metric column names is overwritten by the metric value.
    accuracy_*, precision_*, recall_*, f1_* : float
        Metrics for the train, test and unseen data.
    filename : str, default "Results.xlsx"
        Workbook to create or extend.

    Returns
    -------
    None
        The outcome is reported through print(): a confirmation after a successful
        write, or an error message when the existing workbook cannot be accessed
        (PermissionError). In the latter case nothing is written and no confirmation
        is printed.
    """

    #Creating DataFrame for this model run
    result = pd.DataFrame([{
        **params,
        "Accuracy_Train": accuracy_train,
        "Precision_Train": precision_train,
        "Recall_Train": recall_train,
        "F1_Train": f1_train,
        "Accuracy_Test": accuracy_test,
        "Precision_Test": precision_test,
        "Recall_Test": recall_test,
        "F1_Test": f1_test,
        "Accuracy_Unseen": accuracy_unseen,
        "Precision_Unseen": precision_unseen,
        "Recall_Unseen": recall_unseen,
        "F1_Unseen": f1_unseen
    }])

    #Introducing a short delay to avoid file conflicts if running in multiple notebooks
    #(best effort only: this is not a lock and cannot rule out concurrent writes)
    time.sleep(1)

    if not os.path.exists(filename):
        #If file doesn't exist, creating a new one
        with pd.ExcelWriter(filename, engine="openpyxl", mode="w") as writer:
            result.to_excel(writer, sheet_name=model_name, index=False)
    else:
        try:
            #Reading the existing rows first and closing the workbook again, so the file
            #is not read while the writer below has it open
            with pd.ExcelFile(filename, engine="openpyxl") as workbook:
                if model_name in workbook.sheet_names:
                    existing_df = workbook.parse(sheet_name=model_name)
                    df_combined = pd.concat([existing_df, result], ignore_index=True)
                else:
                    df_combined = result  #If sheet does not exist, creating it

            #Replacing the sheet as a whole (instead of overlaying it) so that no cells
            #of the previous version are left behind when the set of columns changes
            with pd.ExcelWriter(filename, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
                df_combined.to_excel(writer, sheet_name=model_name, index=False)

        except PermissionError:
            #Printing error to warn user, and stopping here so that no success
            #message is shown for a run that was not logged
            print(f"Error: Close the Excel file ({filename}) before running the script again.")
            return

    #Printing a confirmation to ensure user results are logged
    print(f"Logged results for {model_name}: Train Accuracy={accuracy_train:.4f}, Test Accuracy={accuracy_test:.4f}, Unseen Accuracy={accuracy_unseen:.4f}")



In [None]:
#Logging the results of the logistic regression (PCA) model
#The metrics are grouped per dataset; the keys match the keyword arguments of log_results
train_metrics = dict(
    accuracy_train=acc_train,
    precision_train=precision_train,
    recall_train=recall_train,
    f1_train=f1_train,
)
test_metrics = dict(
    accuracy_test=acc_test,
    precision_test=precision_test,
    recall_test=recall_test,
    f1_test=f1_test,
)
unseen_metrics = dict(
    accuracy_unseen=acc_unseen,
    precision_unseen=precision_unseen,
    recall_unseen=recall_unseen,
    f1_unseen=f1_unseen,
)

log_results(
    model_name="LR_PCA",
    params=best_params,
    **train_metrics,
    **test_metrics,
    **unseen_metrics,
)
