In [None]:
#Importing necessary libraries
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from openpyxl import load_workbook
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

In [None]:
#Loading the Excel workbook into a DataFrame
df = pd.read_excel('dataset_name.xlsx')

#Building a reduced frame without the columns that are not needed downstream
df2 = df.drop(["Column 1", "Column 2"], axis=1)

In [None]:
#Splitting df2 into a training frame and a held-out ("unseen") frame based on ID
df2["ID"] = df2["ID"].astype(str)  #Making sure the ID is a string before comparing against the labels below

#Taking explicit copies: later cells add columns to these frames, and writing
#into a boolean-mask slice of df2 would raise SettingWithCopyWarning
df_train = df2[df2["ID"].isin(["ID 1", "ID 2", "ID 3"])].copy()
df_unseen = df2[df2["ID"] == "ID 4"].copy()

In [None]:
#Preparing PCA input by defining the columns to include in the transformation
#NOTE(review): "Column 1" and "Column 2" are dropped when df2 is built earlier in
#the notebook - confirm that every name listed here still exists in df_train/df_unseen
columns = ['Column 1', 'Column 2', 'Column 3', 'Column 4', 'Column 5']

#Number of principal components to keep, and the column names they are stored under
n_components = 4
pc_names = [f'PC{i+1}' for i in range(n_components)]

#Working on explicit copies so that adding the PC columns never writes into a slice of df2
df_train = df_train.copy()
df_unseen = df_unseen.copy()

#Learning the ID -> integer code mapping from the training data only, so that the
#same ID always maps to the same code in both frames.
#IDs that do not occur in the training data are encoded as -1 (pandas' code for "not a category").
#NOTE(review): the unseen frame is selected by an ID that is absent from training,
#so its ID code carries no learned information - confirm 'ID' should be a PCA input at all
id_dtype = pd.CategoricalDtype(df_train['ID'].astype('category').cat.categories)

#Extracting and copying the selected columns from the training dataframe
data_train = df_train[columns].copy()

#Encoding 'ID' (read from df_train, since 'ID' is not part of `columns`)
data_train['ID'] = df_train['ID'].astype(id_dtype).cat.codes

#Initializing a standard scaler and fitting it on the training data only
scaler_pca = StandardScaler()
data_train_scaled = scaler_pca.fit_transform(data_train)

#Initializing PCA, fitting it on the scaled training data and transforming it
pca = PCA(n_components=n_components)
pca_train = pca.fit_transform(data_train_scaled)

#Adding the principal components as new columns to df_train
for i, pc_name in enumerate(pc_names):
    df_train[pc_name] = pca_train[:, i]


#Applying the same (already fitted) transformation to the unseen dataset

#Extracting and copying the same columns from the unseen dataset
data_unseen = df_unseen[columns].copy()

#Encoding 'ID' with the mapping learned from the training data
data_unseen['ID'] = df_unseen['ID'].astype(id_dtype).cat.codes

#Applying the training scaler and the training PCA (transform only, no refit)
data_unseen_scaled = scaler_pca.transform(data_unseen)
pca_unseen = pca.transform(data_unseen_scaled)

#Adding the principal components as new columns to df_unseen
for i, pc_name in enumerate(pc_names):
    df_unseen[pc_name] = pca_unseen[:, i]

In [None]:
#Selecting the principal-component features and the target column for both frames
pc_columns = ['PC1', 'PC2', 'PC3', 'PC4']

#Training inputs and target
X, y = df_train[pc_columns], df_train['Category_column']

#Held-out inputs and target
X_unseen, y_unseen = df_unseen[pc_columns], df_unseen['Category_column']

In [None]:
#10-fold cross-validation with shuffling; the fixed seed keeps the splits reproducible
k_fold = KFold(n_splits=10, shuffle=True, random_state=66)

#Pipeline: standardize the features, then fit an ordinary linear regression
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('mlr', LinearRegression())])

#Per-fold metric containers (train and test kept separately)
mae_scores_train, mse_scores_train, r2_scores_train = [], [], []
mae_scores_test, mse_scores_test, r2_scores_test = [], [], []

#Evaluating the pipeline on every fold
for train_index, test_index in k_fold.split(X, y):
    #Slicing out this fold's training and test partitions
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    #A test partition with a single unique target value is skipped
    if np.unique(y_test).size == 1:
        print("Skipping this fold due to only one class in test set.")
        continue

    #Fitting on the training partition and predicting on both partitions
    pipeline.fit(X_train, y_train)
    pred_train, pred_test = pipeline.predict(X_train), pipeline.predict(X_test)

    #Regression metrics on the training partition
    mae_train = mean_absolute_error(y_train, pred_train)
    mse_train = mean_squared_error(y_train, pred_train)
    r2_train = r2_score(y_train, pred_train)

    #Regression metrics on the test partition
    mae_test = mean_absolute_error(y_test, pred_test)
    mse_test = mean_squared_error(y_test, pred_test)
    r2_test = r2_score(y_test, pred_test)

    #Storing this fold's results
    for store, value in (
        (mae_scores_train, mae_train),
        (mse_scores_train, mse_train),
        (r2_scores_train, r2_train),
        (mae_scores_test, mae_test),
        (mse_scores_test, mse_test),
        (r2_scores_test, r2_test),
    ):
        store.append(value)

    #Reporting this fold's results
    print(f"Train MAE: {mae_train:.6f}, MSE: {mse_train:.6f}, R²: {r2_train:.6f}")
    print(f"Test MAE: {mae_test:.6f}, MSE: {mse_test:.6f}, R²: {r2_test:.6f}\n")

#Reporting the metrics averaged over all evaluated folds
print("\nAverage results for all folds:")
print(f"Train Mean MAE: {np.mean(mae_scores_train):.6f}, Test Mean MAE: {np.mean(mae_scores_test):.6f}")
print(f"Train Mean MSE: {np.mean(mse_scores_train):.6f}, Test Mean MSE: {np.mean(mse_scores_test):.6f}")
print(f"Train Mean R²: {np.mean(r2_scores_train):.6f}, Test Mean R²: {np.mean(r2_scores_test):.6f}")


In [None]:
#Pipeline for the tuned model: scaling -> polynomial feature expansion -> Ridge regression
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('reg', Ridge()),
])

#Hyperparameter grid; keys follow the "<step name>__<parameter>" convention of the pipeline
parameters_grid = dict(
    poly__degree=[1, 2, 3],
    reg__alpha=[5, 7],
    reg__fit_intercept=[True],
    reg__solver=['cholesky', 'lsqr', 'auto'],
)

#Using GridSearchCV to search for the best parameter combination (scored by R², same K-Fold splitter)
CV_MLR = GridSearchCV(pipeline, parameters_grid, scoring='r2', cv=k_fold, n_jobs=-1)

#Running the search on the full training dataset
CV_MLR.fit(X, y)

#Reporting the winning parameter combination
print('Best parameters: ', CV_MLR.best_params_)


In [None]:
#Getting the best-performing pipeline and its hyperparameters from the grid search
best_pipeline = CV_MLR.best_estimator_
best_params = CV_MLR.best_params_

#Creating lists to save R² and MSE metrics for each fold
r2_train_scores = []
r2_test_scores = []
mse_train_scores = []
mse_test_scores = []

#Performing evaluation using the same K-Fold splits
for train_idx, test_idx in k_fold.split(X, y):
    #Splitting data into training and test sets for the current fold
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    #Fitting an unfitted clone with the best hyperparameters on this fold.
    #best_pipeline is the very object stored in CV_MLR.best_estimator_; refitting it
    #here would leave the search object holding a model trained on the last fold only.
    fold_pipeline = clone(best_pipeline)
    fold_pipeline.fit(X_train, y_train)

    #Predicting target values for both training and test sets
    y_train_pred = fold_pipeline.predict(X_train)
    y_test_pred = fold_pipeline.predict(X_test)

    #Calculating and saving evaluation metrics for this fold
    r2_train_scores.append(r2_score(y_train, y_train_pred))
    r2_test_scores.append(r2_score(y_test, y_test_pred))
    mse_train_scores.append(mean_squared_error(y_train, y_train_pred))
    mse_test_scores.append(mean_squared_error(y_test, y_test_pred))

#Printing the average R² and MSE across all folds for both training and test sets
print(f"Train R²: {np.mean(r2_train_scores):.4f}, MSE: {np.mean(mse_train_scores):.4f}")
print(f"Test  R²: {np.mean(r2_test_scores):.4f}, MSE: {np.mean(mse_test_scores):.4f}")


In [None]:
#Refitting the best pipeline on the full training dataset and predicting the unseen dataset
y_unseen_pred = best_pipeline.fit(X, y).predict(X_unseen)

#Scoring the unseen predictions (R² and Mean Squared Error)
r2_unseen = r2_score(y_unseen, y_unseen_pred)
mse_unseen = mean_squared_error(y_unseen, y_unseen_pred)

#Reporting the unseen-data results
print(f'Unseen Mean Squared Error: {mse_unseen:.4f}')
print(f'Unseen R-squared: {r2_unseen:.4f}')

In [None]:
#Averaging the per-fold MSE and R² lists into single train/test summary values
avg_mse_train, avg_mse_test = np.mean(mse_train_scores), np.mean(mse_test_scores)
avg_r2_train, avg_r2_test = np.mean(r2_train_scores), np.mean(r2_test_scores)

In [None]:
#Defining a function to log model evaluation metrics and hyperparameters to an Excel-file of chosen name
def log_results(model_name, params, mse_train, r2_train, mse_test, r2_test, mse_unseen, r2_unseen, filename="Results.xlsx"):
    """Append one row of hyperparameters and metrics to a per-model sheet of an Excel file.

    Each model gets its own sheet, named after ``model_name``. If the workbook does
    not exist it is created; if the sheet does not exist it is added; otherwise the
    new row is appended below the rows already stored in that sheet.

    Parameters
    ----------
    model_name : str
        Sheet name to write to; also used in the confirmation message.
    params : dict
        Hyperparameters of the run; every key becomes a column.
    mse_train, r2_train, mse_test, r2_test, mse_unseen, r2_unseen : float
        Metrics stored in the columns "MSE_Train", "R2_Train", "MSE_Test",
        "R2_Test", "MSE_Unseen" and "R2_Unseen".
    filename : str, default "Results.xlsx"
        Path of the workbook.

    Returns
    -------
    None
        A confirmation is printed only when the row was actually written. If an
        existing workbook cannot be accessed (PermissionError), an error message
        is printed instead and nothing is logged.
    """

    #Creating a one-row DataFrame for this model run
    result = pd.DataFrame([{
        **params,
        "MSE_Train": mse_train,
        "R2_Train": r2_train,
        "MSE_Test": mse_test,
        "R2_Test": r2_test,
        "MSE_Unseen": mse_unseen,
        "R2_Unseen": r2_unseen
    }])

    #Introducing a short delay to reduce the chance of file conflicts if running in multiple notebooks
    #(best effort only - this is not a lock)
    time.sleep(1)

    if not os.path.exists(filename):
        #If the file doesn't exist, creating a new one with this run as its first row
        with pd.ExcelWriter(filename, engine="openpyxl", mode="w") as writer:
            result.to_excel(writer, sheet_name=model_name, index=False)
    else:
        try:
            #Reading the rows already stored for this model BEFORE opening the writer,
            #so the workbook is never read while it is open for writing
            try:
                existing_df = pd.read_excel(filename, sheet_name=model_name, engine="openpyxl")
            except ValueError:
                existing_df = None  #Sheet does not exist yet; it is created below

            if existing_df is None:
                df_combined = result
            else:
                df_combined = pd.concat([existing_df, result], ignore_index=True)

            #Writing the combined rows back; "overlay" writes over the existing sheet in place
            with pd.ExcelWriter(filename, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
                df_combined.to_excel(writer, sheet_name=model_name, index=False)

        except PermissionError:
            #Warning the user and stopping here: nothing was written, so no confirmation is printed
            print(f"Error: Close the Excel file ({filename}) before running the script again.")
            return

    #Printing a confirmation so the user knows the results were logged
    print(f"Logged results for {model_name}: Train MSE={mse_train:.4f}, Test MSE={mse_test:.4f}, Unseen MSE={mse_unseen:.4f}")


In [None]:
#Collecting the averaged cross-validation metrics and the unseen-data metrics of the MLR model
mlr_metrics = dict(
    mse_train=avg_mse_train,
    r2_train=avg_r2_train,
    mse_test=avg_mse_test,
    r2_test=avg_r2_test,
    mse_unseen=mse_unseen,
    r2_unseen=r2_unseen,
)

#Logging them, together with the best hyperparameters, under the "MLR_PCA" sheet
log_results(model_name="MLR_PCA", params=best_params, **mlr_metrics)