In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, PredictionErrorDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from tabulate import tabulate
from sklearn.cross_decomposition import PLSRegression
import mlflow

path = 'SAMPLE_DATA_SET.xlsx'

# Read and load dataset
df= pd.read_excel(path, sheet_name=[0, 1])

INPUT DATA

In [2]:
X =df.get(0)
X = (X.iloc[:,1:]).values
X = X.T
print(X.shape)

(839, 444)


OUTPUT DATA

In [3]:
Y =df.get(1)
Y = (Y.iloc[:,:]).values
print(Y.shape)

(839, 2)


In [13]:
print(X)

[[0.06417113 0.06483447 0.06550327 ... 0.39466706 0.39299677 0.39129164]
 [0.12033977 0.12280449 0.12538263 ... 0.47134405 0.46753118 0.46345874]
 [0.10807583 0.11070238 0.11337989 ... 0.45610453 0.4542271  0.45227506]
 ...
 [0.20127608 0.20258463 0.20379779 ... 0.40942987 0.41020758 0.41114899]
 [0.20548144 0.20644856 0.20735227 ... 0.43736083 0.43532781 0.43315283]
 [0.2065847  0.20720449 0.20779776 ... 0.43238588 0.42933803 0.42606486]]


In [14]:

mlflow.set_experiment("PLS Demetra")
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")


for n_components in range(2,31):
    # number of trials
    trials = 100
    np.random.seed(42)
    seeds = np.random.randint(trials, size=trials)
    
    ripartisions = [i / 100 for i in range(5, 55, 5)]
    results = []
    
    # create 5x2 subfigs
    #fig = plt.figure(constrained_layout=True, figsize=(20, 25))
    #fig.suptitle('PLS regression test'+"\n"+ f"Number of components: {n_components}"+"\n")
    #subfigs = fig.subfigures(nrows=5, ncols=2).flatten()
    k = 0
    
    for ripartition in ripartisions:
        
        '''PARAM 1'''
    
        MSE_param1 = []
    
        best_MSE_1 = 100000000
        best_data_1 = {"Y_test": None, "Y_pred": None,"seed":None}
    
        '''PARAM 2'''
        MSE_param2 = []
    
        best_MSE_2 = 100000000
        best_data_2 = {"Y_test": None, "Y_pred": None,"seed":None}
        
        name_experiment = str(n_components) + " components " + str(ripartition) + " ripartition"
        with mlflow.start_run(run_name=name_experiment):
            i = 0
            mlflow.log_param('ripartition', ripartition)
            mlflow.log_param('no. components', n_components)
            for i in range(trials):
                # Tell mlflow to log the following parameters for the experiments dashboard
                # Ripartition in training and test and
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y[:, 0], test_size=ripartition, random_state=seeds[i])
                pls_model = PLSRegression(n_components=n_components)
        
                pls_model.fit(X_train, Y_train)
        
                # Make predictions using the testing set
                Y_pred_pls = pls_model.predict(X_test)
        
                # save MSE of the first parameters
                MSE_param1.append(mean_squared_error(Y_test, Y_pred_pls))
        
                # check if this is the best model in terms of MSE
                if MSE_param1[-1] < best_MSE_1:
                    best_MSE_1 = MSE_param1[-1]
                    best_data_1["Y_test"] = Y_test
                    best_data_1["Y_pred"] = Y_pred_pls
                    best_data_1["seed"] = seeds[i]
        
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y[:, 1], test_size=ripartition, random_state=seeds[i])
                pls_model = PLSRegression(n_components=n_components)
        
                pls_model.fit(X_train, Y_train)
        
                # Make predictions using the testing set
                Y_pred_pls= pls_model.predict(X_test)
        
                # save MSE of the second parameters
                MSE_param2.append(mean_squared_error(Y_test, Y_pred_pls))
        
                # check if this is the best model in terms of MSE
                if MSE_param2[-1] < best_MSE_2:
                    best_MSE_2 = MSE_param2[-1]
                    best_data_2["Y_test"] = Y_test
                    best_data_2["Y_pred"] = Y_pred_pls
                    best_data_2["seed"] = seeds[i]
        
            
            
            mlflow.log_metric("Best_MSE1", best_MSE_1)
            mlflow.log_metric("Best_MSE2", best_MSE_2)
            mlflow.log_metric("Mean_MSE1", np.mean(MSE_param1))
            mlflow.log_metric("Mean_MSE2", np.mean(MSE_param2))
            mlflow.log_metric("Var_MSE1", np.var(MSE_param1))
            mlflow.log_metric("Var_MSE2", np.var(MSE_param2))
            
        results.append([str(ripartition * 100), str(best_MSE_1), str(best_MSE_2),
                        (np.mean(MSE_param1)), str(np.mean(MSE_param2)), str(np.var(MSE_param1)), str(np.var(MSE_param2)),
                         best_data_1["seed"],best_data_2["seed"]])
    
            
        '''PLOT THE BEST MODELS'''
        '''
        subfigs[k].suptitle("PLS Regression" + "\n" + "Best MSE results" + "\n" + f"Test size(%): {ripartition * 100}")
        # create 1x2 subplots per subfig
        axs = subfigs[k].subplots(nrows=1, ncols=2)
        axs[0].scatter(best_data_1["Y_test"], best_data_1["Y_pred"], c='blue', label='Actual vs Predicted')
        axs[0].plot([min(best_data_1["Y_test"]), max(best_data_1["Y_test"])],
                    [min(best_data_1["Y_test"]), max(best_data_1["Y_test"])], '--', c='red', label='Perfect Prediction')
        axs[0].legend()
        axs[0].set_xlabel("Actual values")
        axs[0].set_ylabel("Predicted value")
        axs[0].set_title("Parameter 1"+"\n"+ "MSE: "+ str(best_MSE_1))
        axs[1].scatter(best_data_2["Y_test"], best_data_2["Y_pred"], c='blue', label='Actual vs Predicted')
        axs[1].plot([min(best_data_2["Y_test"]), max(best_data_2["Y_test"])],
                    [min(best_data_2["Y_test"]), max(best_data_2["Y_test"])], '--', c='red', label='Perfect Prediction')
        axs[1].legend()
        axs[1].set_xlabel("Actual values")
        axs[1].set_ylabel("Predicted value")
        axs[1].set_title("Parameter 2" + "\n"+ "MSE: "+ str(best_MSE_2))
        k += 1
        '''
    '''
    '''SHOW RESULTS'''
    #headers = ["Test size(%)", "Best(MSE1)", "Best(MSE2)", "Mean(MSE1)", "Mean(MSE2)", "Var(MSE1)", "Var(MSE2)","Seed1","Seed2"]
    
    #print(tabulate(results, headers, tablefmt="double_outline"))



IndentationError: unexpected indent (1298904329.py, line 93)