In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, PredictionErrorDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from tabulate import tabulate
from sklearn.cross_decomposition import PLSRegression
import mlflow

# Excel workbook that holds the dataset (resolved relative to the working directory).
path = 'SAMPLE_DATA_SET.xlsx'

# Load the first two sheets in one call. Because sheet_name is a list,
# read_excel returns a dict keyed by sheet index ({0: DataFrame, 1: DataFrame}),
# not a single DataFrame -- later cells pull the sheets out with df.get(...).
df = pd.read_excel(io=path, sheet_name=[0, 1])



INPUT DATA

In [2]:
# Build the predictor matrix from sheet 0: skip the sheet's first column
# (presumably a label/axis column -- confirm against the workbook), take the
# raw ndarray and transpose it. The recorded output shows the result is
# (839, 444), i.e. its row count matches the target matrix built from sheet 1.
X = df.get(0).iloc[:, 1:].values.T
print(X.shape)

(839, 444)


OUTPUT DATA

In [3]:
# Build the target matrix from sheet 1, taking every row and column as a raw
# ndarray. The recorded output shows two target columns; later code models
# them separately as Y[:, 0] and Y[:, 1].
Y = df.get(1).values
print(Y.shape)

(839, 2)


In [None]:

# Select the tracking server FIRST. set_experiment() looks the experiment up
# (or creates it) on whichever tracking URI is active at the moment it is
# called, so calling it before set_tracking_uri() resolves the experiment
# against the default store instead of this server.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# number of trials (random train/test splits) evaluated per configuration
trials = 100

# test-set fractions swept for every component count: 0.05, 0.10, ..., 0.50
ripartisions = [i / 100 for i in range(5, 55, 5)]

# One distinct split seed per trial, shared by both target parameters so they
# are evaluated on the same sequence of splits. Sampling WITHOUT replacement
# matters: np.random.randint(trials, size=trials) repeats values, and a
# repeated seed re-runs an identical split, so the logged mean/variance would
# rest on fewer independent trials than `trials` states.
# NOTE: this changes the seed values, so metrics differ from earlier runs.
seeds = np.random.default_rng(42).choice(100 * trials, size=trials, replace=False)


def run_pls_sweep(X, y, experiment_name, label, metric_suffix,
                  test_sizes, split_seeds, component_range=range(2, 31)):
    """Sweep PLS component counts and test-set sizes, logging each to MLflow.

    For every (n_components, test_size) pair one MLflow run is opened. Inside
    it, a PLSRegression model is fitted once per seed in ``split_seeds`` on a
    fresh train/test split, and the test-set MSE of each fit is collected.
    The run then logs the minimum, mean and variance of those MSE values.

    Parameters
    ----------
    X : array-like
        Predictor matrix passed to ``train_test_split``.
    y : array-like
        Single target column, split alongside ``X``.
    experiment_name : str
        MLflow experiment that receives the runs.
    label : str
        Heading printed once before the sweep starts.
    metric_suffix : str
        Appended to the metric keys ``Best_MSE``, ``Mean_MSE`` and ``Var_MSE``.
    test_sizes : iterable of float
        Values passed as ``test_size`` to ``train_test_split``.
    split_seeds : iterable of int
        Values passed as ``random_state``; one model fit per seed.
    component_range : iterable of int, optional
        Values passed as ``n_components`` to ``PLSRegression``.

    Returns
    -------
    tuple
        ``(mse_scores, best_mse, best_data)`` for the LAST configuration
        swept only: the list of per-trial MSE values, their minimum, and a
        dict holding the ``"Y_test"`` / ``"Y_pred"`` arrays of the trial that
        achieved that minimum. Earlier configurations are available only
        through the MLflow runs.
    """
    mlflow.set_experiment(experiment_name)
    print(label)

    # Placeholders so the return value is defined even for an empty sweep.
    mse_scores = []
    best_mse = float("inf")
    best_data = {"Y_test": None, "Y_pred": None}

    for n_components in component_range:
        print(str(n_components) + " components")

        for test_size in test_sizes:
            # Per-configuration accumulators, reset for every run.
            mse_scores = []
            best_mse = float("inf")
            best_data = {"Y_test": None, "Y_pred": None}

            run_name = f"{n_components} components {test_size} ripartition"
            with mlflow.start_run(run_name=run_name):
                # Parameters shown in the MLflow experiments dashboard.
                mlflow.log_param('ripartition', test_size)
                mlflow.log_param('no. components', n_components)

                for seed in split_seeds:
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=int(seed)
                    )

                    pls_model = PLSRegression(n_components=n_components)
                    pls_model.fit(X_train, y_train)

                    # Score the model on the held-out part of this split.
                    y_pred = pls_model.predict(X_test)
                    mse = mean_squared_error(y_test, y_pred)
                    mse_scores.append(mse)

                    # Strict '<' keeps the first trial when MSE values tie.
                    if mse < best_mse:
                        best_mse = mse
                        best_data["Y_test"] = y_test
                        best_data["Y_pred"] = y_pred

                mlflow.log_metric("Best_MSE" + metric_suffix, best_mse)
                mlflow.log_metric("Mean_MSE" + metric_suffix, np.mean(mse_scores))
                mlflow.log_metric("Var_MSE" + metric_suffix, np.var(mse_scores))

    return mse_scores, best_mse, best_data


# PARAM 1: first target column.
MSE_param1, best_MSE_1, best_data_1 = run_pls_sweep(
    X, Y[:, 0], "PLS Demetra Parameter 1", "Param 1", "1",
    test_sizes=ripartisions, split_seeds=seeds,
)

# PARAM 2: second target column, same splits and same grid.
MSE_param2, best_MSE_2, best_data_2 = run_pls_sweep(
    X, Y[:, 1], "PLS Demetra Parameter 2", "Param 2", "2",
    test_sizes=ripartisions, split_seeds=seeds,
)
            


2024/05/09 18:05:39 INFO mlflow.tracking.fluent: Experiment with name 'PLS Demetra Parameter 1' does not exist. Creating a new experiment.


Param 1
2 components
3 components
4 components
5 components
6 components
7 components
8 components
9 components
10 components
11 components
12 components
13 components
14 components
15 components
16 components
17 components
18 components
19 components
20 components
21 components
22 components
23 components
24 components
25 components
26 components
27 components
28 components
29 components
30 components


2024/05/09 18:18:04 INFO mlflow.tracking.fluent: Experiment with name 'PLS Demetra Parameter 2' does not exist. Creating a new experiment.


Param 2
2 components
3 components
4 components
5 components
6 components
7 components
8 components
9 components
10 components
11 components
12 components
13 components
