In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, PredictionErrorDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from tabulate import tabulate
import mlflow
from sklearn.linear_model import SGDRegressor


path = 'SAMPLE_DATA_SET.xlsx'

# Read and load dataset
df= pd.read_excel(path, sheet_name=[0, 1])


INPUT DATA

In [2]:
X =df.get(0)
X = (X.iloc[:,1:]).values
X = X.T
print(X.shape)

(839, 444)


OUTPUT DATA

In [3]:
Y =df.get(1)
Y = (Y.iloc[:,:]).values
print(Y.shape)

(839, 2)


**LINEAR REGRESSION with SVD**

In [4]:

mlflow.set_experiment("Linear Regression Demetra")
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

#number of trials
trials = 100
np.random.seed(42)
seeds = np.random.randint(trials, size=trials)

ripartitions = [i/100 for i in range(5,55,5)]
results = []  #["Ripartition", "Best_MSE1", "Best_MSE2","MSE1_mean","MSE2_mean","MSE1_var","MSE2_var"]

# create 5x2 subfigs
fig = plt.figure(constrained_layout=True, figsize=(20, 25))
fig.suptitle('Linear regression test'+"\n")
subfigs = fig.subfigures(nrows=5, ncols=2).flatten()
k = 0

for ripartition in ripartitions:

    '''PARAM 1'''

    MSE_param1 = []

    best_MSE_1 = 100000000
    best_data_1 = {"Y_test":None, "Y_pred":None,"seed":None}

    '''PARAM 2'''
    MSE_param2 = []

    best_MSE_2 = 100000000
    best_data_2 = {"Y_test":None, "Y_pred":None,"seed":None}

    run_name = str(ripartition) + " ripartition"
    with mlflow.start_run(run_name=run_name):

        i = 0
        mlflow.log_param('ripartition', ripartition)
        for i in range(trials):
            #Ripartition in training and test and
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y[:,0], test_size=ripartition, random_state=seeds[i])
            regr = LinearRegression()
    
            # Train the model using the training sets
            regr.fit(X_train, Y_train)
    
            # Make predictions using the testing set
            Y_pred_regr_svd = regr.predict(X_test)
    
            #save MSE of the first parameters
            MSE_param1.append(mean_squared_error(Y_test, Y_pred_regr_svd))
    
            #check if this is the best model in terms of MSE
            if MSE_param1[-1] < best_MSE_1:
              best_MSE_1 = MSE_param1[-1]
              best_data_1["Y_test"] = Y_test
              best_data_1["Y_pred"] = Y_pred_regr_svd
              best_data_1["seed"] = seeds[i]
    
    
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y[:,1], test_size=ripartition, random_state=seeds[i])
            regr = LinearRegression()
    
            # Train the model using the training sets
            regr.fit(X_train, Y_train)
    
            # Make predictions using the testing set
            Y_pred_regr_svd = regr.predict(X_test)
    
            #save MSE of the second parameters
            MSE_param2.append(mean_squared_error(Y_test, Y_pred_regr_svd))
    
            #check if this is the best model in terms of MSE
            if MSE_param2[-1] < best_MSE_2:
              best_MSE_2 = MSE_param2[-1]
              best_data_2["Y_test"] = Y_test
              best_data_2["Y_pred"] = Y_pred_regr_svd
              best_data_2["seed"] = seeds[i]
                
        mlflow.log_metric("Best_MSE1", best_MSE_1)
        mlflow.log_metric("Best_MSE2", best_MSE_2)
        mlflow.log_metric("Mean_MSE1", np.mean(MSE_param1))
        mlflow.log_metric("Mean_MSE2", np.mean(MSE_param2))
        mlflow.log_metric("Var_MSE1", np.var(MSE_param1))
        mlflow.log_metric("Var_MSE2", np.var(MSE_param2))
        
    results.append([str(ripartition * 100), str(best_MSE_1), str(best_MSE_2),
                    (np.mean(MSE_param1)), str(np.mean(MSE_param2)), str(np.var(MSE_param1)), str(np.var(MSE_param2)),
                    best_data_1["seed"],best_data_2["seed"]])

    '''PLOT THE BEST MODELS'''
    fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    plt.subplots_adjust(wspace=0.3)
    PredictionErrorDisplay.from_predictions(y_true=best_data_1["Y_test"], y_pred=best_data_1["Y_pred"], kind = "actual_vs_predicted", ax=axs[0])
    axs[0].set_title("Parameter 1"+"\n"+ "MSE: " +  str(min(MSE_param1)))
    disp2= PredictionErrorDisplay.from_predictions(y_true=best_data_2["Y_test"], y_pred=best_data_2["Y_pred"], kind = "actual_vs_predicted", ax=axs[1])
    axs[1].set_title("Parameter 2"+"\n"+ "MSE: " +  str(min(MSE_param2)))
    fig.suptitle("Linear Regression with SVD"+"\n"+ "Best MSE results" + "\n"+ f"Test size(%): {ripartition * 100}")
    plt.tight_layout()
    plt.show()


'''SHOW RESULTS'''

headers = ["Test size(%)", "Best(MSE1)", "Best(MSE2)", "Mean(MSE1)", "Mean(MSE2)", "Var(MSE1)", "Var(MSE2)","Seed1","Seed2"]

print(tabulate(results, headers,  tablefmt="double_outline"))


2024/05/07 09:39:24 INFO mlflow.tracking.fluent: Experiment with name 'Linear Regression Demetra' does not exist. Creating a new experiment.


NameError: name 'n_components' is not defined

<Figure size 2000x2500 with 0 Axes>

**LINEAR REGRESSION with SGD**

In [None]:
sgd_reg = SGDRegressor(max_iter=5000, tol=1e-5, penalty=None, eta0=0.01,
        n_iter_no_change=10, random_state=42)

In [None]:

mlflow.set_experiment("Linear Regression Demetra")
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

#number of trials
trials = 100
np.random.seed(42)
seeds = np.random.randint(trials, size=trials)

ripartitions = [i/100 for i in range(5,55,5)]
results = []  #["Ripartition", "Best_MSE1", "Best_MSE2","MSE1_mean","MSE2_mean","MSE1_var","MSE2_var"]

# create 5x2 subfigs
fig = plt.figure(constrained_layout=True, figsize=(20, 25))
fig.suptitle('Linear regression test'+"\n")
subfigs = fig.subfigures(nrows=5, ncols=2).flatten()
k = 0

for ripartition in ripartitions:

    '''PARAM 1'''

    MSE_param1 = []

    best_MSE_1 = 100000000
    best_data_1 = {"Y_test":None, "Y_pred":None,"seed":None}

    '''PARAM 2'''
    MSE_param2 = []

    best_MSE_2 = 100000000
    best_data_2 = {"Y_test":None, "Y_pred":None,"seed":None}

    run_name = str(ripartition) + " ripartition"
    with mlflow.start_run(run_name=run_name):

        i = 0
        mlflow.log_param('ripartition', ripartition)
        for i in range(trials):
            #Ripartition in training and test and
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y[:,0], test_size=ripartition, random_state=seeds[i])
            sgd_reg = SGDRegressor(max_iter=5000, tol=1e-5, penalty=None, eta0=0.01,
            n_iter_no_change=10, random_state=42)
    
            # Train the model using the training sets
            regr.fit(X_train, Y_train)
    
            # Make predictions using the testing set
            Y_pred_regr_svd = regr.predict(X_test)
    
            #save MSE of the first parameters
            MSE_param1.append(mean_squared_error(Y_test, Y_pred_regr_svd))
    
            #check if this is the best model in terms of MSE
            if MSE_param1[-1] < best_MSE_1:
              best_MSE_1 = MSE_param1[-1]
              best_data_1["Y_test"] = Y_test
              best_data_1["Y_pred"] = Y_pred_regr_svd
              best_data_1["seed"] = seeds[i]
    
    
    
        i = 0
        for i in range(trials):
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y[:,1], test_size=ripartition, random_state=seeds[i])
            regr = LinearRegression()
    
            # Train the model using the training sets
            regr.fit(X_train, Y_train)
    
            # Make predictions using the testing set
            Y_pred_regr_svd = regr.predict(X_test)
    
            #save MSE of the second parameters
            MSE_param2.append(mean_squared_error(Y_test, Y_pred_regr_svd))
    
            #check if this is the best model in terms of MSE
            if MSE_param2[-1] < best_MSE_2:
              best_MSE_2 = MSE_param2[-1]
              best_data_2["Y_test"] = Y_test
              best_data_2["Y_pred"] = Y_pred_regr_svd
              best_data_2["seed"] = seeds[i]
                
        mlflow.log_metric("Best_MSE1", best_MSE_1)
        mlflow.log_metric("Best_MSE2", best_MSE_2)
        mlflow.log_metric("Mean_MSE1", np.mean(MSE_param1))
        mlflow.log_metric("Mean_MSE2", np.mean(MSE_param2))
        mlflow.log_metric("Var_MSE1", np.var(MSE_param1))
        mlflow.log_metric("Var_MSE2", np.var(MSE_param2))
        
    results.append([str(ripartition * 100), str(best_MSE_1), str(best_MSE_2),
                    (np.mean(MSE_param1)), str(np.mean(MSE_param2)), str(np.var(MSE_param1)), str(np.var(MSE_param2)),
                    best_data_1["seed"],best_data_2["seed"]])

    '''PLOT THE BEST MODELS'''
    fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    plt.subplots_adjust(wspace=0.3)
    PredictionErrorDisplay.from_predictions(y_true=best_data_1["Y_test"], y_pred=best_data_1["Y_pred"], kind = "actual_vs_predicted", ax=axs[0])
    axs[0].set_title("Parameter 1"+"\n"+ "MSE: " +  str(min(MSE_param1)))
    disp2= PredictionErrorDisplay.from_predictions(y_true=best_data_2["Y_test"], y_pred=best_data_2["Y_pred"], kind = "actual_vs_predicted", ax=axs[1])
    axs[1].set_title("Parameter 2"+"\n"+ "MSE: " +  str(min(MSE_param2)))
    fig.suptitle("Linear Regression with SVD"+"\n"+ "Best MSE results" + "\n"+ f"Test size(%): {ripartition * 100}")
    plt.tight_layout()
    plt.show()


'''SHOW RESULTS'''

headers = ["Test size(%)", "Best(MSE1)", "Best(MSE2)", "Mean(MSE1)", "Mean(MSE2)", "Var(MSE1)", "Var(MSE2)","Seed1","Seed2"]

print(tabulate(results, headers,  tablefmt="double_outline"))
