In [22]:
import pandas as pd
from sklearn.metrics import mean_squared_error, PredictionErrorDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.linear_model import Ridge
from tabulate import tabulate
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Categorical, Integer, Continuous

# Excel workbook that holds the data set used by every cell below.
path = "SAMPLE_DATA_SET.xlsx"

# Passing a list of sheet positions makes read_excel return a dict of
# DataFrames keyed by those positions (0 and 1).
df = pd.read_excel(io=path, sheet_name=[0, 1])


INPUT DATA

In [7]:
# Build the input matrix from the sheet stored under key 0: drop its first
# column, convert to a NumPy array and transpose it.
# NOTE(review): after the transpose the first axis presumably indexes the
# samples, so that it lines up with the rows of Y — confirm against the workbook.
inputs_sheet = df.get(0)
X = inputs_sheet.iloc[:, 1:].values.T
print(X.shape)

(839, 444)


OUTPUT DATA

In [8]:
# Build the output matrix from the sheet stored under key 1; every row and
# column of the sheet is kept, only the DataFrame wrapper is removed.
targets_sheet = df.get(1)
Y = targets_sheet.iloc[:, :].values
print(Y.shape)

(839, 2)


RIDGE REGRESSION with closed-form solution (Cholesky solver, alpha tuned by genetic search)

In [9]:
# Two fixed constants, 0.01 and 0.001.
# NOTE(review): the names suggest ridge alphas for the two outputs, but no
# visible cell reads them — confirm whether they are still needed.
a1, a2 = 1e-2, 1e-3

In [32]:
# Repeated hold-out evaluation of ridge regression solved in closed form
# (Cholesky). For every test-set fraction, each of the two columns of Y is
# fitted `trials` times on different random splits, with alpha tuned on the
# training part by a genetic search (5-fold CV, negative MSE as fitness).

# number of trials
trials = 100
np.random.seed(42)
# One split seed per trial. NOTE: the seeds are drawn with replacement from
# [0, trials), so several trials can end up sharing the same split.
seeds = np.random.randint(trials, size=trials)

# Test-set fractions to evaluate: 0.05, 0.10, ..., 0.50
ripartisions = [i/100 for i in range(5,55,5)]
results = []  # one row per test-set fraction; columns are listed in `headers` below


def _run_cholesky_trials(features, target, test_size, split_seeds):
    """Fit GA-tuned Cholesky ridge models on repeated random splits.

    Parameters
    ----------
    features : array-like
        Inputs handed to ``train_test_split``.
    target : array-like
        Outputs handed to ``train_test_split`` (one column of ``Y``).
    test_size : float
        Fraction of the data held out for testing.
    split_seeds : iterable of int
        One ``random_state`` per trial.

    Returns
    -------
    mse_per_trial : list of float
        Test MSE of every trial, in trial order.
    best_mse : float
        Lowest test MSE seen (``np.inf`` if no trial produced a finite value).
    best_data : dict
        "Y_test", "Y_pred" and "alpha" of the trial that reached ``best_mse``.
    """
    # The search space does not change between trials, so build it once.
    param_grid = {'alpha': Continuous(1/100000, 0.01)}

    mse_per_trial = []
    best_mse = np.inf
    best_data = {"Y_test": None, "Y_pred": None, "alpha": None}

    for seed in split_seeds:
        # Split into training and test sets
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, target, test_size=test_size, random_state=seed)

        ridge_reg_cl = Ridge(solver="cholesky")
        evolved_estimator = GASearchCV(estimator=ridge_reg_cl,
                                       cv=5,
                                       scoring='neg_mean_squared_error',
                                       param_grid=param_grid,
                                       n_jobs=-1,
                                       verbose=True,  # prints the per-generation fitness log
                                       population_size=10,
                                       generations=10)
        evolved_estimator.fit(X_train, Y_train)

        # Make predictions using the testing set
        Y_pred_ridge_cl = evolved_estimator.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred_ridge_cl)
        mse_per_trial.append(mse)

        # Remember the split, predictions and alpha of the best trial so far
        if mse < best_mse:
            best_mse = mse
            best_data["Y_test"] = Y_test
            best_data["Y_pred"] = Y_pred_ridge_cl
            best_data["alpha"] = float(evolved_estimator.best_params_["alpha"])

    return mse_per_trial, best_mse, best_data


for ripartition in ripartisions:

    # PARAM 1: first column of Y
    MSE_param1, best_MSE_1, best_data_1 = _run_cholesky_trials(X, Y[:, 0], ripartition, seeds)

    # PARAM 2: second column of Y
    MSE_param2, best_MSE_2, best_data_2 = _run_cholesky_trials(X, Y[:, 1], ripartition, seeds)

    # Exactly one entry per header: the row previously carried Var(MSE2)
    # twice, which left it one column wider than `headers`.
    results.append([str(ripartition * 100), str(best_MSE_1), str(best_MSE_2),
                    str(np.mean(MSE_param1)), str(np.mean(MSE_param2)),
                    str(np.var(MSE_param1)), str(np.var(MSE_param2)),
                    str(best_data_1["alpha"]), str(best_data_2["alpha"])])

    # PLOT THE BEST MODELS
    fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    plt.subplots_adjust(wspace=0.3)
    PredictionErrorDisplay.from_predictions(y_true=best_data_1["Y_test"], y_pred=best_data_1["Y_pred"], kind="actual_vs_predicted", ax=axs[0])
    axs[0].set_title("Parameter 1" + "\n" + "MSE: " + str(best_MSE_1) + "\n" + " alpha= " + str(best_data_1["alpha"]))
    PredictionErrorDisplay.from_predictions(y_true=best_data_2["Y_test"], y_pred=best_data_2["Y_pred"], kind="actual_vs_predicted", ax=axs[1])
    axs[1].set_title("Parameter 2" + "\n" + "MSE: " + str(best_MSE_2) + "\n" + " alpha= " + str(best_data_2["alpha"]))
    fig.suptitle("Ridge Regression with closed form" + "\n" + "Best MSE results" + "\n" + f"Test size(%): {ripartition * 100}")
    plt.tight_layout()
    plt.show()
    # One figure is created per test-set fraction; release it once displayed.
    plt.close(fig)


# SHOW RESULTS
headers = ["Test size(%)","Best(MSE1)", "Best(MSE2)","Mean(MSE1)", "Mean(MSE2)", "Var(MSE1)", "Var(MSE2)","alpha1","alpha2"]

print(tabulate(results, headers,  tablefmt="double_outline"))





gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	10    	-2.0755	0.0418157  	-2.05335   	-2.19696   
1  	20    	-2.05341	8.46817e-05	-2.05335   	-2.05363   
2  	20    	-2.05361	0.0005051  	-2.05335   	-2.05462   
3  	20    	-2.05336	1.15805e-06	-2.05336   	-2.05336   
4  	20    	-2.05376	0.000837317	-2.05336   	-2.05602   
5  	20    	-2.05381	0.000530987	-2.05336   	-2.05459   
6  	20    	-2.05407	0.00152975 	-2.05336   	-2.05862   
7  	20    	-2.05439	0.00195438 	-2.05336   	-2.05862   
8  	20    	-2.05344	8.06566e-05	-2.05335   	-2.05352   
9  	20    	-2.05342	7.46397e-05	-2.05335   	-2.05355   
10 	20    	-2.05341	7.586e-05  	-2.05335   	-2.05359   




gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	10    	-1.96982	0.0230355  	-1.95932   	-2.03863   
1  	20    	-1.96052	0.0012619  	-1.95932   	-1.96313   


Exception ignored in: <function _releaseLock at 0x7cc0adf1c310>
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 
Exception ignored in: Fatal Python error: init_import_site: Failed to import the site module
Python runtime state: initialized
<function _releaseLock at 0x7cc0adf1c310>
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 
Traceback (most recent call last):
  File "/usr/lib/python3.10/site.py", line 636, in <module>
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 187, in _run_module_as_main
  File "/usr/lib/python3.10/runpy.py", line 187, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "/usr/lib/python3.10/runpy.py", line 110, in _get_module_details
    mod

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGINT(-2), SIGINT(-2), EXIT(1), SIGINT(-2)}

**RIDGE REGRESSION with SGD**

In [None]:
from sklearn.linear_model import SGDRegressor

# Repeated hold-out evaluation of L2-penalised linear regression trained with
# stochastic gradient descent. For every test-set fraction, each of the two
# columns of Y is fitted `trials` times on different random splits, with alpha
# tuned on the training part by a genetic search (5-fold CV, negative MSE as
# fitness).

# number of trials
trials = 100
np.random.seed(42)
# One split seed per trial. NOTE: the seeds are drawn with replacement from
# [0, trials), so several trials can end up sharing the same split.
seeds = np.random.randint(trials, size=trials)

# Test-set fractions to evaluate: 0.05, 0.10, ..., 0.50
ripartisions = [i/100 for i in range(5,55,5)]
results = []  # one row per test-set fraction; columns are listed in `headers` below


def _run_sgd_trials(features, target, test_size, split_seeds):
    """Fit GA-tuned SGD ridge models on repeated random splits.

    Parameters
    ----------
    features : array-like
        Inputs handed to ``train_test_split``.
    target : array-like
        Outputs handed to ``train_test_split`` (one column of ``Y``).
    test_size : float
        Fraction of the data held out for testing.
    split_seeds : iterable of int
        One ``random_state`` per trial.

    Returns
    -------
    mse_per_trial : list of float
        Test MSE of every trial, in trial order.
    best_mse : float
        Lowest test MSE seen (``np.inf`` if no trial produced a finite value).
    best_data : dict
        "Y_test", "Y_pred" and "alpha" of the trial that reached ``best_mse``.
    """
    # The search space does not change between trials, so build it once.
    param_grid = {'alpha': Continuous(1/100000, 0.01)}

    mse_per_trial = []
    best_mse = np.inf
    best_data = {"Y_test": None, "Y_pred": None, "alpha": None}

    for seed in split_seeds:
        # Split into training and test sets
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, target, test_size=test_size, random_state=seed)

        ridge_reg_sgd = SGDRegressor(max_iter=5000, tol=1e-5, penalty="l2", eta0=0.01,
                                     n_iter_no_change=10, random_state=42)

        # The search must wrap the SGD model built above. It previously
        # received `ridge_reg_cl`, the Cholesky Ridge left over from the
        # closed-form cell, so the SGD regressor was never trained.
        evolved_estimator = GASearchCV(estimator=ridge_reg_sgd,
                                       cv=5,
                                       scoring='neg_mean_squared_error',
                                       param_grid=param_grid,
                                       n_jobs=-1,
                                       verbose=True,  # prints the per-generation fitness log
                                       population_size=10,
                                       generations=10)
        evolved_estimator.fit(X_train, Y_train)

        # Make predictions using the testing set
        Y_pred_ridge_sgd = evolved_estimator.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred_ridge_sgd)
        mse_per_trial.append(mse)

        # Remember the split, predictions and alpha of the best trial so far
        if mse < best_mse:
            best_mse = mse
            best_data["Y_test"] = Y_test
            best_data["Y_pred"] = Y_pred_ridge_sgd
            best_data["alpha"] = float(evolved_estimator.best_params_["alpha"])

    return mse_per_trial, best_mse, best_data


for ripartition in ripartisions:

    # PARAM 1: first column of Y
    MSE_param1, best_MSE_1, best_data_1 = _run_sgd_trials(X, Y[:, 0], ripartition, seeds)

    # PARAM 2: second column of Y
    MSE_param2, best_MSE_2, best_data_2 = _run_sgd_trials(X, Y[:, 1], ripartition, seeds)

    # One entry per header, all stored as strings.
    results.append([str(ripartition * 100), str(best_MSE_1), str(best_MSE_2),
                    str(np.mean(MSE_param1)), str(np.mean(MSE_param2)),
                    str(np.var(MSE_param1)), str(np.var(MSE_param2)),
                    str(best_data_1["alpha"]), str(best_data_2["alpha"])])

    # PLOT THE BEST MODELS
    fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    plt.subplots_adjust(wspace=0.3)
    PredictionErrorDisplay.from_predictions(y_true=best_data_1["Y_test"], y_pred=best_data_1["Y_pred"], kind="actual_vs_predicted", ax=axs[0])
    axs[0].set_title("Parameter 1" + "\n" + "MSE: " + str(best_MSE_1) + "\n" + " alpha= " + str(best_data_1["alpha"]))
    PredictionErrorDisplay.from_predictions(y_true=best_data_2["Y_test"], y_pred=best_data_2["Y_pred"], kind="actual_vs_predicted", ax=axs[1])
    axs[1].set_title("Parameter 2" + "\n" + "MSE: " + str(best_MSE_2) + "\n" + " alpha= " + str(best_data_2["alpha"]))
    fig.suptitle("Ridge Regression with SGD" + "\n" + "Best MSE results" + "\n" + f"Test size(%): {ripartition * 100}")
    plt.tight_layout()
    plt.show()
    # One figure is created per test-set fraction; release it once displayed.
    plt.close(fig)


# SHOW RESULTS
headers = ["Test size(%)","Best(MSE1)", "Best(MSE2)","Mean(MSE1)", "Mean(MSE2)", "Var(MSE1)", "Var(MSE2)","alpha1","alpha2"]

print(tabulate(results, headers,  tablefmt="double_outline"))

