In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import warnings
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Categorical, Integer, Continuous
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge,Lasso, ElasticNet
from sklearn.model_selection import train_test_split,  cross_validate
from sklearn.preprocessing import StandardScaler
import time
import sys, os
from sklearn.pipeline import make_pipeline
from skopt import BayesSearchCV
import skopt.space.space
from sklearn.model_selection import KFold


path = 'datasets/Dataset_2.xlsx'

# Read and load dataset
df= pd.read_excel(path, sheet_name="preproc")


  from .autonotebook import tqdm as notebook_tqdm


INPUT DATA

In [2]:
X = (df.iloc[:,14:]).values
print(X.shape)
std_scaler = StandardScaler()

(2905, 530)


OUTPUT DATA

In [3]:
Y = (df.iloc[:,7]).values
print(Y.shape)

(2905,)


**NRMSEPiqr**

In [4]:


def NRMSEPiqr(observed_values, predicted_values):
    # Calculate RMSEP
    rmsep = np.sqrt(np.mean((observed_values - predicted_values) ** 2))
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = np.percentile(observed_values, 25)
    Q3 = np.percentile(observed_values, 75)

    # Calculate IQR
    IQR = Q3 - Q1

    return rmsep/IQR

NRMSEPiqrscorer = make_scorer(NRMSEPiqr,greater_is_better=False)


**RIDGE**

In [5]:



if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" 

#number of trials
trials = 100
ranndom_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace= False)

ripartitions = [0.25,0.5,0.75]

results_ridge = []  


for ripartition in ripartitions:

        run_name = str(ripartition) + " ripartition"
        print(run_name)
        for i in range(trials):
            result = {"Ripartition":None,
                       "Trial":None,
                         "alpha_bayesian":None,
                           #"alpha_evolutionary":None,
                             "bayesian_time":None,
                             #"evolutionary_time" :None,
                             "NRMSEPiqr_bayesian":None,
                              #"NRMSEPiqr_evolutionary":None,
                              "seed": None}
            

            print("Trial:",i+1)
                
            result["Ripartition"] = ripartition
            result["Trial"] = i +1
            result["seed"] = seeds[i]
            #Ripartition in training and test and
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
            param_grid = {'ridge__alpha': Continuous(1/100000,0.01)}
            #sturges = int(1 + np.log(len(X_train)))    
            ridge_reg_pipeline = make_pipeline(StandardScaler(),Ridge())

            kf = KFold(n_splits=5, shuffle=True,random_state=ranndom_cv)
            '''
            evolved_estimator = GASearchCV(estimator=ridge_reg_pipeline,
                                  cv=kf,
                                  scoring=NRMSEPiqrscorer,
                                  param_grid=param_grid,
                                  n_jobs=-1,
                                  verbose=False,
                                  population_size=10,
                                  generations=10)
            '''
            bayesian_estimator = BayesSearchCV(
                                ridge_reg_pipeline,
                                {'ridge__alpha': skopt.space.space.Real(1/100000,0.01,'log-uniform')},
                                cv=kf,
                                scoring=NRMSEPiqrscorer,
                                n_jobs=-1,
                                verbose= False,
                                 n_iter=15 )
            '''
            start_time = time.time()
            evolved_estimator.fit(X_train, Y_train)
            elapsed_time = (time.time() -start_time) #elapsed time in seconds

            
            result["evolutionary_time"] = elapsed_time
            result["alpha_evolutionary"] = float(evolved_estimator.best_params_["ridge__alpha"])
            '''
            start_time = time.time()
            bayesian_estimator.fit(X_train, Y_train)
            elapsed_time = (time.time() -start_time) #elapsed time in seconds

            result["bayesian_time"] = elapsed_time
            result["alpha_bayesian"] = float(bayesian_estimator.best_params_["ridge__alpha"])
    
           
            #result["NRMSEPiqr_evolutionary"] = NRMSEPiqr(Y_test, evolved_estimator.predict(X_test))
    

            result["NRMSEPiqr_bayesian"] =  NRMSEPiqr(Y_test,bayesian_estimator.predict(X_test))
    
            print(result)
            results_ridge.append(result)




0.25 ripartition
Trial: 1
{'Ripartition': 0.25, 'Trial': 1, 'alpha_bayesian': 0.0023986924534758806, 'bayesian_time': 3.5229110717773438, 'NRMSEPiqr_bayesian': 0.29837444660190915, 'seed': 26}
Trial: 2
{'Ripartition': 0.25, 'Trial': 2, 'alpha_bayesian': 0.00014018343417222985, 'bayesian_time': 2.5105655193328857, 'NRMSEPiqr_bayesian': 0.30053127988524697, 'seed': 83}
Trial: 3
{'Ripartition': 0.25, 'Trial': 3, 'alpha_bayesian': 0.00038915994128335083, 'bayesian_time': 2.4906489849090576, 'NRMSEPiqr_bayesian': 0.2832292108297829, 'seed': 100}
Trial: 4
{'Ripartition': 0.25, 'Trial': 4, 'alpha_bayesian': 0.0003713244517597906, 'bayesian_time': 2.7323286533355713, 'NRMSEPiqr_bayesian': 0.2802772856976939, 'seed': 88}
Trial: 5
{'Ripartition': 0.25, 'Trial': 5, 'alpha_bayesian': 4.811833763690912e-05, 'bayesian_time': 3.0736653804779053, 'NRMSEPiqr_bayesian': 0.2966028263850111, 'seed': 1}
Trial: 6
{'Ripartition': 0.25, 'Trial': 6, 'alpha_bayesian': 0.0004476372870793991, 'bayesian_time': 2.6

In [6]:


df = pd.DataFrame(results_ridge)
df.to_excel("Ridge/Ridge_Bayes.xlsx")


In [7]:

ridge_alphas = {ripartitions[0]:0,ripartitions[1]:0,ripartitions[2]:0}

#Compute the mean for every ripartition
for trial in results_ridge:
    ridge_alphas[trial["Ripartition"]]+=trial["alpha_bayesian"]/trials



print(ridge_alphas)

{0.25: 0.0012489484478182295, 0.5: 0.0018748174225082694, 0.75: 0.004377694791623178}


In [8]:


trials = 100
random_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace= False)

final_results_ridge = []
for alpha, ripartition in zip(ridge_alphas.values(),ripartitions):
    predictions_ridge = []
    result_NRMSEP_ridge = []
    NRMSE = []
    total_time = 0
    for i in range(trials):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
        ridge = make_pipeline(StandardScaler(),Ridge(alpha=alpha))
            
        start_time = time.time()
        ridge.fit(X_train, Y_train)
        elapsed_time = (time.time() -start_time) #elapsed time in seconds
        
        # Make predictions using the testing set
        Y_pred_ridge = ridge.predict(X_test)

        
        NRMSE.append(NRMSEPiqr(Y_test, Y_pred_ridge))
        total_time+=elapsed_time
        
        for k in range(len(Y_pred_ridge)):

          predictions_ridge.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_ridge),
                        "Training time": elapsed_time, 
                        "alpha": alpha, 
                        "Seed": seeds[i],
                          "Prediction": Y_pred_ridge[k], 
                          "Actual": Y_test[k] })
        
        result_NRMSEP_ridge.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_ridge),
                        "Training time": elapsed_time, 
                        "alpha": alpha, 
                        "Seed": seeds[i],
        })
    final_results_ridge.append({"Test size(%)": str(ripartition *100 ),
                          "Mean NRMSEPiqr":np.mean(NRMSE), 
                          "Var NRMSEPiqr": np.var(NRMSE),
                          "Mean Training Time (sec)":total_time/trials, 
                          "alpha": alpha })
       
        
    df = pd.DataFrame(predictions_ridge)
    df.to_excel(f"Ridge/Ridge(Scaler)_{ripartition}_predictions.xlsx")
    df = pd.DataFrame(result_NRMSEP_ridge)
    df.to_excel(f"Ridge/Ridge(Scaler)_{ripartition}_NRMSEP.xlsx")
df = pd.DataFrame(final_results_ridge)
df.to_excel(f"Ridge/Ridge(Scaler)_final.xlsx")

**LASSO**

In [9]:



if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" 

#number of trials
trials = 100
ranndom_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace= False)

ripartitions = [0.25,0.5,0.75]

results_lasso = []  


for ripartition in ripartitions:

        run_name = str(ripartition) + " ripartition"
        print(run_name)
        for i in range(trials):
            result = {"Ripartition":None,
                       "Trial":None,
                         "alpha_bayesian":None,
                           #"alpha_evolutionary":None,
                             "bayesian_time":None,
                             #"evolutionary_time" :None,
                             "NRMSEPiqr_bayesian":None,
                              #"NRMSEPiqr_evolutionary":None,
                              "seed": None}
            

            print("Trial:",i+1)
                
            result["Ripartition"] = ripartition
            result["Trial"] = i +1
            result["seed"] = seeds[i]
            #Ripartition in training and test and
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
            param_grid = {'lasso__alpha': Continuous(1/100000,0.01)}
            #sturges = int(1 + np.log(len(X_train)))    
            lasso_reg_pipeline = make_pipeline(StandardScaler(),Lasso())

            kf = KFold(n_splits=5, shuffle=True,random_state=ranndom_cv)
            
            '''
            evolved_estimator = GASearchCV(estimator=lasso_reg_pipeline,
                                  cv=kf,
                                  scoring=NRMSEPiqrscorer,
                                  param_grid=param_grid,
                                  n_jobs=-1,
                                  verbose=False,
                                  population_size=10,
                                  generations=10)
            '''
            
            bayesian_estimator = BayesSearchCV(
                                lasso_reg_pipeline,
                                {'lasso__alpha':  skopt.space.space.Real(1/100000,0.01,'log-uniform')},
                                cv=kf,
                                scoring=NRMSEPiqrscorer,
                                n_jobs=-1,
                                verbose= False,
                                 n_iter=15 )
            
            '''
            start_time = time.time()
            evolved_estimator.fit(X_train, Y_train)
            elapsed_time = (time.time() -start_time) #elapsed time in seconds
            '''

            #result["evolutionary_time"] = elapsed_time
            #result["alpha_evolutionary"] = float(evolved_estimator.best_params_["lasso__alpha"])
       
            start_time = time.time()
            bayesian_estimator.fit(X_train, Y_train)
            elapsed_time = (time.time() -start_time) #elapsed time in seconds

            result["bayesian_time"] = elapsed_time
            result["alpha_bayesian"] = float(bayesian_estimator.best_params_["lasso__alpha"])
    
           
            #result["NRMSEPiqr_evolutionary"] = NRMSEPiqr(Y_test, evolved_estimator.predict(X_test))
    

            result["NRMSEPiqr_bayesian"] =  NRMSEPiqr(Y_test,bayesian_estimator.predict(X_test))
    
            print(result)
            results_lasso.append(result)




0.25 ripartition
Trial: 1
{'Ripartition': 0.25, 'Trial': 1, 'alpha_bayesian': 0.0006513691661043033, 'bayesian_time': 7.928467750549316, 'NRMSEPiqr_bayesian': 0.44823223709444915, 'seed': 26}
Trial: 2
{'Ripartition': 0.25, 'Trial': 2, 'alpha_bayesian': 0.0006238406744048299, 'bayesian_time': 7.746749401092529, 'NRMSEPiqr_bayesian': 0.4820073670628663, 'seed': 83}
Trial: 3
{'Ripartition': 0.25, 'Trial': 3, 'alpha_bayesian': 0.00038915994128335083, 'bayesian_time': 7.837031364440918, 'NRMSEPiqr_bayesian': 0.4434306764845569, 'seed': 100}
Trial: 4
{'Ripartition': 0.25, 'Trial': 4, 'alpha_bayesian': 0.0006968995460255224, 'bayesian_time': 7.747294902801514, 'NRMSEPiqr_bayesian': 0.44673611427684595, 'seed': 88}
Trial: 5
{'Ripartition': 0.25, 'Trial': 5, 'alpha_bayesian': 0.0009554942329847754, 'bayesian_time': 8.061418294906616, 'NRMSEPiqr_bayesian': 0.42246213329644006, 'seed': 1}
Trial: 6
{'Ripartition': 0.25, 'Trial': 6, 'alpha_bayesian': 0.0008395529695703012, 'bayesian_time': 7.937999

In [10]:


df = pd.DataFrame(results_lasso)
df.to_excel("Lasso/Lasso_Bayes.xlsx")


In [11]:

lasso_alphas = {ripartitions[0]:0,ripartitions[1]:0,ripartitions[2]:0}

#Compute the mean for every ripartition
for trial in results_lasso:
    lasso_alphas[trial["Ripartition"]]+=trial["alpha_bayesian"]/trials



print(lasso_alphas)

{0.25: 0.0007512782429699331, 0.5: 0.0007491104519178924, 0.75: 0.0007354937589215901}


In [12]:


trials = 100
random_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace= False)

final_results_lasso = []
for alpha, ripartition in zip(lasso_alphas.values(),ripartitions):
    predictions_lasso = []
    result_NRMSEP_lasso = []
    NRMSE = []
    total_time = 0
    for i in range(trials):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
        lasso = make_pipeline(StandardScaler(),Lasso(alpha=alpha))
            
        start_time = time.time()
        lasso.fit(X_train, Y_train)
        elapsed_time = (time.time() -start_time) #elapsed time in seconds
        
        # Make predictions using the testing set
        Y_pred_lasso = lasso.predict(X_test)

        
        NRMSE.append(NRMSEPiqr(Y_test, Y_pred_lasso))
        total_time+=elapsed_time
        
        for k in range(len(Y_pred_lasso)):

          predictions_lasso.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_lasso),
                        "Training time": elapsed_time, 
                        "alpha": alpha, 
                        "Seed": seeds[i],
                          "Prediction": Y_pred_lasso[k], 
                          "Actual": Y_test[k] })
        
        result_NRMSEP_lasso.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_lasso),
                        "Training time": elapsed_time, 
                        "alpha": alpha, 
                        "Seed": seeds[i],
        })
    final_results_lasso.append({"Test size(%)": str(ripartition *100 ),
                          "Mean NRMSEPiqr":np.mean(NRMSE), 
                          "Var NRMSEPiqr": np.var(NRMSE),
                          "Mean Training Time (sec)":total_time/trials, 
                          "alpha": alpha })
       
        
    df = pd.DataFrame(predictions_lasso)
    df.to_excel(f"Lasso/Lasso(Scaler)_{ripartition}_predictions.xlsx")
    df = pd.DataFrame(result_NRMSEP_lasso)
    df.to_excel(f"Lasso/Lasso(Scaler)_{ripartition}_NRMSEP.xlsx")
df = pd.DataFrame(final_results_lasso)
df.to_excel(f"Lasso/Lasso(Scaler)_final.xlsx")

**ELASTIC NET**

In [13]:



if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" 

#number of trials
trials = 100
ranndom_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace= False)

ripartitions = [0.25,0.5,0.75]

results_elastic = []  


for ripartition in ripartitions:

        run_name = str(ripartition) + " ripartition"
        print(run_name)
        for i in range(trials):
            result = {"Ripartition":None,
                       "Trial":None,
                         "alpha_bayesian":None,
                           #"alpha_evolutionary":None,
                             "bayesian_time":None,
                             #"evolutionary_time" :None,
                             "NRMSEPiqr_bayesian":None,
                             "l1_ratio_bayesian": None,
                             # "NRMSEPiqr_evolutionary":None,
                              "seed": None}
            

            print("Trial:",i+1)
                
            result["Ripartition"] = ripartition
            result["Trial"] = i +1
            result["seed"] = seeds[i]
            #Ripartition in training and test and
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
            param_grid = {'elasticnet__alpha': Continuous(1/100000,0.01)}
            #sturges = int(1 + np.log(len(X_train)))    
            elastic_reg_pipeline = make_pipeline(StandardScaler(),ElasticNet())

            kf = KFold(n_splits=5, shuffle=True,random_state=ranndom_cv)
            '''
            evolved_estimator = GASearchCV(estimator=elastic_reg_pipeline,
                                  cv=kf,
                                  scoring=NRMSEPiqrscorer,
                                  param_grid=param_grid,
                                  n_jobs=-1,
                                  verbose=False,
                                  population_size=10,
                                  generations=10)
            '''
            bayesian_estimator = BayesSearchCV(
                                elastic_reg_pipeline,
                                {'elasticnet__alpha':  skopt.space.space.Real(1/100000,0.01,'log-uniform'),
                                 'elasticnet__l1_ratio': skopt.space.space.Real(0,1,'uniform')},
                                cv=kf,
                                scoring=NRMSEPiqrscorer,
                                n_jobs=-1,
                                verbose= False,
                                 n_iter=15 )
            '''
            start_time = time.time()
            evolved_estimator.fit(X_train, Y_train)
            elapsed_time = (time.time() -start_time) #elapsed time in seconds

            result["evolutionary_time"] = elapsed_time
            result["alpha_evolutionary"] = float(evolved_estimator.best_params_["elasticnet__alpha"])
            '''

            start_time = time.time()
            bayesian_estimator.fit(X_train, Y_train)
            elapsed_time = (time.time() -start_time) #elapsed time in seconds

            result["bayesian_time"] = elapsed_time
            result["alpha_bayesian"] = float(bayesian_estimator.best_params_["elasticnet__alpha"])
            result["l1_ratio_bayesian"] = float(bayesian_estimator.best_params_["elasticnet__l1_ratio"])
    
           
            #result["NRMSEPiqr_evolutionary"] = NRMSEPiqr(Y_test, evolved_estimator.predict(X_test))
    

            result["NRMSEPiqr_bayesian"] =  NRMSEPiqr(Y_test,bayesian_estimator.predict(X_test))
    
            print(result)
            results_elastic.append(result)




0.25 ripartition
Trial: 1
{'Ripartition': 0.25, 'Trial': 1, 'alpha_bayesian': 0.0012717248103334222, 'bayesian_time': 8.48729395866394, 'NRMSEPiqr_bayesian': 0.4418837292813918, 'l1_ratio_bayesian': 0.5420487122586486, 'seed': 26}
Trial: 2
{'Ripartition': 0.25, 'Trial': 2, 'alpha_bayesian': 0.00564524297067892, 'bayesian_time': 8.245477437973022, 'NRMSEPiqr_bayesian': 0.44885791868179503, 'l1_ratio_bayesian': 0.04198371649724098, 'seed': 83}
Trial: 3
{'Ripartition': 0.25, 'Trial': 3, 'alpha_bayesian': 0.0026380215253262234, 'bayesian_time': 8.051987886428833, 'NRMSEPiqr_bayesian': 0.4220395561781063, 'l1_ratio_bayesian': 0.21735226225961285, 'seed': 100}
Trial: 4
{'Ripartition': 0.25, 'Trial': 4, 'alpha_bayesian': 0.0008203900226017852, 'bayesian_time': 8.23078203201294, 'NRMSEPiqr_bayesian': 0.4458731537726274, 'l1_ratio_bayesian': 0.9474599464986425, 'seed': 88}
Trial: 5
{'Ripartition': 0.25, 'Trial': 5, 'alpha_bayesian': 0.002276121343301471, 'bayesian_time': 8.745054721832275, 'NRM

In [14]:
from collections import Counter

df = pd.DataFrame(results_elastic)
df.to_excel("Elastic/Elastic_Bayes.xlsx")



In [15]:
elastic_alphas = {ripartitions[0]:0,ripartitions[1]:0,ripartitions[2]:0}
elastic_ratio = {ripartitions[0]:0,ripartitions[1]:0,ripartitions[2]:0}

#Compute the mean for every ripartition
for trial in results_elastic:
    elastic_alphas[trial["Ripartition"]]+=trial["alpha_bayesian"]/trials
    elastic_ratio[trial["Ripartition"]]+=trial["l1_ratio_bayesian"]/trials


print(elastic_alphas)
print(elastic_ratio)

{0.25: 0.003003990406108207, 0.5: 0.003100424597781945, 0.75: 0.0022689956762280824}
{0.25: 0.3201375041104289, 0.5: 0.3066387786514995, 0.75: 0.41738125631373246}


In [16]:


trials = 100
random_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace= False)

final_results_elastic = []
for alpha, l1_ratio, ripartition in zip (elastic_alphas.values(), elastic_ratio.values(),ripartitions):
    predictions_elastic = []
    result_NRMSEP_elastic = []
    NRMSE = []
    total_time = 0
    for i in range(trials):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
        elastic = make_pipeline(StandardScaler(), ElasticNet(alpha=alpha, l1_ratio=l1_ratio))
            
        start_time = time.time()
        elastic.fit(X_train, Y_train)
        elapsed_time = (time.time() -start_time) #elapsed time in seconds
        
        # Make predictions using the testing set
        Y_pred_elastic = elastic.predict(X_test)

        
        NRMSE.append(NRMSEPiqr(Y_test, Y_pred_elastic))
        total_time+=elapsed_time
        
        for k in range(len(Y_pred_elastic)):

          predictions_elastic.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_elastic),
                        "Training time": elapsed_time, 
                        "alpha": alpha, 
                        "l1_ratio": l1_ratio,
                        "Seed": seeds[i],
                          "Prediction": Y_pred_elastic[k], 
                          "Actual": Y_test[k] })
        
        result_NRMSEP_elastic.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_elastic),
                        "Training time": elapsed_time, 
                        "alpha": alpha, 
                        "l1_ratio": l1_ratio,
                        "Seed": seeds[i],
        })
    final_results_elastic.append({"Test size(%)": str(ripartition *100 ),
                          "Mean NRMSEPiqr":np.mean(NRMSE), 
                          "Var NRMSEPiqr": np.var(NRMSE),
                          "Mean Training Time (sec)":total_time/trials, 
                          "alpha": alpha,
                           "l1_ratio": l1_ratio })
                           
    df = pd.DataFrame(predictions_elastic)
    df.to_excel(f"Elastic/Elastic(Scaler)_{ripartition}_predictions.xlsx")
    df = pd.DataFrame(result_NRMSEP_elastic)
    df.to_excel(f"Elastic/Elastic(Scaler)_{ripartition}_NRMSEP.xlsx")
df = pd.DataFrame(final_results_elastic)
df.to_excel(f"Elastic/Elastic(Scaler)_final.xlsx")