In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import warnings
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Integer
from sklearn.metrics import make_scorer
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split,  cross_validate
from sklearn.preprocessing import StandardScaler
import time
import sys, os
from sklearn.pipeline import make_pipeline
from skopt import BayesSearchCV
import skopt.space.space
from sklearn.model_selection import KFold


path = 'datasets/Dataset_2.xlsx'

# Read and load dataset
df= pd.read_excel(path, sheet_name="preproc")





  from .autonotebook import tqdm as notebook_tqdm


INPUT DATA

In [2]:
X = (df.iloc[:,14:]).values
print(X.shape)
std_scaler = StandardScaler()

(2905, 530)


OUTPUT DATA

In [3]:
Y = (df.iloc[:,7]).values
print(Y.shape)

(2905,)


**NRMSEPiqr**

In [4]:


def NRMSEPiqr(observed_values, predicted_values):
    # Calculate RMSEP
    rmsep = np.sqrt(np.mean((observed_values - predicted_values) ** 2))
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = np.percentile(observed_values, 25)
    Q3 = np.percentile(observed_values, 75)

    # Calculate IQR
    IQR = Q3 - Q1

    return rmsep/IQR

NRMSEPiqrscorer = make_scorer(NRMSEPiqr,greater_is_better=False)


PLS NRMSEPiqr

In [5]:
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" 


trials = 100
ranndom_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace=False)

ripartitions = [0.25, 0.50, 0.75]

results = []  

for ripartition in ripartitions:
        
    run_name = str(ripartition) + " ripartition"
    print(run_name)
    for i in range(trials):
        result = {"Ripartition":None,
                   "Trial":None,
                         "n_components_bayesian":None,
                           #"n_components_evolutionary":None,
                             "bayesian_time":None,
                             #"evolutionary_time" :None,
                             "NRMSEPiqr_bayesian":None,
                              #"NRMSEPiqr_evolutionary":None,
                               "seed": None}
            

        print("Trial:",i+1)
                
        result["Ripartition"] = ripartition
        result["Trial"] = i +1  
        result["seed"] = seeds[i]
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
        param_grid = {'pls__n_components': Integer(2,530,random_state=42),'pls_max_iter': Integer(100, 500)}  
        pls =  make_pipeline(StandardScaler(),PLSRegression())
        
        kf = KFold(n_splits=5, shuffle=True,random_state=ranndom_cv)
        #sturges = int(1 + np.log(len(X_train)))    
        '''       
        evolved_estimator = GASearchCV(estimator=pls,
                                cv=kf,
                                scoring=NRMSEPiqrscorer,
                                param_grid=param_grid,
                                n_jobs=-1,
                                verbose=False,
                                population_size=10,
                                generations=10)  
        '''
        
        bayesian_estimator = BayesSearchCV(
                                pls,
                                {'plsregression__n_components':skopt.space.space.Integer(1,100)},
                                cv=kf,
                                scoring=NRMSEPiqrscorer,
                                n_jobs=-1,
                                verbose= False,
                                 n_iter=15 )
        '''    
        start_time = time.time()
        evolved_estimator.fit(X_train, Y_train)
        elapsed_time = (time.time() -start_time) #elapsed time in seconds

        result["evolutionary_time"] = elapsed_time
        result["n_components_evolutionary"] = float(evolved_estimator.best_params_["n_components"])
       '''
        start_time = time.time()
        bayesian_estimator.fit(X_train, Y_train)
        elapsed_time = (time.time() -start_time) #elapsed time in seconds

        result["bayesian_time"] = elapsed_time
        result["n_components_bayesian"] = float(bayesian_estimator.best_params_["plsregression__n_components"])
    
           
        #result["NRMSEPiqr_evolutionary"] = NRMSEPiqr(Y_test, evolved_estimator.predict(X_test))
    

        result["NRMSEPiqr_bayesian"] =  NRMSEPiqr(Y_test,bayesian_estimator.predict(X_test))
    
        print(result)
        results.append(result)

0.25 ripartition
Trial: 1
{'Ripartition': 0.25, 'Trial': 1, 'n_components_bayesian': 80.0, 'bayesian_time': 7.193959712982178, 'NRMSEPiqr_bayesian': 0.3080228344248824, 'seed': 26}
Trial: 2
{'Ripartition': 0.25, 'Trial': 2, 'n_components_bayesian': 55.0, 'bayesian_time': 6.462155103683472, 'NRMSEPiqr_bayesian': 0.3007208441578556, 'seed': 83}
Trial: 3
{'Ripartition': 0.25, 'Trial': 3, 'n_components_bayesian': 53.0, 'bayesian_time': 6.659924745559692, 'NRMSEPiqr_bayesian': 0.2874402252047902, 'seed': 100}
Trial: 4
{'Ripartition': 0.25, 'Trial': 4, 'n_components_bayesian': 53.0, 'bayesian_time': 7.577728033065796, 'NRMSEPiqr_bayesian': 0.28019844491158197, 'seed': 88}
Trial: 5
{'Ripartition': 0.25, 'Trial': 5, 'n_components_bayesian': 67.0, 'bayesian_time': 6.584488391876221, 'NRMSEPiqr_bayesian': 0.300258127720901, 'seed': 1}
Trial: 6
{'Ripartition': 0.25, 'Trial': 6, 'n_components_bayesian': 58.0, 'bayesian_time': 6.614706516265869, 'NRMSEPiqr_bayesian': 0.30615401723923236, 'seed': 85

In [6]:


df = pd.DataFrame(results)
df.to_excel("PLS/PLS_Bayes.xlsx")



In [7]:
from collections import Counter

pls_components = {ripartitions[0]:0,ripartitions[1]:0,ripartitions[2]:0}

#Compute the mean for every ripartition
for trial in results:
    pls_components[trial["Ripartition"]]+=trial["n_components_bayesian"]/trials



print("Average values:",pls_components)



Average values: {0.25: 59.16999999999998, 0.5: 56.73000000000001, 0.75: 40.25}


PLS NRMSEPiqr best components

In [8]:

trials = 100
random_cv = np.random.seed(777)
seeds = np.random.choice(trials + 1, size=trials, replace= False)

final_results_pls = []
for n_components, ripartition in zip (pls_components.values(),ripartitions):
    predictions_pls = []
    result_NRMSEP_pls = []   
    
    n_components = round(n_components)
    NRMSE = []
    total_time = 0
    for i in range(trials):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=ripartition, random_state=seeds[i])
        pls_model = make_pipeline(StandardScaler(),PLSRegression(n_components=n_components))
            
        start_time = time.time()
        pls_model.fit(X_train, Y_train)
        elapsed_time = (time.time() -start_time) #elapsed time in seconds
        
        # Make predictions using the testing set
        Y_pred_pls = pls_model.predict(X_test)

        
        NRMSE.append(NRMSEPiqr(Y_test, Y_pred_pls))
        total_time+=elapsed_time
        
        for k in range(len(Y_pred_pls)):

          predictions_pls.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_pls),
                        "Training time": elapsed_time, 
                        "N_components": n_components, 
                        "Seed": seeds[i],
                          "Prediction": Y_pred_pls[k], 
                          "Actual": Y_test[k] })
          
        result_NRMSEP_pls.append({"Ripartition":str(ripartition *100 ),
                        "NRMSEPiqr": NRMSEPiqr(Y_test, Y_pred_pls),
                        "Training time": elapsed_time, 
                        "N_components": n_components,  
                        "Seed": seeds[i],
        })
    final_results_pls.append({"Test size(%)": str(ripartition *100 ),
                          "Mean NRMSEPiqr":np.mean(NRMSE), 
                          "Var NRMSEPiqr": np.var(NRMSE),
                          "Mean Training Time (sec)":total_time/trials, 
                          "N_components": n_components })     
    
    df = pd.DataFrame(predictions_pls)
    df.to_excel(f"PLS/PLS_{ripartition}_predictions.xlsx", index=False)
    df = pd.DataFrame(result_NRMSEP_pls)
    df.to_excel(f"PLS/PLS_{ripartition}_NRMSEP.xlsx",index=False)
df = pd.DataFrame(final_results_pls)
df.to_excel(f"PLS/PLS_final.xlsx",index=False)


