In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import warnings
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Categorical, Integer, Continuous
from sklearn.metrics import make_scorer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,  cross_validate
from sklearn.preprocessing import StandardScaler
import time
import sys, os
from sklearn.pipeline import make_pipeline
from skopt import BayesSearchCV
import skopt.space.space
from sklearn.model_selection import KFold


# Workbook that holds the dataset; only its "preproc" sheet is loaded.
path = 'datasets/Dataset_2.xlsx'
df = pd.read_excel(io=path, sheet_name="preproc")





  from .autonotebook import tqdm as notebook_tqdm


INPUT DATA

In [2]:
# Feature matrix: every column from position 14 onwards, as a NumPy array.
feature_columns = df.iloc[:, 14:]
X = feature_columns.values
print(X.shape)
# NOTE(review): the scaler is only instantiated here; none of the cells visible
# in this notebook fit or apply it — confirm whether scaling was intended.
std_scaler = StandardScaler()

(2905, 530)


OUTPUT DATA

In [3]:
# Target vector: the single column at position 7, as a NumPy array.
target_column = df.iloc[:, 7]
Y = target_column.values
print(Y.shape)

(2905,)


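**NRMSEPiqr** — root mean squared error of prediction normalised by the interquartile range of the observed values: $\mathrm{NRMSEP}_{iqr} = \mathrm{RMSEP} / (Q_3 - Q_1)$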

In [4]:


def NRMSEPiqr(observed_values, predicted_values):
    """Return the RMSE of prediction normalised by the IQR of the observations.

    Both inputs are converted to flat float arrays before the element-wise
    difference is taken, so plain lists, pandas Series and column vectors of
    shape (n, 1) all give the same result as 1-D NumPy arrays (no index
    alignment and no accidental (n,) vs (n, 1) broadcasting).

    Parameters
    ----------
    observed_values : array-like
        Reference values.
    predicted_values : array-like
        Predicted values, with the same number of elements as
        ``observed_values``.

    Returns
    -------
    numpy.float64
        ``RMSEP / (Q3 - Q1)``, where Q1 and Q3 are the 25th and 75th
        percentiles of ``observed_values``. When the IQR is zero the division
        yields ``inf`` (or ``nan`` if the RMSEP is zero as well).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    observed = np.asarray(observed_values, dtype=float).ravel()
    predicted = np.asarray(predicted_values, dtype=float).ravel()
    if observed.size != predicted.size:
        raise ValueError(
            "observed_values and predicted_values must have the same number "
            f"of elements (got {observed.size} and {predicted.size})"
        )

    # Root mean squared error of prediction.
    rmsep = np.sqrt(np.mean((observed - predicted) ** 2))

    # Interquartile range of the observed values (Q3 - Q1).
    Q1, Q3 = np.percentile(observed, [25, 75])
    IQR = Q3 - Q1

    return rmsep / IQR

# Scorer wrapper around NRMSEPiqr; greater_is_better=False declares the metric
# as a loss (lower is better) to the model-selection tools that consume it.
NRMSEPiqrscorer = make_scorer(score_func=NRMSEPiqr, greater_is_better=False)


SVR — Bayesian hyperparameter search (BayesSearchCV) over repeated train/test splits

In [None]:
# Silence warnings unless explicit -W options were given. The environment
# variable is set as well so that processes started later inherit the setting.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"


trials = 100

# np.random.seed() returns None, so the seed is kept as a real integer that can
# be handed to KFold; binding the return value of seed() would pass
# random_state=None and leave the CV shuffling unpinned.
CV_SEED = 777
np.random.seed(CV_SEED)

# One split seed per trial. NOTE: the draw is with replacement from
# range(trials), so some trials share a seed (and therefore a train/test split).
seeds = np.random.choice(trials, size=trials)

ripartitions = [0.25, 0.50, 0.75]

# Per-trial records (one dict per ripartition/trial pair), consumed by the
# export and averaging cells.
results = []

# The evolutionary (GASearchCV) variant of this experiment was disabled in the
# original cell; only the Bayesian search is run.
for ripartition in ripartitions:

    run_name = str(ripartition) + " ripartition"
    print(run_name)
    for i in range(trials):
        print("Trial:", i + 1)

        split_seed = int(seeds[i])
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=ripartition, random_state=split_seed)

        # NOTE(review): the features are passed to SVR unscaled — confirm this
        # is intended. `gamma` has no effect with the linear kernel.
        svr_regressor = SVR(kernel="linear", gamma="scale")

        # Same shuffled 5-fold partitioning scheme for every trial.
        kf = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

        # NOTE(review): the recorded run reports C at the upper bound (5000);
        # the C range may be too narrow — confirm.
        bayesian_estimator = BayesSearchCV(
            svr_regressor,
            {
                'C': skopt.space.space.Integer(1, 5000, "log-uniform"),
                "epsilon": skopt.space.space.Real(1e-4, 1, "log-uniform"),
            },
            cv=kf,
            scoring=NRMSEPiqrscorer,
            n_jobs=-1,
            verbose=False,
            n_iter=15,
            # Pin the optimiser's own randomness so a trial can be reproduced.
            random_state=split_seed,
        )

        # Wall-clock duration of the whole search (including the final refit).
        start_time = time.perf_counter()
        bayesian_estimator.fit(X_train, Y_train)
        elapsed_time = time.perf_counter() - start_time  # elapsed time in seconds

        best_params = bayesian_estimator.best_params_

        # Key order is kept as in the original record so the exported columns
        # do not move.
        result = {
            "Ripartition": ripartition,
            "Trial": i + 1,
            "C_bayesian": float(best_params["C"]),
            "epsilon_bayesian": float(best_params["epsilon"]),
            "bayesian_time": elapsed_time,
            # Score of the refitted best estimator on the held-out test split.
            "NRMSEPiqr_bayesian": NRMSEPiqr(Y_test, bayesian_estimator.predict(X_test)),
            "seed": split_seed,
        }

        print(result)
        results.append(result)

0.25 ripartition
Trial: 1
{'Ripartition': 0.25, 'Trial': 1, 'C_bayesian': 5000.0, 'epsilon_bayesian': 0.00037812012679725154, 'bayesian_time': 108.51133227348328, 'NRMSEPiqr_bayesian': 0.3712518605354573, 'seed': 47}
Trial: 2


In [6]:


# Dedicated name on purpose: rebinding `df` here would discard the loaded
# dataset that X and Y are sliced from, breaking a re-run of those cells.
bayes_results_df = pd.DataFrame(results)
bayes_results_df.to_excel("SVR_Bayes.xlsx")


In [7]:
from collections import Counter


# Per-split accumulators, keyed by test-size fraction.
svr_C = {ripartition: 0 for ripartition in ripartitions}
svr_epsilon = {ripartition: 0 for ripartition in ripartitions}
svr_kernel = {ripartition: None for ripartition in ripartitions}
svr_gamma = {ripartition: None for ripartition in ripartitions}

# Number of trials actually recorded for each split. Dividing by this count
# instead of the global `trials` keeps the mean correct when a split has fewer
# (or more) records than `trials`, e.g. after an interrupted search.
trials_per_ripartition = Counter(trial["Ripartition"] for trial in results)

# Compute the mean of the selected hyperparameters for every ripartition.
for trial in results:
    split = trial["Ripartition"]
    n_recorded = trials_per_ripartition[split]
    svr_C[split] += trial["C_bayesian"] / n_recorded
    svr_epsilon[split] += trial["epsilon_bayesian"] / n_recorded


print("Average C: ",svr_C)
print("Average epsilon: ",svr_epsilon)

Average C:  {0.25: 3761.44, 0.5: 3638.2899999999995, 0.75: 3916.7599999999984}
Average epsilon:  {0.25: 0.019753652811608926, 0.5: 0.04035784151296806, 0.75: 0.020908433809266994}


In [1]:

# Summary rows, one per test-size split. NOTE: this rebinds `results`, which the
# search cell filled with per-trial dicts; the name is kept because the cell
# that writes SVR_Results.xlsx reads it. Re-run the search/averaging cells
# before this one if the per-trial records are needed again.
results = []

ripartitions = [0.25, 0.50, 0.75]

# One panel per split, stacked vertically. squeeze=False keeps `axs` a 2-D
# array regardless of the number of splits, so flatten() always works.
fig, axs = plt.subplots(len(ripartitions), 1, figsize=(20, 25),
                        gridspec_kw={'hspace': 0.5}, squeeze=False)
fig.suptitle('SVR NITROGEN NRMSEPiqr best values test' + "\n")
axs = axs.flatten()

# Same seed and same draw as the search cell, so every trial is evaluated on
# the same train/test splits that were used during the search.
trials = 100
np.random.seed(777)
seeds = np.random.choice(trials, size=trials)

for k, (C, epsilon, ripartition) in enumerate(
        zip(svr_C.values(), svr_epsilon.values(), ripartitions)):

    NRMSE = []
    total_time = 0
    for i in range(trials):
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=ripartition, random_state=seeds[i])
        # Fixed hyperparameters: the per-split averages from the search.
        svr_regressor = SVR(C=C, epsilon=epsilon, kernel="linear", gamma="scale")

        start_time = time.perf_counter()
        svr_regressor.fit(X_train, Y_train)
        elapsed_time = time.perf_counter() - start_time  # elapsed time in seconds

        # Make predictions using the testing set
        Y_pred_svr = svr_regressor.predict(X_test)

        NRMSE.append(NRMSEPiqr(Y_test, Y_pred_svr))
        total_time += elapsed_time

    mean_nrmse = np.mean(NRMSE)
    results.append([str(ripartition * 100), mean_nrmse, np.var(NRMSE),
                    total_time / trials, C, epsilon])

    # Plot actual vs predicted values. The points come from the LAST trial of
    # this split only, while the panel title reports the mean over all trials.
    ax = axs[k]
    lower, upper = min(Y_test), max(Y_test)
    ax.scatter(Y_test, Y_pred_svr, c='blue', label='Actual vs Predicted')
    ax.plot([lower, upper], [lower, upper], '--', c='red', label='Perfect Prediction')
    ax.legend()
    ax.set_xlabel("Actual values")
    ax.set_ylabel("Predicted value")
    ax.set_title(f"Test size(%): {ripartition * 100}" + "\n" + "Mean NRMSEPiqr: " + str(mean_nrmse))

# Show the per-split summary table.
headers = ["Test size(%)", "Mean NRMSEPiqr", "Var NRMSEPiqr","Mean Training Time (sec)" ,"C" , "epsilon"]
print(tabulate(results, headers,  tablefmt="double_outline"))


NameError: name 'plt' is not defined

In [None]:
# Export the per-split summary table. A dedicated name avoids rebinding `df`,
# and the columns are labelled with the same headers used for the printed table
# instead of being left as positional integers.
svr_summary_df = pd.DataFrame(results, columns=headers)
svr_summary_df.to_excel("SVR_Results.xlsx")