In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [37]:
from tqdm import tqdm
import time

# Data Loading

In [3]:
def data_loading(train_path="train.csv", test_path="test.csv", label_col="price_CHF", n_neighbors=15):
    """
    Load the training and test data, one-hot encode the categorical first column,
    impute missing feature values with a KNN imputer and split off the labels.

    The imputer is fitted on the training features only (label column excluded)
    and then applied unchanged to the test features, so both sets are imputed by
    the same fitted model and the label never influences a feature value.
    Training rows whose label is missing still contribute to fitting the imputer
    but are dropped from the returned training set; labels are never imputed.

    Parameters
    ----------
    train_path: path or file-like object accepted by pandas.read_csv, training data
    test_path: path or file-like object accepted by pandas.read_csv, test data
    label_col: str, name of the label column in the training data
    n_neighbors: int, number of neighbours used by the KNN imputer

    Returns
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output with labels
    X_test: matrix of floats: dim = (100, ?), test input with features

    Raises
    ----------
    AssertionError: if the feature counts differ between train and test, if the
        number of training rows and labels differ, or if X_test does not have 100 rows
    KeyError: if the test data lacks one of the training feature columns
    """
    # Load training data
    train_df = pd.read_csv(train_path)

    print("Training data:")
    print("Shape:", train_df.shape)
    print(train_df.head(2))
    print('\n')

    # Load test data
    test_df = pd.read_csv(test_path)

    print("Test data:")
    print(test_df.shape)
    print(test_df.head(2))

    # The first column is the categorical one; every other column except the
    # label is a numeric feature. Columns are addressed by name so the result
    # does not depend on how many categories the encoder produces.
    cat_col = train_df.columns[0]
    feature_cols = [col for col in train_df.columns if col not in (cat_col, label_col)]

    # One-hot encode the categorical column; categories are learned from the training data
    encoder = OneHotEncoder()
    encoder.fit(train_df[[cat_col]].to_numpy())
    train_onehot = encoder.transform(train_df[[cat_col]].to_numpy()).toarray()
    test_onehot = encoder.transform(test_df[[cat_col]].to_numpy()).toarray()

    # Feature layout: one-hot columns first, then the numeric features in training-file order
    train_features = np.hstack([train_onehot, train_df[feature_cols].to_numpy(dtype=float)])
    test_features = np.hstack([test_onehot, test_df[feature_cols].to_numpy(dtype=float)])

    # KNN imputer: fit once on the training features, reuse the fitted model for the test set
    imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    imputed_train = imputer.fit_transform(train_features)
    imputed_test = imputer.transform(test_features)

    # Keep only the training rows that actually have a label
    labels = train_df[label_col].to_numpy(dtype=float)
    has_label = ~np.isnan(labels)

    X_train = imputed_train[has_label]
    y_train = labels[has_label]
    X_test = imputed_test

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test

In [4]:
# Build the model inputs once; data_loading() reads train.csv and test.csv
# and prints the shape and first two rows of each file.
X_train, y_train, X_test = data_loading()

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  


# Model Testing and Optimization

## Kernel Optimization 

### Matern Kernel 

In [10]:
# Candidate Matern hyper-parameter values: 100 log-spaced points from 1e-5 to 1e1.
# Both ranges hold the same values; nu_range is an independent copy.
ls_range = np.logspace(start=-5, stop=1, num=100, base=10)
nu_range = ls_range.copy()

In [16]:
# Hand-picked (length_scale, nu) pairs first, in row-major order ...
params = [(ls, nu) for ls in (0.05, 0.1, 1) for nu in (0.5, 1.5, 2.5)]
# ... followed by the full Cartesian product of the two log-spaced ranges.
params.extend((ls, nu) for ls in ls_range for nu in nu_range)

In [51]:
def Matern_para_select(X, y, n_folds, params=None, random_state=14):
    """
    Select Matern kernel parameters for a Gaussian process regressor by
    K-fold cross-validation, scoring each candidate with the mean R2.

    Parameters
    ----------
    X: matrix of floats, training input with features
    y: array of floats, training output with labels
    n_folds: int, number of cross-validation folds
    params: optional iterable of (length_scale, nu) pairs to evaluate. When
        None, the default grid is used: nine hand-picked pairs followed by the
        10 x 10 product of np.logspace(-2, 0.5, 10) with itself (109 pairs)
    random_state: int, seed for the shuffled K-fold split

    Returns
    ----------
    avg_R2score: array of floats, mean validation R2 per candidate, in the order of params
    best_param: tuple (length_scale, nu) with the highest mean validation R2

    Raises
    ----------
    ValueError: if params is given but empty

    Notes
    ----------
    GaussianProcessRegressor is used with its default settings; per the
    scikit-learn documentation the default optimizer tunes the kernel
    hyper-parameters during fit, so length_scale here is a starting value.
    The scikit-learn Matern documentation also states that nu values other
    than 0.5, 1.5, 2.5 and inf are considerably more expensive to evaluate,
    which applies to most of the default grid.
    """
    print("————Matern————")
    if params is None:
        ls_range = np.logspace(-2, 0.5, 10, base=10)
        nu_range = np.logspace(-2, 0.5, 10, base=10)
        params = [  (0.05, 0.5), (0.05, 1.5), (0.05, 2.5),
                    (0.1, 0.5), (0.1, 1.5), (0.1, 2.5),
                    (1, 0.5), (1, 1.5), (1, 2.5)]
        params.extend((ls, nu) for ls in ls_range for nu in nu_range)
    else:
        params = list(params)
        if not params:
            raise ValueError("params must contain at least one (length_scale, nu) pair")

    # Positional fancy indexing below needs arrays (a DataFrame/Series would not index by row position)
    X = np.asarray(X)
    y = np.asarray(y)

    R2score_mat = np.zeros((n_folds, len(params)))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # The context manager closes the bar even when a fit raises, so an aborted
    # run does not leave a stale progress bar behind for the next one.
    with tqdm(total=len(params) * n_folds) as pbar:
        for fold_idx, (fit_idx, val_idx) in enumerate(kf.split(X)):
            X_fit, X_val = X[fit_idx], X[val_idx]
            y_fit, y_val = y[fit_idx], y[val_idx]
            for idx, (ls, nu) in enumerate(params):
                gpr = GaussianProcessRegressor(kernel=Matern(length_scale=ls, nu=nu))
                gpr.fit(X_fit, y_fit)
                y_val_pred = gpr.predict(X_val)
                R2score_mat[fold_idx, idx] = r2_score(y_val, y_val_pred)
                pbar.update()

    avg_R2score = np.mean(R2score_mat, axis=0)
    best_idx = int(np.argmax(avg_R2score))
    best_param = params[best_idx]
    print("Best Param: ", best_param, " Score: ", avg_R2score[best_idx])
    print(avg_R2score)
    print("______________end_____________")
    return avg_R2score, best_param

In [52]:
# 10-fold cross-validated search over the Matern (length_scale, nu) candidates.
# NOTE(review): the recorded output below shows roughly 30 minutes for 1090 fits,
# so consider caching the result before re-running this cell.
n_folds = 10
avg_R2score_M, best_para_M = Matern_para_select(X_train, y_train, n_folds)




  0%|                                                   | 0/109 [00:00<?, ?it/s][A[A[A


  2%|▊                                          | 2/109 [00:00<00:09, 11.10it/s][A[A[A

————Matern————





  3%|█▏                                         | 3/109 [00:00<00:10,  9.94it/s][A[A[A


  5%|█▉                                         | 5/109 [00:00<00:09, 10.74it/s][A[A[A


  6%|██▊                                        | 7/109 [00:00<00:08, 11.49it/s][A[A[A


  8%|███▌                                       | 9/109 [00:00<00:08, 11.96it/s][A[A[A


 10%|████▏                                     | 11/109 [00:03<00:48,  2.00it/s][A[A[A


 11%|████▌                                     | 12/109 [00:05<01:21,  1.19it/s][A[A[A


 12%|█████                                     | 13/109 [00:06<01:30,  1.06it/s][A[A[A


 13%|█████▍                                    | 14/109 [00:09<02:29,  1.57s/it][A[A[A


 14%|█████▊                                    | 15/109 [00:11<02:49,  1.80s/it][A[A[A


 15%|██████▏                                   | 16/109 [00:12<02:04,  1.34s/it][A[A[A


 16%|██████▌                                   | 17/109 [00:12<01:34,  1.02s/

 76%|███████████████████████████████▉          | 83/109 [02:31<00:48,  1.87s/it][A[A[A


 77%|████████████████████████████████▎         | 84/109 [02:32<00:41,  1.67s/it][A[A[A


 78%|████████████████████████████████▊         | 85/109 [02:34<00:36,  1.53s/it][A[A[A


 79%|█████████████████████████████████▏        | 86/109 [02:36<00:38,  1.67s/it][A[A[A


 80%|█████████████████████████████████▌        | 87/109 [02:37<00:36,  1.64s/it][A[A[A


 81%|█████████████████████████████████▉        | 88/109 [02:40<00:40,  1.94s/it][A[A[A


 82%|██████████████████████████████████▎       | 89/109 [02:41<00:35,  1.79s/it][A[A[A


 83%|██████████████████████████████████▋       | 90/109 [02:43<00:32,  1.70s/it][A[A[A


 83%|███████████████████████████████████       | 91/109 [02:44<00:29,  1.64s/it][A[A[A


 84%|███████████████████████████████████▍      | 92/109 [02:46<00:25,  1.52s/it][A[A[A


 85%|███████████████████████████████████▊      | 93/109 [02:47<00:22,  1.41s/it]

218it [05:46,  1.34it/s][A[A[A


220it [05:47,  1.81it/s][A[A[A


221it [05:47,  2.35it/s][A[A[A


223it [05:47,  3.09it/s][A[A[A


225it [05:47,  4.03it/s][A[A[A


227it [05:47,  5.17it/s][A[A[A


229it [05:50,  1.79it/s][A[A[A


230it [05:51,  1.20it/s][A[A[A


231it [05:53,  1.02it/s][A[A[A


232it [05:54,  1.12s/it][A[A[A


233it [05:56,  1.43s/it][A[A[A


234it [05:57,  1.08s/it][A[A[A


235it [05:57,  1.20it/s][A[A[A


236it [05:57,  1.59it/s][A[A[A


237it [05:57,  2.11it/s][A[A[A


238it [05:58,  1.36it/s][A[A[A


239it [06:00,  1.15it/s][A[A[A


240it [06:01,  1.05s/it][A[A[A


241it [06:03,  1.18s/it][A[A[A


242it [06:04,  1.18s/it][A[A[A


243it [06:05,  1.17s/it][A[A[A


244it [06:07,  1.34s/it][A[A[A


245it [06:11,  2.12s/it][A[A[A


246it [06:13,  2.19s/it][A[A[A


247it [06:13,  1.61s/it][A[A[A


248it [06:15,  1.56s/it][A[A[A


249it [06:16,  1.56s/it][A[A[A


250it [06:18,  1.52s/it][A

406it [10:55,  3.93s/it][A[A[A


407it [10:56,  3.14s/it][A[A[A


408it [10:58,  2.69s/it][A[A[A


409it [10:59,  2.38s/it][A[A[A


410it [11:01,  2.10s/it][A[A[A


411it [11:02,  1.80s/it][A[A[A


412it [11:04,  1.97s/it][A[A[A


413it [11:06,  1.87s/it][A[A[A


414it [11:07,  1.78s/it][A[A[A


415it [11:09,  1.77s/it][A[A[A


416it [11:10,  1.68s/it][A[A[A


417it [11:12,  1.72s/it][A[A[A


418it [11:14,  1.65s/it][A[A[A


419it [11:15,  1.53s/it][A[A[A


420it [11:16,  1.42s/it][A[A[A


421it [11:19,  1.69s/it][A[A[A


422it [11:20,  1.49s/it][A[A[A


423it [11:21,  1.44s/it][A[A[A


424it [11:22,  1.32s/it][A[A[A


425it [11:25,  1.75s/it][A[A[A


426it [11:28,  2.21s/it][A[A[A


427it [11:29,  1.92s/it][A[A[A


428it [11:31,  1.78s/it][A[A[A


429it [11:33,  1.90s/it][A[A[A


430it [11:34,  1.69s/it][A[A[A


431it [11:35,  1.57s/it][A[A[A


432it [11:36,  1.39s/it][A[A[A


433it [11:39,  1.69s/it][A

578it [15:24,  1.77s/it][A[A[A


579it [15:25,  1.55s/it][A[A[A


580it [15:26,  1.40s/it][A[A[A


581it [15:28,  1.54s/it][A[A[A


582it [15:30,  1.78s/it][A[A[A


583it [15:33,  1.91s/it][A[A[A


584it [15:35,  2.03s/it][A[A[A


585it [15:37,  2.05s/it][A[A[A


586it [15:39,  1.96s/it][A[A[A


587it [15:41,  1.90s/it][A[A[A


588it [15:42,  1.85s/it][A[A[A


589it [15:44,  1.71s/it][A[A[A


590it [15:45,  1.51s/it][A[A[A


591it [15:47,  1.73s/it][A[A[A


592it [15:49,  1.95s/it][A[A[A


593it [15:51,  1.94s/it][A[A[A


594it [15:55,  2.50s/it][A[A[A


595it [15:57,  2.35s/it][A[A[A


596it [15:59,  2.19s/it][A[A[A


597it [16:00,  1.96s/it][A[A[A


598it [16:02,  1.83s/it][A[A[A


599it [16:03,  1.70s/it][A[A[A


600it [16:05,  1.60s/it][A[A[A


601it [16:06,  1.59s/it][A[A[A


602it [16:08,  1.62s/it][A[A[A


603it [16:10,  1.59s/it][A[A[A


604it [16:13,  2.30s/it][A[A[A


605it [16:15,  2.06s/it][A

753it [20:35,  1.04it/s][A[A[A


754it [20:36,  1.04s/it][A[A[A


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)



756it [20:42,  2.13s/it][A[A[A


757it [20:43,  1.83s/it][A[A[A


758it [20:44,  1.56s/it][A[A[A


759it [20:50,  2.77s/it][A[A[A


760it [20:51,  2.17s/it][A[A[A








763it [20:54,  1.33s/it][A[A[A


765it [20:54,  1.05it/s][A[A[A


766it [20:54,  1.43it/s][A[A[A


768it [20:55,  1.94it/s][A[A[A


769it [20:55,  2.55it/s][A[A[A


771it [20:55,  3.39it/s][A[A[A


773it [20:56,  2.56it/s][A[A[A


774it [20:58,  1.41it/s][A[A[A


775it [20:59,  1.11it/s][A[A[A


776it [21:00,  1.04s/it][A[A[A


777it [21:03,  1.49s/it][A[A[A


778it [21:05,  1.74s/it][A[A[A


779it [21:05,  1.29s/it][A[A[A


780it [21:06,  1.02it/s][A[A[A


781it [21:06,  1.37i

904it [24:30,  1.61s/it][A[A[A


905it [24:31,  1.46s/it][A[A[A


906it [24:32,  1.32s/it][A[A[A


907it [24:33,  1.24s/it][A[A[A


908it [24:35,  1.42s/it][A[A[A


909it [24:37,  1.74s/it][A[A[A


910it [24:40,  2.16s/it][A[A[A


911it [24:44,  2.51s/it][A[A[A


912it [24:45,  2.21s/it][A[A[A


913it [24:47,  2.07s/it][A[A[A


914it [24:49,  1.96s/it][A[A[A


915it [24:50,  1.79s/it][A[A[A


916it [24:51,  1.59s/it][A[A[A


917it [24:52,  1.46s/it][A[A[A


918it [24:55,  1.88s/it][A[A[A


919it [24:57,  1.86s/it][A[A[A


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)



921it [25:07,  3.66s/it][A[A[A


922it [25:09,  3.11s/it][A[A[A


923it [25:11,  3.00s/it][A[A[A


924it [25:13,  2.53s/it][A[A[A


925it [25:14,  2.18s/it][A[A[A


926it [25:15,  1.91s/it][

1085it [29:49,  1.41s/it][A[A[A


1086it [29:52,  1.65s/it][A[A[A


1087it [29:53,  1.45s/it][A[A[A








1090it [29:55,  1.65s/it][A[A[A


NameError: name 'avg_R2score' is not defined

### Rational Quadratic Kernel 