In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [2]:
from tqdm import tqdm
import time

# Data Load 

In [13]:
def data_loading():
    """
    Load the training and test CSV files, one-hot encode the categorical first column, fill in
    missing values with a distance-weighted KNN imputer and split the training data into
    features and labels.

    Training rows whose label ('price_CHF') is missing in the raw file are dropped after
    imputation, so an imputed label is never used as a regression target; those rows still take
    part in fitting the imputer on the training data.

    Parameters
    ----------
    Returns
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output with labels
    X_test: matrix of floats: dim = (100, ?), test input with features

    Raises
    ----------
    AssertionError: if X_train and X_test have different numbers of columns, if X_train and
        y_train have different numbers of rows, or if X_test does not have exactly 100 rows
    """
    label_name = 'price_CHF'
    column_to_encode = 0  # position of the categorical column in both CSV files

    # Load training data
    train_df = pd.read_csv("train.csv")

    print("Training data:")
    print("Shape:", train_df.shape)
    print(train_df.head(2))
    print('\n')

    # Load test data
    test_df = pd.read_csv("test.csv")

    print("Test data:")
    print(test_df.shape)
    print(test_df.head(2))

    train_arr = np.array(train_df)
    test_arr = np.array(test_df)

    # One-hot encoder, fitted on the training data only and reused for the test data
    encoder = OneHotEncoder()
    encoder.fit(train_arr[:, column_to_encode].reshape(-1, 1))

    def encode(arr):
        # Replace the categorical column by its one-hot columns, keeping the other columns in order
        one_hot = encoder.transform(arr[:, column_to_encode].reshape(-1, 1)).toarray()
        return np.concatenate((arr[:, :column_to_encode],
                               one_hot,
                               arr[:, column_to_encode + 1:]), axis=1)

    encoded_train_arr = encode(train_arr)
    encoded_test_arr = encode(test_arr)

    # Position of the label column after encoding: columns to the right of the encoded column
    # are shifted by (number of categories - 1)
    label_col = train_df.columns.get_loc(label_name)
    if label_col > column_to_encode:
        label_col += len(encoder.categories_[0]) - 1

    # KNN Imputer
    # NOTE: train and test are imputed by two independent fits; the training array still
    # contains the label column, so its fit cannot be applied to the test array as is
    imputer = KNNImputer(n_neighbors=10, weights='distance')
    imputed_train = imputer.fit_transform(encoded_train_arr)
    imputed_test = imputer.fit_transform(encoded_test_arr)

    # Keep only the training rows whose label was present in the raw data
    has_label = train_df[label_name].notna().to_numpy()
    imputed_train_refined = imputed_train[has_label]

    X_train = np.delete(imputed_train_refined, label_col, axis=1)
    y_train = imputed_train_refined[:, label_col]

    X_test = imputed_test

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test

In [14]:
# Run the loading/preprocessing step; it also prints the shape and first two rows of both CSV files
X_train, y_train, X_test = data_loading()

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  


# Model Test and Optimize

## Kernel Optimization 

### Matern Kernel 

In [10]:
# Candidate grids: 100 log-spaced values from 1e-5 to 1e1 (np.logspace uses base 10 by default)
ls_range = np.logspace(-5, 1, num=100)
nu_range = np.logspace(-5, 1, num=100)

In [16]:
# Candidate (length_scale, nu) pairs: nine hand-picked combinations first,
# then every combination of ls_range x nu_range (ls varies slowest)
params = [(ls, nu) for ls in (0.05, 0.1, 1) for nu in (0.5, 1.5, 2.5)]
params.extend((ls, nu) for ls in ls_range for nu in nu_range)

In [9]:
def Matern_para_select(X, y, n_folds, params=None):
    """
    Score Matern-kernel Gaussian process regressors with shuffled K-fold cross-validation and
    pick the (length_scale, nu) pair with the highest mean validation R2.

    NOTE: GaussianProcessRegressor is used with its default optimizer, which tunes the kernel's
    length_scale while fitting, so `ls` only sets the starting point of that search; `nu` is
    kept at the given value. Values of nu other than 0.5, 1.5, 2.5 or inf are documented by
    scikit-learn as considerably more expensive to evaluate.

    Parameters
    ----------
    X: matrix of floats, training input with features
    y: array of floats, training output with labels
    n_folds: int, number of cross-validation folds
    params: optional sequence of (length_scale, nu) tuples to evaluate; when None, the nine
        combinations of length_scale in (0.05, 0.1, 1) and nu in (0.5, 1.5, 2.5) are used

    Returns
    ----------
    avg_R2score: array of floats, dim = (len(params),), mean validation R2 per candidate
    best_param: tuple (length_scale, nu) with the highest mean R2 (the first one on ties)

    Raises
    ----------
    ValueError: if params is given but empty
    """
    print("————Matern————")
    if params is None:
        params = [(0.05, 0.5), (0.05, 1.5), (0.05, 2.5),
                  (0.1, 0.5), (0.1, 1.5), (0.1, 2.5),
                  (1, 0.5), (1, 1.5), (1, 2.5)]
    else:
        params = list(params)
    if not params:
        raise ValueError("params must contain at least one (length_scale, nu) pair")

    # Index with the fold's integer positions, whatever array-like was passed in
    X = np.asarray(X)
    y = np.asarray(y)

    R2score_mat = np.zeros((n_folds, len(params)))
    # Fixed seed so every candidate is scored on the same folds across runs
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=14)
    # The context manager closes the progress bar even if a fit raises
    with tqdm(total=len(params) * n_folds) as pbar:
        for fold_idx, (train, test) in enumerate(kf.split(X)):
            X_train, X_val, y_train, y_val = X[train], X[test], y[train], y[test]
            for idx, (ls, nu) in enumerate(params):
                gpr = GaussianProcessRegressor(kernel=Matern(length_scale=ls, nu=nu))
                gpr.fit(X_train, y_train)
                y_val_pred = gpr.predict(X_val)
                R2score_mat[fold_idx][idx] = r2_score(y_val, y_val_pred)
                pbar.update()

    avg_R2score = np.mean(R2score_mat, axis=0)
    best_idx = int(np.argmax(avg_R2score))
    print("Best Param: ", params[best_idx], " Score: ", avg_R2score[best_idx])
    print(avg_R2score)
    print("______________end_____________")
    return avg_R2score, params[best_idx]

#### KNN imputer with 15 neighbours and uniform weights

In [56]:
# params search: 10-fold cross-validation over the candidate (length_scale, nu) pairs;
# returns the mean R2 per candidate and the best pair
n_folds = 10
avg_R2score_M, best_para_M = Matern_para_select(X_train, y_train, n_folds)




  0%|                                                  | 0/1090 [00:00<?, ?it/s][A[A[A


  0%|                                          | 2/1090 [00:00<01:32, 11.81it/s][A[A[A

————Matern————





  0%|                                          | 3/1090 [00:00<01:47, 10.12it/s][A[A[A


  0%|▏                                         | 5/1090 [00:00<01:41, 10.69it/s][A[A[A


  1%|▎                                         | 7/1090 [00:00<01:35, 11.35it/s][A[A[A


  1%|▎                                         | 9/1090 [00:00<01:29, 12.02it/s][A[A[A


  1%|▍                                        | 11/1090 [00:03<09:02,  1.99it/s][A[A[A


  1%|▍                                        | 12/1090 [00:05<15:12,  1.18it/s][A[A[A


  1%|▍                                        | 13/1090 [00:06<17:07,  1.05it/s][A[A[A


  1%|▌                                        | 14/1090 [00:09<28:39,  1.60s/it][A[A[A


  1%|▌                                        | 15/1090 [00:12<32:45,  1.83s/it][A[A[A


  1%|▌                                        | 16/1090 [00:12<24:16,  1.36s/it][A[A[A


  2%|▋                                        | 17/1090 [00:12<18:25,  1.03s/

 15%|██████                                  | 166/1090 [04:23<25:59,  1.69s/it][A[A[A


 15%|██████▏                                 | 167/1090 [04:25<27:35,  1.79s/it][A[A[A


 15%|██████▏                                 | 168/1090 [04:29<37:48,  2.46s/it][A[A[A


 16%|██████▏                                 | 169/1090 [04:31<33:15,  2.17s/it][A[A[A


 16%|██████▏                                 | 170/1090 [04:32<31:36,  2.06s/it][A[A[A


 16%|██████▎                                 | 171/1090 [04:34<30:00,  1.96s/it][A[A[A


 16%|██████▎                                 | 172/1090 [04:35<26:59,  1.76s/it][A[A[A


 16%|██████▎                                 | 173/1090 [04:37<24:43,  1.62s/it][A[A[A


 16%|██████▍                                 | 174/1090 [04:38<24:06,  1.58s/it][A[A[A


 16%|██████▍                                 | 175/1090 [04:40<24:07,  1.58s/it][A[A[A


 16%|██████▍                                 | 176/1090 [04:42<28:47,  1.89s/it]




 30%|████████████                            | 327/1090 [10:00<11:31,  1.10it/s][A[A[A


 30%|████████████                            | 329/1090 [10:00<08:30,  1.49it/s][A[A[A


 30%|████████████                            | 330/1090 [10:01<06:33,  1.93it/s][A[A[A


 30%|████████████▏                           | 331/1090 [10:01<04:58,  2.54it/s][A[A[A


 30%|████████████▏                           | 332/1090 [10:01<03:59,  3.17it/s][A[A[A


 31%|████████████▏                           | 333/1090 [10:01<03:17,  3.83it/s][A[A[A


 31%|████████████▎                           | 335/1090 [10:01<02:40,  4.69it/s][A[A[A


 31%|████████████▎                           | 337/1090 [10:03<05:11,  2.41it/s][A[A[A


 31%|████████████▍                           | 338/1090 [10:05<10:37,  1.18it/s][A[A[A


 31%|████████████▍                           | 339/1090 [10:07<16:38,  1.33s/it][A[A[A


 31%|████████████▍                           | 340/1090 [10:09<18:53,  1.51s/

 44%|█████████████████▌                      | 480/1090 [15:51<38:16,  3.76s/it][A[A[A


 44%|█████████████████▋                      | 481/1090 [15:53<31:11,  3.07s/it][A[A[A


 44%|█████████████████▋                      | 482/1090 [15:55<28:20,  2.80s/it][A[A[A


 44%|█████████████████▋                      | 483/1090 [15:57<25:26,  2.51s/it][A[A[A


 44%|█████████████████▊                      | 484/1090 [16:00<26:58,  2.67s/it][A[A[A


 44%|█████████████████▊                      | 485/1090 [16:02<25:55,  2.57s/it][A[A[A


 45%|█████████████████▊                      | 486/1090 [16:04<25:28,  2.53s/it][A[A[A


 45%|█████████████████▊                      | 487/1090 [16:07<24:36,  2.45s/it][A[A[A


 45%|█████████████████▉                      | 488/1090 [16:09<22:47,  2.27s/it][A[A[A


 45%|█████████████████▉                      | 489/1090 [16:11<22:07,  2.21s/it][A[A[A


 45%|█████████████████▉                      | 490/1090 [16:13<23:18,  2.33s/it]

 59%|███████████████████████▋                | 647/1090 [21:56<16:55,  2.29s/it][A[A[A


 59%|███████████████████████▊                | 648/1090 [21:58<16:13,  2.20s/it][A[A[A


 60%|███████████████████████▊                | 649/1090 [22:00<14:23,  1.96s/it][A[A[A


 60%|███████████████████████▊                | 650/1090 [22:02<14:45,  2.01s/it][A[A[A


 60%|███████████████████████▉                | 651/1090 [22:03<12:53,  1.76s/it][A[A[A








 60%|████████████████████████                | 654/1090 [22:06<07:37,  1.05s/it][A[A[A


 60%|████████████████████████                | 655/1090 [22:06<05:32,  1.31it/s][A[A[A


 60%|████████████████████████                | 656/1090 [22:06<04:15,  1.70it/s][A[A[A


 60%|████████████████████████                | 657/1090 [22:06<03:22,  2.14it/s][A[A[A


 60%|████████████████████████▏               | 658/1090 [22:06<02:35,  2.77it/s][A[A[A


 60%|████████████████████████▏               | 659/1090 [22:06<02:03,  3.4

 73%|█████████████████████████████           | 793/1090 [27:19<17:35,  3.55s/it][A[A[A


 73%|█████████████████████████████▏          | 794/1090 [27:22<16:57,  3.44s/it][A[A[A


 73%|█████████████████████████████▏          | 795/1090 [27:24<15:17,  3.11s/it][A[A[A


 73%|█████████████████████████████▏          | 796/1090 [27:26<12:58,  2.65s/it][A[A[A


 73%|█████████████████████████████▏          | 797/1090 [27:28<11:44,  2.41s/it][A[A[A


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)



 73%|█████████████████████████████▎          | 799/1090 [27:37<19:31,  4.03s/it][A[A[A


 73%|█████████████████████████████▎          | 800/1090 [27:41<19:16,  3.99s/it][A[A[A


 73%|█████████████████████████████▍          | 801/1090 [27:44<17:18,  3.59s/it][A[A[A


 74%|█████████████████████████████▍          |

 87%|██████████████████████████████████▊     | 947/1090 [33:22<05:51,  2.46s/it][A[A[A


 87%|██████████████████████████████████▊     | 948/1090 [33:24<05:31,  2.33s/it][A[A[A


 87%|██████████████████████████████████▊     | 949/1090 [33:25<05:03,  2.15s/it][A[A[A


 87%|██████████████████████████████████▊     | 950/1090 [33:28<05:10,  2.22s/it][A[A[A


 87%|██████████████████████████████████▉     | 951/1090 [33:31<06:06,  2.64s/it][A[A[A


 87%|██████████████████████████████████▉     | 952/1090 [33:33<05:33,  2.42s/it][A[A[A


 87%|██████████████████████████████████▉     | 953/1090 [33:36<05:28,  2.40s/it][A[A[A


 88%|███████████████████████████████████     | 954/1090 [33:38<05:22,  2.37s/it][A[A[A


 88%|███████████████████████████████████     | 955/1090 [33:41<06:03,  2.69s/it][A[A[A


 88%|███████████████████████████████████     | 956/1090 [33:44<06:10,  2.77s/it][A[A[A


 88%|███████████████████████████████████     | 957/1090 [33:46<05:42,  2.57s/it]

Best Param:  (0.464158883361278, 0.464158883361278)  Score:  0.9831558640535644
[ 0.98314719  0.97809316  0.97068473  0.98314719  0.97809316  0.97068473
  0.98314719  0.97809316  0.97068473  0.90828893  0.94051309  0.95981352
  0.97146949  0.97822989  0.98187398 -1.91659259 -1.91659268 -1.91659268
 -1.91659268  0.90828894  0.94051309  0.95981352  0.97146949  0.97822989
  0.98187399  0.98315586  0.98187171  0.9768908   0.09549496  0.90828894
  0.94051309  0.95981352  0.97146949  0.97822989  0.98187398  0.98315585
  0.98187175  0.67057343  0.96541551  0.90828894  0.94051309  0.95981352
  0.97146949  0.97822989  0.98187398  0.98315586  0.98187171  0.97689256
  0.96611904  0.90828894  0.94051309  0.95981352  0.97146949  0.97822989
  0.98187399  0.98315585  0.98187172  0.9768926   0.96611897  0.90828894
  0.9405131   0.95981352  0.97146949  0.97822989  0.98187398  0.98315586
  0.98187172  0.97689257  0.96611903  0.90828893  0.94051309  0.95981352
  0.97146949  0.97822989  0.98187399  0.9831




In [6]:
# Refit on the full training set with the pair reported as "Best Param" by the search output
# above, then predict the test set
gpr = GaussianProcessRegressor(
    kernel=Matern(length_scale=0.464158883361278, nu=0.464158883361278)
).fit(X_train, y_train)
y_pred = gpr.predict(X_test)

In [7]:
# Save results in the required format
dt = pd.DataFrame(y_pred) 
dt.columns = ['price_CHF']
dt.to_csv('results.csv', index=False)

#### KNN imputer with 15 neighbours and distance weights

In [17]:
# Repeat the 10-fold Matern parameter search on the current X_train / y_train
n_folds = 10
avg_R2score_M, best_para_M = Matern_para_select(X_train, y_train, n_folds)

  2%|▉                                           | 2/90 [00:00<00:07, 11.33it/s]

————Matern————


100%|███████████████████████████████████████████| 90/90 [00:08<00:00, 10.09it/s]

Best Param:  (1, 0.5)  Score:  0.9846504913006238
[0.98465049 0.98059944 0.97467688 0.98465049 0.98059944 0.97467688
 0.98465049 0.98059944 0.97467688]
______________end_____________





In [11]:
# Fit with the same kernel settings as the earlier section's best pair.
# NOTE: y_pred from this fit is overwritten by the next cell before the results are saved.
gpr = GaussianProcessRegressor(
    kernel=Matern(length_scale=0.464158883361278, nu=0.464158883361278)
).fit(X_train, y_train)
y_pred = gpr.predict(X_test)

In [15]:
# Refit on the full training set with the pair reported as "Best Param" by the search output
# above, (1, 0.5), then predict the test set
gpr = GaussianProcessRegressor(
    kernel=Matern(length_scale=1, nu=0.5)
).fit(X_train, y_train)
y_pred = gpr.predict(X_test)

In [16]:
# Save results in the required format
dt = pd.DataFrame(y_pred) 
dt.columns = ['price_CHF']
dt.to_csv('results.csv', index=False)