In [19]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic, Product, Sum, ExpSineSquared
from sklearn.model_selection import KFold

In [2]:
def impute_data(data):

    split = data.groupby(data.loc[:,"season"])

    imputer = KNNImputer()
    imputed_data = []
    
    groups = ["spring", "summer", "autumn", "winter"]
    for group in groups:

        full_group = split.get_group(group)
        season = full_group.loc[:,"season"] 
        group_to_impute = full_group.drop(['season'],axis=1)
        imputed_group = imputer.fit_transform(group_to_impute)
        imputed_group = pd.DataFrame(imputed_group, columns = group_to_impute.columns, index =  season.index)
        output_group = pd.concat([season, imputed_group], axis=1) 
        imputed_data.append(output_group)

            
    imputed_data = pd.concat(imputed_data)
    imputed_data = imputed_data.sort_index()

    return imputed_data

In [20]:
def data_loading():
    """
    This function loads the training and test data, preprocesses it, removes the NaN values and interpolates the missing 
    data using imputation

    Parameters
    ----------
    Returns
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output with labels
    X_test: matrix of floats: dim = (100, ?), test input with features
    """
    # Load training data
    train_df = pd.read_csv("train.csv")
    
    print("Training data:")
    print("Shape:", train_df.shape)
    print(train_df.head(2))
    print('\n')
    
    # Load test data
    test_df = pd.read_csv("test.csv")

    print("Test data:")
    print(test_df.shape)
    print(test_df.head(2))

    # Data preprocessing, imputation and extract X_train, y_train and X_test
    imputed_train = impute_data(train_df)
    imputed_test = impute_data(test_df)

    # Substitute seasons for values: 0 or 1

    encoder = OneHotEncoder(sparse=False)
    encoded_seasons = encoder.fit_transform(imputed_train['season'].values.reshape(-1, 1))
    encoded_seasons_test = encoder.transform(imputed_test['season'].values.reshape(-1, 1))
    encoded_seasons_df = pd.DataFrame(encoded_seasons, columns=encoder.get_feature_names_out(['season']))
    encoded_seasons_df_test = pd.DataFrame(encoded_seasons_test, columns=encoder.get_feature_names_out(['season']))
    imputed_train = pd.concat([encoded_seasons_df, imputed_train.drop('season', axis=1)], axis=1)
    imputed_test = pd.concat([encoded_seasons_df_test, imputed_test.drop('season', axis=1)], axis=1)

    X_train = imputed_train.drop(['price_CHF'],axis=1).to_numpy()
    y_train = imputed_train['price_CHF'].to_numpy()
    X_test = imputed_test.to_numpy()

    # scaler = StandardScaler()
    # X_train =  scaler.fit_transform(X_train)
    # X_test=  scaler.transform(X_test)

    #assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test

In [21]:
X_train, y_train, X_test = data_loading() 

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA   
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN  \
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  


Test data:
(100, 10)
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK   
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703  \
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   

   price_ITA  price_POL  price_SVK  
0        NaN   3.298693   1.921886  
1  -1.420091   3.238307        NaN  




In [29]:
# Functions for picking the best kernel by implementing cross validation

# MAY BE USE R SQUARED AS METRIC FOR CROSS VALIDATION INSTEAD OF RMSE?

def calculate_RMSE(y_truth, y_pred):
    
    RMSE=np.sqrt(np.sum((y_truth-y_pred)**2)/y_truth.shape[0])

    assert np.isscalar(RMSE)
    return RMSE


def average_LR_RMSE(X_train, y_train, kernels, n_folds):

    RMSE_mat = np.zeros((n_folds, len(kernels)))        
    kf=KFold(n_folds)

    for kf_i, (train_index, test_index) in enumerate(kf.split(X_train)):
        
        train_data=X_train[train_index,:]
        train_labels=y_train[train_index]
        test_data=X_train[test_index,:]
        test_labels=y_train[test_index]

        for k_i, k in enumerate(kernels):

            gpr = GaussianProcessRegressor(kernel=k) #n_restarts_optimizer = 10
            gpr.fit(train_data, train_labels)

            y_pred=gpr.predict(test_data)
            
            run_RMSE=calculate_RMSE(test_labels, y_pred)

            RMSE_mat[kf_i,k_i]=run_RMSE
    
    avg_RMSE = np.mean(RMSE_mat, axis=0)
    
    assert avg_RMSE.shape == (len(kernels),)
    return avg_RMSE

In [27]:
# ConvergenceWarning: The optimal value found for dimension 0 of parameter k1__length_scale is close to the specified lower bound 1e-05. Decreasing the bound and calling fit again may find a better value.
# Change the parameters to fit

def modeling_and_prediction(X_train, y_train, X_test):
    """
    This function defines the model, fits training data and then does the prediction with the test data 

    Parameters
    ----------
    X_train: matrix of floats, training input with 10 features
    y_train: array of floats, training output
    X_test: matrix of floats: dim = (100, ?), test input with 10 features

    Returns
    ----------
    y_test: array of floats: dim = (100,), predictions on test set
    """

    #TODO: Define the model and fit it using training data. Then, use test data to make predictions 
    kernels = [DotProduct(), RBF(), Matern(), RationalQuadratic(), Product(DotProduct(),RationalQuadratic()), Sum(RBF(),Matern()), Sum(DotProduct(), RationalQuadratic()), Sum(RBF(),RationalQuadratic()), Sum(Matern(),RationalQuadratic())]
    #kernels = [DotProduct(), RBF(), Matern(), RationalQuadratic()]
    #kernels = [ExpSineSquared(length_scale = 1.0, periodicity = 4)]

    n_folds = 10

    kernel_summary = average_LR_RMSE(X_train, y_train, kernels, n_folds)

    best_kernel = kernels[np.argmax(kernel_summary)]

    gpr = GaussianProcessRegressor(kernel=best_kernel)
    gpr.fit(X_train, y_train)
    y_pred = gpr.predict(X_test)

    assert y_pred.shape == (100,), "Invalid data shape"
    return y_pred, best_kernel, kernel_summary

In [28]:
y_pred, best_kernel, kernel_summary = modeling_and_prediction(X_train, y_train, X_test)



In [14]:
y_pred

array([-1.46818428,  1.79046239, -1.24501772, -0.72095117, -0.90512172,
        2.19252579,  0.78368047,  2.04138866,  0.3012183 ,  2.34136217,
        2.23466412,  2.97537568,  1.74043398,  3.13598542,  2.66889499,
        1.78282577,  1.64670858,  2.48934502,  2.03442861,  1.37016668,
        1.74027616,  2.45862436,  3.13167353,  3.00916411,  3.65571873,
        4.79455528,  5.70726227,  7.83764706,  8.94572208,  9.21937903,
        8.90993227,  7.8793992 ,  7.93313439,  7.45554671,  7.62334532,
        7.71400213,  7.52551312,  7.9736322 ,  7.96784803,  7.2759808 ,
        7.76364043,  7.897074  ,  7.93129556,  7.78180917,  7.76220976,
        8.14667474,  7.7083312 ,  8.14326486,  7.8862497 ,  7.87208075,
        7.60126754,  7.73096487,  7.80603012,  8.51799994,  7.78352523,
        9.06344245,  8.34138277,  8.29085505,  6.91779943,  6.35964953,
        6.94175796,  5.7026033 ,  5.37811966,  5.49091646,  5.20100524,
        5.27301013,  4.84302797,  4.73194778,  4.24511547,  5.30

In [17]:
y_pred

array([-1.51333532,  2.86859805, -1.5486578 , -0.33937393, -0.49528063,
        3.10390068,  0.57532579,  1.89730729,  0.22689235,  4.19678375,
        1.98248462,  2.95558527,  2.30719446,  4.47860594,  2.62200826,
        2.33137736,  1.36629058,  2.67343322,  2.38162737,  0.72343158,
        2.37624577,  2.10702976,  3.69116628,  3.44987299,  4.3522458 ,
        4.91240802,  5.44266864,  7.90020018,  8.93122756,  9.15141288,
        8.90437714,  7.92745998,  7.90415607,  7.52401819,  7.72123636,
        7.82271409,  7.51512778,  7.92518056,  8.12169506,  7.30911854,
        7.92025858,  7.78188048,  8.03377746,  7.89497639,  7.83980518,
        8.15226694,  7.691479  ,  8.14456114,  7.85979573,  7.82209812,
        7.70540805,  7.77079755,  7.77764972,  8.59331057,  7.85305483,
        8.99659893,  8.37325161,  8.15070318,  6.74971205,  6.3311147 ,
        6.95966169,  5.80729368,  5.72793613,  5.7699075 ,  5.28276161,
        5.29759637,  4.54689967,  4.44553486,  4.55452181,  5.32