In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic, WhiteKernel
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer

In [2]:
# Loads the data and imputes the NA values using two different imputing techniques

def data_loading():
    """
    Load train/test CSV files, encode, scale and impute them.

    Reads "train.csv" and "test.csv" from the current working directory,
    separates the 'price_CHF' target from the training frame, one-hot encodes
    the 'season' column, standardizes the remaining feature columns with a
    scaler fitted on the training data, and fills missing values:
    an IterativeImputer (fitted on the training features) is applied to the
    train and test features, then a KNNImputer is run on the training features
    joined with the target, which fills whatever is still missing there
    (after the first step that can only be missing 'price_CHF' entries).

    Returns
    ----------
    X_train: numpy array, imputed training features
    y_train: numpy array, imputed training target ('price_CHF')
    X_test: pandas DataFrame, imputed test features (same column order as X_train)

    Raises
    ----------
    AssertionError: if the feature counts differ, the train rows and targets
        differ in length, or the test set does not have exactly 100 rows.
    """
    # Load the raw data
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")

    # Split the target off the training frame
    y_train = train_df['price_CHF']
    train_df = train_df.drop(['price_CHF'], axis=1)

    # Names of the numeric feature columns: everything except 'season',
    # wherever that column sits in the file.
    feature_names = test_df.drop('season', axis=1).columns.to_list()

    # One-hot encode the seasons. `sparse` was renamed to `sparse_output` in
    # scikit-learn 1.2 and removed in 1.4, so try the new keyword first and
    # fall back to the old one on releases that do not know it.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    season_train = encoder.fit_transform(train_df['season'].values.reshape(-1, 1))
    season_test = encoder.transform(test_df['season'].values.reshape(-1, 1))
    season_names = list(encoder.get_feature_names_out(['season']))

    # Scale train and test with the same scaler (fitted on train only)
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_df.drop('season', axis=1))
    scaled_test = scaler.transform(test_df.drop('season', axis=1))

    # Column labels are derived from the encoder output rather than assuming
    # a fixed number of season categories.
    colnames_list = season_names + feature_names
    colnames_list_2 = colnames_list + ['price_CHF']

    train_df = pd.concat(
        [pd.DataFrame(season_train, columns=season_names), pd.DataFrame(scaled_train)],
        axis=1,
    )
    train_df.columns = colnames_list
    test_df = pd.concat(
        [pd.DataFrame(season_test, columns=season_names), pd.DataFrame(scaled_test)],
        axis=1,
    )
    test_df.columns = colnames_list

    # Impute the training features
    imp = IterativeImputer(max_iter=1000, random_state=0)
    imp.fit(train_df)
    imp_df = pd.DataFrame(imp.transform(train_df))
    imp_df.columns = colnames_list

    # Join the target back and let KNN fill what is still missing
    imp_df_y = pd.concat([imp_df, y_train], axis=1)
    imputer = KNNImputer()
    imp_df_y = pd.DataFrame(imputer.fit_transform(imp_df_y))
    imp_df_y.columns = colnames_list_2

    # Impute the test features with the imputer fitted on the training data
    imp_test_df = pd.DataFrame(imp.transform(test_df))
    imp_test_df.columns = colnames_list

    X_train = imp_df_y.drop(['price_CHF'], axis=1).to_numpy()
    y_train = imp_df_y['price_CHF'].to_numpy()
    X_test = imp_test_df

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test

In [3]:
# Functions for picking the best kernel, reusing the cross-validation approach from Project 1
## Cross-validation is used to determine the best kernel
## The RMSE is computed for each cross-validation fold (10 in this notebook) and then averaged over the folds, giving one average RMSE per kernel

def calculate_RMSE(y_truth, y_pred):
    """
    Root-mean-square error between true values and predictions.

    Parameters
    ----------
    y_truth: array of floats, ground-truth values (its first dimension is the divisor)
    y_pred: array of floats, predicted values

    Returns
    ----------
    RMSE: scalar, square root of the summed squared differences divided by y_truth.shape[0]
    """
    residuals = y_truth - y_pred
    n_samples = y_truth.shape[0]
    squared_error_total = np.sum(residuals ** 2)
    RMSE = np.sqrt(squared_error_total / n_samples)

    # The result must collapse to a single number
    assert np.isscalar(RMSE)
    return RMSE


def average_LR_RMSE(X_train, y_train, kernels, n_folds):
    """
    Average cross-validated RMSE of a Gaussian process regressor per kernel.

    For every fold produced by KFold(n_folds) and every kernel in `kernels`,
    a GaussianProcessRegressor is fitted on the fold's training rows and
    scored with calculate_RMSE on the held-out rows.

    Parameters
    ----------
    X_train: 2-D numpy array, training inputs (indexed by row)
    y_train: 1-D numpy array, training targets
    kernels: sequence of kernel objects to compare
    n_folds: int, number of cross-validation folds

    Returns
    ----------
    avg_RMSE: array of floats: dim = (len(kernels),), mean RMSE over the folds
    """
    n_kernels = len(kernels)
    fold_scores = np.zeros((n_folds, n_kernels))
    splitter = KFold(n_folds)

    for fold_idx, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train)):
        # Slice this fold's fitting and held-out portions
        X_fit, y_fit = X_train[fit_rows, :], y_train[fit_rows]
        X_holdout, y_holdout = X_train[holdout_rows, :], y_train[holdout_rows]

        for kernel_idx, kernel in enumerate(kernels):
            model = GaussianProcessRegressor(kernel=kernel)
            model.fit(X_fit, y_fit)
            holdout_pred = model.predict(X_holdout)
            fold_scores[fold_idx, kernel_idx] = calculate_RMSE(y_holdout, holdout_pred)

    # Collapse the fold axis: one averaged score per kernel
    avg_RMSE = fold_scores.mean(axis=0)

    assert avg_RMSE.shape == (n_kernels,)
    return avg_RMSE

In [4]:
# Function that uses cross validation to test for different kernels, selects the best one and based on that does the predictions
def modeling_and_prediction(X_train, y_train, X_test):
    """
    Pick a Gaussian process kernel by cross-validation, refit and predict.

    Each candidate kernel is scored with 10-fold cross-validation
    (average_LR_RMSE); the kernel with the lowest average RMSE is used to fit
    a GaussianProcessRegressor on the full training data, which then predicts
    the test inputs.

    Parameters
    ----------
    X_train: matrix of floats: dim = (n_train, n_features), training input
    y_train: array of floats: dim = (n_train,), training output
    X_test: matrix of floats (array or DataFrame): dim = (n_test, n_features), test input

    Returns
    ----------
    y_pred: array of floats: dim = (n_test,), predictions on test set
    """
    # Work on plain arrays throughout. The model is fitted on an array, so a
    # DataFrame passed only at predict time makes scikit-learn warn about
    # feature names that were not present during fitting.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    # Candidate kernels to compare
    kernels = [DotProduct(), RBF(), Matern(), RationalQuadratic(), RationalQuadratic() + WhiteKernel(noise_level=0.05)]
    n_folds = 10

    kernel_summary = average_LR_RMSE(X_train, y_train, kernels, n_folds)

    # Lowest average RMSE wins
    best_kernel = kernels[np.argmin(kernel_summary)]

    gpr = GaussianProcessRegressor(kernel=best_kernel)
    gpr.fit(X_train, y_train)
    y_pred = gpr.predict(X_test)

    # One prediction per test row (previously hard-coded to 100 rows)
    assert y_pred.shape == (X_test.shape[0],), "Invalid data shape"
    return y_pred

In [5]:
# Main function
if __name__ == "__main__":
    # Load, encode, scale and impute the train/test data
    X_train, y_train, X_test = data_loading()
    # Select the kernel by cross-validation and predict on the test set
    y_pred = modeling_and_prediction(X_train, y_train, X_test)
    # Save results in the required format: a single 'price_CHF' column, no index
    dt = pd.DataFrame({'price_CHF': y_pred})
    dt.to_csv('results_final.csv', index=False)
    print("\nResults file successfully generated!")




Results file successfully generated!


