## Libraries
Libraries being used in this notebook.

In [33]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import CompoundKernel, ConstantKernel, DotProduct, ExpSineSquared, Exponentiation, Hyperparameter, Kernel, Matern, PairwiseKernel, Product, RationalQuadratic, RBF, Sum, WhiteKernel

## Method `data_loading()`
First, we load the data from the CSV files into pandas dataframes. The data is then preprocessed by replacing the categorical values in the `season` column with numerical values. Next, we use the K-Nearest Neighbors (KNN) imputer to fill in the missing values: the imputer is first fitted on the training data and then used to transform both the training and the test data via the `transform` method. Finally, we split the data into the features and the target variable.

In [57]:
def data_loading():
    """
    Load the training and test data, encode the seasons as numbers and fill in
    all missing values using KNN imputation.

    No rows are dropped: missing entries in the training target are imputed
    (by a second imputer fitted on the full training frame) rather than removed.

    Parameters
    ----------
    Returns
    ----------
    X_train: matrix of floats, imputed training input with features
    y_train: array of floats, imputed training output with labels
    X_test: matrix of floats: dim = (100, ?), imputed test input with features
    """
    target_column = 'price_CHF'
    # The codes stay strings so that `replace` never has to change a column's
    # dtype; the explicit float cast in `encode_seasons` turns them into numbers.
    season_codes = {'spring': '0', 'summer': '1', 'autumn': '2', 'winter': '3'}

    def encode_seasons(frame):
        # Season names -> numeric codes, with the whole frame converted to
        # floats instead of relying on the imputer's implicit coercion.
        return frame.replace(season_codes).astype(float)

    # Load training and test data
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")

    train_encoded = encode_seasons(train_df)
    X_test = encode_seasons(test_df)
    X_train = train_encoded.drop(columns=[target_column])

    # Feature imputer: fitted on the training features only, then applied to
    # both the training and the test features.
    feature_imputer = KNNImputer(n_neighbors=2)
    feature_imputer.fit(X_train)
    X_train = feature_imputer.transform(X_train)
    X_test = feature_imputer.transform(X_test)

    # Target imputer: fitted on the full training frame (features + target) so
    # that missing prices are filled in from the neighbouring rows.
    target_imputer = KNNImputer(n_neighbors=2)
    target_imputer.fit(train_encoded)
    train_imputed = np.asarray(target_imputer.transform(train_encoded))

    # Locate the target by name rather than by a hard-coded position, so a
    # change in the CSV column order cannot silently select the wrong column.
    target_index = train_encoded.columns.get_loc(target_column)
    y_train = train_imputed[:, target_index]

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test

## Method `modeling_and_prediction`
This method takes three arguments: `X_train`, `y_train` and `X_test`. Initially I tried using linear regression, but that gave me a very low score. Instead, as suggested in the assignment, I used a Gaussian Process regressor (`GaussianProcessRegressor`) to fit the training data and then make the prediction on the test data. I tried multiple different kernels, but the best one I found was `RationalQuadratic`, which my partner figured out.

In [58]:
def modeling_and_prediction(X_train, y_train, X_test):
    """
    Fit a Gaussian process regressor on the training data and predict the
    targets for the test data.

    Parameters
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output
    X_test: matrix of floats: dim = (100, ?), test input with features

    Returns
    ----------
    y_pred: array of floats: dim = (100,), predictions on test set
    """
    # RationalQuadratic was marked as the best kernel among those tried from
    # sklearn.gaussian_process.kernels (ConstantKernel, DotProduct,
    # ExpSineSquared, Matern, PairwiseKernel, RBF, WhiteKernel, ...).
    model = GaussianProcessRegressor(kernel=RationalQuadratic())

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The results file is expected to contain exactly 100 predictions
    assert y_pred.shape == (100,), "Invalid data shape"
    return y_pred

In [59]:
if __name__ == "__main__":
    # Load, encode and impute the training and test data
    X_train, y_train, X_test = data_loading()

    # Fit the regression model and predict the prices of the test set
    y_pred = modeling_and_prediction(X_train, y_train, X_test)

    # Store the predictions as a single `price_CHF` column, without the index
    dt = pd.DataFrame(y_pred, columns=['price_CHF'])
    dt.to_csv('results.csv', index=False)

    print("\nResults file successfully generated!")



Results file successfully generated!
