In [37]:
# First, we import necessary libraries:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


In [38]:
def seasons_to_val(s):
    """Map a season name to its integer code.

    "spring" -> 1, "summer" -> 2, "autumn" -> 3, "winter" -> 4.
    Any other value (including differently-cased names) yields 0.
    """
    codes = dict(spring=1, summer=2, autumn=3, winter=4)
    if s in codes:
        return codes[s]
    return 0


In [39]:
def data_loading(KNN,
                 train_path="/home/otps3141/Documents/Git Hub/IML-2023/P2/train.csv",
                 test_path="/home/otps3141/Documents/Git Hub/IML-2023/P2/test.csv"):
    """
    Load the training and test CSV files, encode the season column numerically,
    impute missing feature values with a KNN imputer and drop the training rows
    whose label is missing.

    Parameters
    ----------
    KNN: int, number of neighbours used by the KNNImputer
    train_path: str, location of the training CSV (must contain a 'price_CHF' column)
    test_path: str, location of the test CSV

    Returns
    ----------
    X_train: matrix of floats, training input with features (rows with a label only)
    y_train: array of floats, training output with labels (no NaN)
    X_test: matrix of floats: dim = (100, ?), test input with features
    n_labeled: integer count of training rows that had a label
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Split the training frame into features and label; the test frame has features only.
    X_train = train_df.drop(['price_CHF'], axis=1).values
    y_train = train_df['price_CHF'].values
    X_test = test_df.values

    # Column 0 is run through seasons_to_val, which returns 0 for anything that
    # is not one of the four season names.
    X_train[:, 0] = np.array([seasons_to_val(s) for s in X_train[:, 0]])
    X_test[:, 0] = np.array([seasons_to_val(s) for s in X_test[:, 0]])

    # Make the numeric conversion explicit instead of handing an object-dtype
    # array to the imputer and relying on it to coerce.
    X_train = X_train.astype(float)
    X_test = X_test.astype(float)

    # Impute missing feature values. The imputer is fitted on ALL training rows
    # (also those whose label is missing) and then applied to the test set.
    # IterativeImputer(random_state=0) was the alternative tried here.
    imp = KNNImputer(n_neighbors=KNN, weights="distance")
    X_train = imp.fit_transform(X_train)
    X_test = imp.transform(X_test)

    # Rows without a label are removed rather than imputed
    # (imputing the labels performed worse).
    mask = ~np.isnan(y_train)
    X_train = X_train[mask]
    y_train = y_train[mask]

    # Named n_labeled (not `sum`) so the builtin is not shadowed.
    n_labeled = np.sum(mask)

    assert (X_train.shape[1] == X_test.shape[1]) and (
        X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test, n_labeled

In [40]:
def _fit_kfold(kf_splitter, X_test, X_train, y_train, model):
    """
    Cross-validate `model` and predict `X_test` with the fit of the best fold.

    For every split produced by `kf_splitter`, a fresh copy of `model` is fitted
    on the training part and scored (R^2) on the held-out part. The copy with the
    highest held-out score is then used to predict `X_test`.

    Parameters
    ----------
    kf_splitter: splitter exposing get_n_splits() and split(X) (e.g. a KFold)
    X_test: matrix of floats, inputs to predict
    X_train: matrix of floats, training inputs
    y_train: array of floats, training labels
    model: estimator used as a template; it is cloned per fold, so the object
        passed in is left unfitted

    Returns
    ----------
    y_pred: array of floats, predictions of the best fold model on X_test
    error_matrix: array of floats, dim = (n_splits,), held-out R^2 score per fold
    """
    # One R^2 score per fold (higher is better, despite the historical name).
    error_matrix = np.zeros(kf_splitter.get_n_splits())

    # One independently fitted estimator per fold.
    models = []

    for i, (train_index, test_index) in enumerate(kf_splitter.split(X_train)):
        # A new estimator is required for each fold: appending the same object
        # every time would leave all list entries pointing at the last fit,
        # which makes the best-fold selection below meaningless.
        fold_model = clone(model)
        fold_model.fit(X_train[train_index], y_train[train_index])

        y_pred_folds = fold_model.predict(X_train[test_index])

        models.append(fold_model)
        error_matrix[i] = calculate_R2(y_pred_folds, y_train[test_index])

    best_index = np.argmax(error_matrix)
    best_fit = models[best_index]

    y_pred = best_fit.predict(X_test)

    return y_pred, error_matrix

In [41]:
def calculate_R2(y_pred, y):
    """Return the R^2 score of the predictions `y_pred` against the targets `y`.

    Note the argument order: predictions first, true values second.
    """
    score = r2_score(y_true=y, y_pred=y_pred)
    assert np.isscalar(score)
    return score

In [42]:
def modeling_and_prediction(X_train, y_train, X_test, N_folds):
    """
    Build a Gaussian-process regressor, cross-validate it with K folds and
    predict the test set with the fit of the best-scoring fold.

    Parameters
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output
    X_test: matrix of floats, test input with features
    N_folds: int, number of folds passed to KFold

    Returns
    ----------
    y_pred: array of floats: dim = (X_test.shape[0],), predictions on test set
    error_matrix: array of floats: dim = (N_folds,), held-out R^2 score per fold
    """
    kf = KFold(N_folds)

    # Rational-quadratic kernel; alpha is the value added to the kernel diagonal.
    gauss = GaussianProcessRegressor(kernel=RationalQuadratic(), random_state=0, alpha=1)

    y_pred, error_matrix = _fit_kfold(kf, X_test, X_train, y_train, gauss)

    # One prediction per test row (100 rows for the project's test set).
    assert y_pred.shape == (X_test.shape[0],), "Invalid data shape"
    return y_pred, error_matrix

In [43]:
# Data loading
# The fourth return value is the number of training rows that had a label.
# It is bound to `n_labeled` instead of `sum` so that the builtin sum() stays
# usable in every cell executed after this one.
X_train, y_train, X_test, n_labeled = data_loading(KNN=4)
# Cross-validate the Gaussian-process model (8 folds) and predict the test set
y_pred, error_matrix = modeling_and_prediction(X_train, y_train, X_test, 8)
# Save results in the required format
dt = pd.DataFrame(y_pred)
dt.columns = ['price_CHF']
dt.to_csv('/home/otps3141/Documents/Git Hub/IML-2023/P2/results.csv', index=False)
print("\nResults file successfully generated!")

# NOTE(review): 900 is presumably the total number of rows in train.csv, which
# would make this the number of rows dropped for a missing label - confirm.
print(900 - n_labeled)




Results file successfully generated!
269


In [44]:
error_matrix

array([0.96830965, 0.93964835, 0.97473372, 0.85271256, 0.96910919,
       0.93146688, 0.88290832, 0.95371487])