# Task 2
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the jupyter notebook version of the template. For the python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [2]:
import inspect

import numpy as np
import pandas as pd
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler
from sklearn.base import clone
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic, Sum, Product, WhiteKernel, ExpSineSquared, ConstantKernel
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Add any other imports you need here

# Data Loading
TODO: Perform data preprocessing, imputation and extract X_train, y_train and X_test
(and potentially change initialization of variables to accommodate how you deal with non-numeric data)

In [3]:
"""
Load the training and test data, one-hot encode the `season` column and
fill in the missing numeric values by imputation (MICE and KNN variants).

Parameters
----------
Compute
----------
X_train: matrix of floats, training input with features
y_train: array of floats, training output with labels
X_test: matrix of floats: dim = (100, ?), test input with features
"""
# Load training and test data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# --- One-Hot Encoding of the `season` column ---
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4; try the current spelling first and fall back for older
# releases so the cell runs on both.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(train_df[['season']]),
    index=train_df.index,
    columns=OH_encoder.get_feature_names_out(['season']),
)
# concatenate the new OneHotEncoded columns to the old ones
leftover_cols_train = train_df.drop(['season'], axis=1)
OH_X_train = pd.concat([OH_cols_train, leftover_cols_train], axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)

# Encode the test set with the encoder fitted on the training set (transform,
# not fit_transform) so both frames get the same one-hot columns in the same
# order; categories unseen during fitting become all-zero rows because of
# handle_unknown='ignore'.
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(test_df[['season']]),
    index=test_df.index,
    columns=OH_encoder.get_feature_names_out(['season']),
)
# concatenate the new OneHotEncoded columns to the old ones
leftover_cols_test = test_df.drop(['season'], axis=1)
OH_X_test = pd.concat([OH_cols_test, leftover_cols_test], axis=1)
OH_X_test.columns = OH_X_test.columns.astype(str)

# Uncomment to run the imputation below on the one-hot encoded frames:
# train_df = OH_X_train
# test_df = OH_X_test

# Only numeric columns are imputed; select them once and reuse.
train_numeric = train_df.select_dtypes(include=[np.number])
test_numeric = test_df.select_dtypes(include=[np.number])

# --- MICE ---
# NOTE(review): both imputers below are refitted on the test frame. The
# training frame carries the extra 'price_CHF' column, so an imputer fitted
# on it presumably cannot be applied to the test columns as-is -- confirm
# before switching to a fit-on-train / transform-on-test scheme.
imp = IterativeImputer(max_iter=10, random_state=0)
imputed_train_df = pd.DataFrame(imp.fit_transform(train_numeric), columns=train_numeric.columns)
imputed_test_df = pd.DataFrame(imp.fit_transform(test_numeric), columns=test_numeric.columns)

# --- KNN ---
KNN_imp = KNNImputer(n_neighbors=3, weights="uniform")
KNN_imputed_train_df = pd.DataFrame(KNN_imp.fit_transform(train_numeric), columns=train_numeric.columns)
KNN_imputed_test_df = pd.DataFrame(KNN_imp.fit_transform(test_numeric), columns=test_numeric.columns)

"""
Transfer Imputed Data into Numpy
"""
# TODO: Perform data preprocessing, imputation and extract X_train, y_train and X_test
def extract_params(imputed_traindata_df, imputed_testdata_df):
    """
    Split the imputed data frames into numpy arrays for modelling.

    As a side effect, prints the name under which the training frame is bound
    in the caller's scope, to record which imputed data set is in use. If the
    frame is not bound to any variable there (e.g. it was passed as an
    expression), "<unnamed>" is printed instead of raising.

    Parameters
    ----------
    imputed_traindata_df: data frame, must contain a 'price_CHF' column
    imputed_testdata_df: data frame, test input

    Returns
    ----------
    y_train: array, the 'price_CHF' column of the training frame
    X_train: matrix, the remaining columns of the training frame
    X_test: matrix, all columns of the test frame
    """
    frame = inspect.currentframe()
    caller = None
    try:
        # currentframe() may return None on interpreters without frame support
        caller = frame.f_back if frame is not None else None
        caller_vars = list(caller.f_locals.items()) if caller is not None else []
        # Identity comparison: find the variable that refers to this very object.
        # The default keeps the lookup from raising StopIteration when no
        # variable in the caller refers to the frame.
        df_name = next(
            (name for name, val in caller_vars if val is imputed_traindata_df),
            "<unnamed>",
        )
        print("Imputed Data used: ", df_name)
    finally:
        del frame, caller  # To avoid reference cycles

    y_train = imputed_traindata_df['price_CHF'].to_numpy()  # Convert 'price_CHF' column to numpy array
    X_train = imputed_traindata_df.drop('price_CHF', axis=1).to_numpy()  # Convert remaining columns to numpy array
    X_test = imputed_testdata_df.to_numpy()

    return y_train, X_train, X_test

# Build the modelling arrays from the KNN-imputed frames and report their shapes.
y_train, X_train, X_test = extract_params(KNN_imputed_train_df, KNN_imputed_test_df)
for label, arr in (("X_train:", X_train), ("y_train:", y_train), ("X_test:", X_test)):
    print(label, arr.shape)
print("\n")

# Sanity checks: train and test share the feature count, every training row
# has a label, and the test set has the expected 100 rows.
shapes_ok = (
    X_train.shape[1] == X_test.shape[1]
    and X_train.shape[0] == y_train.shape[0]
    and X_test.shape[0] == 100
)
assert shapes_ok, "Invalid data shape"



Imputed Data used:  KNN_imputed_train_df
X_train: (900, 9)
y_train: (900,)
X_test: (100, 9)




# Modeling and Prediction
TODO: Define the model and fit it using training data. Then, use test data to make predictions

In [4]:
"""
Build the Gaussian-process model, train it on the full training set and
predict the prices for the test set.

Parameters
----------
X_train: matrix of floats, training input with features
y_train: array of floats, training output
X_test: matrix of floats: dim = (100, ?), test input with features

Compute
----------
y_pred: array of floats: dim = (100,), predictions on test set
"""
# Kernel 5 of the comparison table in the cross-validation cell:
# a Matern kernel summed with a WhiteKernel noise term.
selected_kernel = Sum(Matern(), WhiteKernel())
gpr = GaussianProcessRegressor(kernel=selected_kernel)
gpr.fit(X_train, y_train)

y_pred = gpr.predict(X_test)
print("y_pred", y_pred.shape)
print(y_pred)

expected_shape = (100,)
assert y_pred.shape == expected_shape, "Invalid data shape"

y_pred (100,)
[-2.52808123 -2.73060194 -2.66955418 -2.71576643 -2.56193499 -2.22947086
 -2.04363065 -1.76982229 -0.37267193  0.9266033   1.52843026  2.46263484
  2.97648304  3.19005771  2.93103462  2.54489195  1.53548829  1.5127329
  2.1845835   2.33201403  3.75041505  3.98009082  3.15427389  2.20402382
  2.69967618  4.3817948   5.75234016  7.79233235  8.59337578  9.2850982
  8.73157607  7.67758547  8.03912667  8.09531512  7.69546588  7.47845155
  7.18047493  7.48433393  8.16370692  7.603452    7.97696686  7.74454684
  7.78229465  7.83500889  7.46942541  7.97006217  7.22187428  7.48453291
  7.81676517  7.68819536  8.44763528  8.38401857  8.09730763  8.21630324
  8.55629813  9.37969214  8.1790876   8.07533615  7.09457336  6.69955939
  6.32452358  5.41918679  5.30384308  5.11353697  4.89559246  4.85831463
  4.7886502   4.54011     4.60350929  4.75552585  4.43496114  4.97558614
  4.97768873  4.90007911  6.37124295  6.302576    7.27101148  7.64634086
  7.9845162   7.96364045  8.49727621  8

# Cross-validation for selection of best Kernels

In [5]:
## DIFFERENT KERNELS TESTED

# MICE i = 10 | MICE i = 20 | KNN = 2 | KNN = 3 | MICE i = 10 HOE | MICE i = 20 HOE | KNN = 2 HOE | KNN = 3 HOE -->> accuracy on submission 
#kernel 1 = DotProduct                                                 - 0.999 | 0.833 | 0.827 | 0.831 | 0.999 | 0.999 |    -  |    -  ->> 0.079 
#kernel 2 = RBF                                                        - 0.874 | -4.95 | -4.89 | -4.93 | 0.876 | 0.876 | -4.87 | -4.89 ->> 0.656 
#kernel 3 = Matern                                                     - 0.907 | 0.949 | 0.946 | 0.948 | 0.895 | 0.895 | 0.953 | 0.951 ->> 0.780 
#kernel 4 = Rat. Quad.                                                 - 0.896 | 0.957 | 0.953 | 0.956 | 0.891 | 0.891 | 0.958 | 0.954 ->> 0.919
#kernel 5 = Sum(Matern(),WhiteKernel())                                - 0.911 | 0.970 | 0.969 | 0.973 | 0.909 | 0.909 | 0.971 | 0.971 ->>
# submission:                                                                          | 0.9830| 0.9831|               | 0.9746| 0.9676
#kernel 6 = Sum(RationalQuadratic(),WhiteKernel())                     - 0.906 | 0.974 | 0.964 | 0.969 | 0.909 | 0.909 | 0.970 | 0.970 ->>     
# submission:                                                                          | 0.9826| 0.9830|               | 0.9733| 0.9676
#kernel 7 = Sum(Product(ConstantKernel(1.0), RBF()), WhiteKernel())    - 0.999 | 0.977 | 0.970 | 0.976 | 0.999 | 0.999 | 0.960 | 0.966 ->>
# submission:                                                                          | 0.9867| 0.9862|               | 0.9770| 0.9719
#kernel 8 = Sum(Product(RationalQuadratic(), Matern()), WhiteKernel()) - 0.909 | 0.973 | 0.972 | 0.972 | 0.909 | 0.909 | 0.972 | 0.970 ->>
# submission:                                                                          | 0.9815| 0.9823|               | 0.9745| 0.9674
"""
Comments:
- risk of overfitting with product of kernel vs sum of kernels
- WhiteKernel add noise, helping with data that isn't smooth. Summing this kernel with any other improves its K-fold r2score.
- 0.999 scores are sketchy. Might submit one to see how it produces.
- OneHot Encoder doesn't seem to improve MICE r2 score. It improves some of the KNN r2 scores.
- kernels 5-8 seem most promising. Also important to observe each K-fold r2 score. Some have good overall score but one K-Fold r2 score < hard baseline
- for submission, either ker5,6,8. I don't trust 7.
- best so far: ker=5,6,8 for KNN = 3 (HOE optional)
"""
#gpr2 = GaussianProcessRegressor(kernel=Sum(Product(RationalQuadratic(), Matern()), WhiteKernel()))
# pipeline = make_pipeline(StandardScaler(), GaussianProcessRegressor(kernel=RBF()))

# Normalising data - unsure how to scale it back up after. Need to find way to rescale it for appropriate comparison to the submission.
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


# K-fold cross-validation of the model configuration held in `gpr`:
# prints the r2 score of every fold and the mean over all folds.
n_folds = 9
kf = KFold(n_folds)
fold_scores = []
for i, (train_data, test_data) in enumerate(kf.split(X_train), start=1):
    X_train_KF, X_test_KF = X_train[train_data], X_train[test_data]
    y_train_KF, y_test_KF = y_train[train_data], y_train[test_data]

    # Fit an unfitted copy per fold so that `gpr` itself stays trained on the
    # full training set and is not silently replaced by a model that only saw
    # the last fold's subset.
    gpr_KF = clone(gpr)
    gpr_KF.fit(X_train_KF, y_train_KF)
    y_pred_KF = gpr_KF.predict(X_test_KF)

    r2 = r2_score(y_test_KF, y_pred_KF)
    fold_scores.append(r2)
    print(" KF: {} | score: {}".format(i, r2))

score = sum(fold_scores) / n_folds
print("P2 score is ", score)

 KF: 1 | score: 0.9908154213227873
 KF: 2 | score: 0.9639011494380465
 KF: 3 | score: 0.9891067249112022
 KF: 4 | score: 0.9606349178194932
 KF: 5 | score: 0.9724572862771002
 KF: 6 | score: 0.9757686107236504
 KF: 7 | score: 0.9821424166968546
 KF: 8 | score: 0.9411906073327535
 KF: 9 | score: 0.9842489492877704
P2 score is  0.9733628982010732


# Saving Results
You don't have to change this

In [6]:
# Write the test-set predictions to the submission file as a single
# 'price_CHF' column, without the index column.
dt = pd.DataFrame(y_pred, columns=['price_CHF'])
dt.to_csv('results.csv', index=False)
print("\nResults file successfully generated!")


Results file successfully generated!
