# Project 0

## Import Libraries

In [93]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.gaussian_process import GaussianProcessRegressor

import torch

## Load Data

In [94]:
# Read both CSVs, using the first row as the header and the first column
# as the row index.
data_train = pd.read_csv('Data/train.csv', index_col=0, header=0)
data_test = pd.read_csv('Data/test.csv', index_col=0, header=0)

# In the training table column 0 is the target and the remaining columns
# are the features; the test table is taken as features in its entirety.
x_train = data_train.iloc[:, 1:].to_numpy()
y_train = data_train.iloc[:, 0].to_numpy().reshape(-1, 1)
x_test = data_test.to_numpy()

## Gaussian Process Regression

In [95]:
# One scaler per array so predictions can later be mapped back to the
# original target units via scaler_y.
scaler_x = StandardScaler()
scaler_y = StandardScaler()

# Features: the scaler is fitted on the training set only and then reused,
# unchanged, on the test set.
x_train = scaler_x.fit_transform(x_train)
x_test = scaler_x.transform(x_test)

# Target: fitted and transformed on the training labels.
y_train = scaler_y.fit_transform(y_train)

In [107]:
# create model
class GPR_Predictor(object):
    """Clustered Gaussian-process regressor: one GP per K-means cluster.

    ``train`` partitions the training inputs with K-means and fits an
    independent ``GaussianProcessRegressor`` (Matern kernel, ``nu=1.5``) on
    each partition. ``predict`` routes every sample to the regressor of the
    cluster that the fitted K-means model assigns it to.
    """

    def __init__(self, n_clusters=20) -> None:
        """Create the unfitted clustering model.

        Args:
            n_clusters: Number of K-means clusters, and therefore the number
                of per-cluster regressors that ``train`` will fit.
        """
        self.n_clusters = n_clusters
        self.km = KMeans(n_clusters=self.n_clusters, random_state=0, n_init="auto")
        # regressors[i] is the GP fitted on cluster i; populated by train().
        self.regressors = []

    def train(self, x_train, y_train) -> None:
        """Cluster the training inputs and fit one GP per cluster.

        Calling this again discards the regressors of any previous call, so
        ``regressors[i]`` always corresponds to cluster ``i`` of the most
        recent K-means fit.

        Args:
            x_train: 2-D array of training features, one row per sample.
            y_train: Array of training targets with one entry (or row) per
                sample, aligned with ``x_train``.
        """
        self.km.fit(x_train)
        labels = self.km.labels_

        # Build into a fresh list and swap it in at the end: appending to the
        # existing list would leave stale regressors from an earlier train()
        # call at the indices predict() reads.
        regressors = []
        for cluster_i in range(self.n_clusters):
            in_cluster = labels == cluster_i
            regressor = GaussianProcessRegressor(
                kernel=Matern(length_scale=1, nu=1.5), random_state=0
            )
            regressor.fit(x_train[in_cluster], y_train[in_cluster].reshape((-1, 1)))
            regressors.append(regressor)
        self.regressors = regressors

    def predict(self, x_test) -> np.ndarray:
        """Predict the target for every row of ``x_test``.

        Args:
            x_test: 2-D array of features, one row per sample.

        Returns:
            1-D float array of predicted means, in the same row order as
            ``x_test``.
        """
        x_test_labels = self.km.predict(x_test)
        y_test = np.empty(x_test.shape[0], dtype=float)

        for cluster_i in range(self.n_clusters):
            in_cluster = x_test_labels == cluster_i
            # A cluster may receive no test samples; regressors reject empty
            # input, so skip it instead of calling predict on zero rows.
            if not in_cluster.any():
                continue
            # Only the mean is used, so the predictive std is not requested.
            y_cluster_mean = self.regressors[cluster_i].predict(x_test[in_cluster])
            # Writing through the mask keeps the original row order without
            # carrying explicit row ids around and sorting afterwards.
            y_test[in_cluster] = np.asarray(y_cluster_mean).reshape(-1)

        return y_test

# Fit the clustered GP model on the standardised training data.
gpr = GPR_Predictor()
gpr.train(x_train, y_train)

# Predictions are in standardised target units; inverse_transform expects a
# 2-D column, so reshape before mapping back to the original scale.
y_test = gpr.predict(x_test).reshape((-1, 1))
y_test = scaler_y.inverse_transform(y_test)

# Two-column result table: the test-set row index as "Id", then the
# prediction as "y".
y_test_df = pd.DataFrame({"Id": data_test.index.tolist(), "y": y_test[:, 0]})
y_test_df.head()

Unnamed: 0,Id,y
0,10000,-66.138443
1,10001,452.2424
2,10002,-461.396098
3,10003,39.640058
4,10004,-126.675123


## PyTorch

In [109]:
# Pick the compute device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the training tensors from the raw DataFrame (column 0 is the
# target, the remaining columns are the features). These are the unscaled
# values, not the standardised arrays used in the Gaussian-process section.
x_train = torch.tensor(data_train.iloc[:, 1:].to_numpy())
y_train = torch.tensor(data_train.iloc[:, 0].to_numpy()).reshape((-1, 1))

# NOTE(review): the tensors are created on the default device and are not
# moved to `device` here - confirm where that transfer is meant to happen.

# The cell previously ended in an unfinished `torch.` expression, which is a
# syntax error; echo the selected device instead.
device

device(type='cuda')