# Project 0

## Import Libraries

In [15]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.cluster import KMeans
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression

import torch
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu
from torch.optim import Adam, SGD

## Load Data

In [16]:
# Read the train and test tables; the first CSV column is used as the row index.
data_train, data_test = (
    pd.read_csv(csv_path, header=0, index_col=0)
    for csv_path in ('Data/train.csv', 'Data/test.csv')
)

## Linear Regression

In [17]:
# preprocess data: column 0 of the training table becomes the (n, 1) target,
# the remaining columns become the feature matrix
y_train = data_train.iloc[:, 0].to_numpy().reshape((-1, 1))
x_train = data_train.iloc[:, 1:].to_numpy()
x_test = data_test.to_numpy()

# scaler_x, scaler_y = StandardScaler(), StandardScaler()
# x_train = scaler_x.fit_transform(x_train)
# y_train = scaler_y.fit_transform(y_train)
# x_test = scaler_x.transform(x_test)

In [20]:
# create model
class LR_Predictor(object):
    """Thin train/predict wrapper around scikit-learn's ``LinearRegression``."""

    def __init__(self) -> None:
        """Create the underlying regressor with its default settings."""
        self.regressor = LinearRegression()

    def train(self, x_train, y_train) -> None:
        """Fit the regressor.

        Args:
            x_train: training features, passed straight to ``fit``.
            y_train: training targets, passed straight to ``fit``.
        """
        self.regressor.fit(x_train, y_train)

    def predict(self, x_test) -> np.ndarray:
        """Return the fitted regressor's predictions for ``x_test``.

        Args:
            x_test: features to predict on, passed straight to ``predict``.

        Returns:
            Whatever ``LinearRegression.predict`` returns for ``x_test``.
        """
        return self.regressor.predict(x_test)

# Fit on the unscaled training data and predict the test targets as an (n, 1) column.
rp = LR_Predictor()
rp.train(x_train, y_train)
y_test = rp.predict(x_test).reshape((-1, 1))

# Build the result table with the test-set index as "Id" followed by the prediction "y".
y_test_df = pd.DataFrame({"Id": data_test.index.tolist(), "y": y_test[:, 0]})
y_test_df.head()

Unnamed: 0,Id,y
0,10000,-66.002423
1,10001,451.406504
2,10002,-461.676417
3,10003,40.501209
4,10004,-126.744722


In [19]:
# Export whatever y_test_df currently holds (the linear-regression table when the
# notebook is run top to bottom) to ./Result/result_1.csv.
# NOTE(review): the ./Result directory is assumed to exist already — confirm.
y_test_df.to_csv("./Result/result_1.csv", index=None)

## Ridge Regression

In [10]:
# preprocess data: features are every column after the first, the target is the first column
target_column = data_train.iloc[:, 0]
feature_columns = data_train.iloc[:, 1:]
x_train, y_train = feature_columns.to_numpy(), target_column.to_numpy().reshape((-1, 1))
x_test = data_test.to_numpy()

# scaler_x, scaler_y = StandardScaler(), StandardScaler()
# x_train = scaler_x.fit_transform(x_train)
# y_train = scaler_y.fit_transform(y_train)
# x_test = scaler_x.transform(x_test)

In [11]:
# create model
class Ridge_Predictor(object):
    """Thin train/predict wrapper around ``KernelRidge`` with a linear kernel (alpha=1.0)."""

    def __init__(self) -> None:
        """Create the underlying kernel-ridge regressor."""
        self.regressor = KernelRidge(alpha=1.0, kernel="linear")

    def train(self, x_train, y_train) -> None:
        """Fit the regressor.

        Args:
            x_train: training features, passed straight to ``fit``.
            y_train: training targets, passed straight to ``fit``.
        """
        self.regressor.fit(x_train, y_train)

    def predict(self, x_test) -> np.ndarray:
        """Return the fitted regressor's predictions for ``x_test``.

        Args:
            x_test: features to predict on, passed straight to ``predict``.

        Returns:
            Whatever ``KernelRidge.predict`` returns for ``x_test``.
        """
        return self.regressor.predict(x_test)

# Fit the ridge model on the unscaled data; predictions are reshaped to an (n, 1) column.
rp = Ridge_Predictor()
rp.train(x_train, y_train)
y_test = rp.predict(x_test).reshape((-1, 1))

# Result table: prediction column first, then "Id" inserted in front of it.
y_test_df = pd.DataFrame(y_test, columns=["y"])
y_test_df.insert(0, "Id", data_test.index.tolist())
y_test_df.head()

In [6]:
# Export the current y_test_df to CSV.
# NOTE(review): this is the same path the linear-regression section writes to, so
# running both cells overwrites the earlier file — confirm that is intended.
y_test_df.to_csv("./Result/result_1.csv", index=None)

## Gaussian Process Regression

In [10]:
# preprocess data: column 0 is the target, the remaining columns are the features
x_train = data_train.iloc[:, 1:].to_numpy()
y_train = data_train.iloc[:, 0].to_numpy().reshape((-1, 1))
x_test = data_test.to_numpy()

# Standardise features and target; the feature scaler is fitted on the training
# features only and then re-used for the test features.
scaler_x = StandardScaler()
scaler_y = StandardScaler()
x_train = scaler_x.fit_transform(x_train)
x_test = scaler_x.transform(x_test)
y_train = scaler_y.fit_transform(y_train)

In [11]:
# create model
class GPR_Predictor(object):
    """Cluster-wise Gaussian-process regressor.

    The inputs are partitioned with K-means and an independent
    ``GaussianProcessRegressor`` (Matern kernel, nu=1.5) is fitted on each
    cluster. A test row is predicted by the regressor belonging to the
    cluster K-means assigns it to.
    """

    def __init__(self, n_clusters=20) -> None:
        """Create the K-means partitioner; the regressors are built in ``train``.

        Args:
            n_clusters: number of K-means clusters, and therefore of regressors.
        """
        self.n_clusters = n_clusters
        self.km = KMeans(n_clusters=self.n_clusters, random_state=0, n_init="auto")
        self.regressors = []

    def train(self, x_train, y_train) -> None:
        """Fit K-means on ``x_train`` and one regressor per resulting cluster.

        Args:
            x_train: training features; rows are selected with a boolean mask,
                so this is presumably a 2-D NumPy array — confirm against caller.
            y_train: training targets, row-aligned with ``x_train``.
        """
        self.km.fit(x_train)
        labels = self.km.labels_

        # Build a fresh list and swap it in at the end: appending to the old
        # list would leave regressors from a previous train() call at the
        # indices 0..n_clusters-1 that predict() reads.
        regressors = []
        for cluster_i in range(self.n_clusters):
            in_cluster = labels == cluster_i
            regressor = GaussianProcessRegressor(kernel=Matern(length_scale=1, nu=1.5), random_state=0)
            regressor.fit(x_train[in_cluster], y_train[in_cluster].reshape((-1, 1)))
            regressors.append(regressor)
        self.regressors = regressors

    def predict(self, x_test) -> np.ndarray:
        """Predict each row of ``x_test`` with the regressor of its cluster.

        Args:
            x_test: features to predict on, with the same layout as the
                training features.

        Returns:
            1-D float array of predicted means, in the row order of ``x_test``.
        """
        x_test_labels = self.km.predict(x_test)

        # Every row receives exactly one label in [0, n_clusters), so each slot
        # of the output is written exactly once by the loop below.
        y_test = np.empty(x_test.shape[0], dtype=float)

        for cluster_i in range(self.n_clusters):
            in_cluster = x_test_labels == cluster_i
            if not in_cluster.any():
                # No test row landed in this cluster; the regressor would
                # reject a zero-row input, so skip it.
                continue
            # Only the mean is needed, so the standard deviation is not requested.
            y_cluster_mean = self.regressors[cluster_i].predict(x_test[in_cluster])
            # Writing through the mask keeps the original row order without sorting.
            y_test[in_cluster] = np.asarray(y_cluster_mean).reshape(-1)

        return y_test

# Fit the cluster-wise GP model on the standardised data.
gpr = GPR_Predictor()
gpr.train(x_train, y_train)

# Predictions come back in standardised target units; map them to the original scale.
y_test_scaled = gpr.predict(x_test).reshape((-1, 1))
y_test = scaler_y.inverse_transform(y_test_scaled)

# Result table with the test-set index as "Id" followed by the prediction "y".
y_test_df = pd.DataFrame({"Id": data_test.index.tolist(), "y": y_test[:, 0]})
y_test_df.head()

Unnamed: 0,Id,y
0,10000,-66.138443
1,10001,452.2424
2,10002,-461.396098
3,10003,39.640058
4,10004,-126.675123


## PyTorch

In [3]:
# train
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# preprocess data: column 0 is the target, the remaining columns are the features
x_train = data_train.iloc[:, 1:].to_numpy()
y_train = data_train.iloc[:, 0].to_numpy().reshape((-1, 1))
x_test = data_test.to_numpy()

# Standardise features and target; the feature scaler is fitted on the training features only.
scaler_x = StandardScaler()
scaler_y = StandardScaler()
x_train = scaler_x.fit_transform(x_train)
x_test = scaler_x.transform(x_test)
y_train = scaler_y.fit_transform(y_train)

# Hold out part of the training data for validation (shuffled split, default proportions).
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, shuffle=True)

# Move every array to the chosen device as a float32 tensor.
x_train, y_train, x_valid, y_valid, x_test = (
    torch.from_numpy(array).to(device).float()
    for array in (x_train, y_train, x_valid, y_valid, x_test)
)

class NN_Model(Module):
    """Fully connected regressor: 10 -> 256 -> 64 -> 1 with tanh activations.

    Two dropout layers are registered on the module but are not applied in
    ``forward``.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Build the layers on the module-level ``device``."""
        super().__init__(*args, **kwargs)
        self.tanh = tanh
        self.nn1 = Linear(in_features=10, out_features=256, device=device)
        self.dropout1 = Dropout(p=0.5)
        self.nn2 = Linear(in_features=256, out_features=64, device=device)
        self.dropout2 = Dropout(p=0.5)
        self.nn3 = Linear(in_features=64, out_features=1, device=device)

    def forward(self, x):
        """Run ``x`` through the two tanh hidden layers and the linear output layer."""
        hidden = self.tanh(self.nn1(x))
        hidden = self.tanh(self.nn2(hidden))
        return self.nn3(hidden)

model = NN_Model()
optimizer = Adam(model.parameters(), lr=0.01)

for epoch in range(150):
    # One full-batch gradient step on the training split.
    # train()/eval() make no difference to the current forward pass (its dropout
    # calls are commented out) but keep the loop correct if they are re-enabled.
    model.train()
    optimizer.zero_grad()
    output = model(x_train)
    loss = mse_loss(output, y_train)

    loss.backward()
    optimizer.step()

    # valid: no gradients are needed here, so skip building the autograd graph
    model.eval()
    with torch.no_grad():
        valid_loss = mse_loss(model(x_valid), y_valid)

    # .item() turns the 0-dim loss tensors into Python floats before formatting.
    print("The epoch is {}, the loss is {:.03f}, the loss in validation data is {:.03f}".format(epoch, loss.item(), valid_loss.item()))

The epoch is 0, the loss is 1.069, the loss in validation data is 0.089
The epoch is 1, the loss is 0.092, the loss in validation data is 0.232
The epoch is 2, the loss is 0.227, the loss in validation data is 0.147
The epoch is 3, the loss is 0.145, the loss in validation data is 0.039
The epoch is 4, the loss is 0.041, the loss in validation data is 0.080
The epoch is 5, the loss is 0.082, the loss in validation data is 0.142
The epoch is 6, the loss is 0.145, the loss in validation data is 0.133
The epoch is 7, the loss is 0.136, the loss in validation data is 0.075
The epoch is 8, the loss is 0.079, the loss in validation data is 0.033
The epoch is 9, the loss is 0.037, the loss in validation data is 0.042
The epoch is 10, the loss is 0.044, the loss in validation data is 0.078
The epoch is 11, the loss is 0.079, the loss in validation data is 0.089
The epoch is 12, the loss is 0.090, the loss in validation data is 0.067
The epoch is 13, the loss is 0.068, the loss in validation da

In [4]:
# Run the network on the test set, bring the result back to the CPU as a NumPy
# array, and undo the target standardisation.
y_test = scaler_y.inverse_transform(model(x_test).detach().to("cpu").numpy())

# Result table with the test-set index as "Id" followed by the prediction "y".
y_test_df = pd.DataFrame({"Id": data_test.index.tolist(), "y": y_test[:, 0]})
y_test_df.head()

Unnamed: 0,Id,y
0,10000,-65.592834
1,10001,460.452759
2,10002,-474.305145
3,10003,51.545864
4,10004,-126.654381


In [5]:
# Export the current y_test_df to CSV.
# NOTE(review): this is the same path the linear- and ridge-regression sections
# write to, so the file ends up holding whichever export ran last — confirm intended.
y_test_df.to_csv("./Result/result_1.csv", index=None)