In [None]:
%matplotlib inline

## Dataset download ##

### Import ###

In [None]:
%matplotlib inline


import hashlib
import os
import tarfile
import zipfile
import requests

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

### Download source config ###

In [None]:
# Registry mapping a dataset name to its (url, sha1_hash) pair; filled in by later cells.
DATA_HUB = {}
# Common URL prefix that the registered dataset file names are appended to.
DATA_URL = "http://d2l-data.s3-accelerate.amazonaws.com/"

### Download ###

In [None]:
def _sha1_of_file(path, chunk_size=1048576):
    """Return the hex SHA-1 digest of the file at `path`, read in chunks."""
    sha1 = hashlib.sha1()
    with open(path, "rb") as f:
        for data in iter(lambda: f.read(chunk_size), b""):
            sha1.update(data)
    return sha1.hexdigest()


def download(name, cache_dir=os.path.join(".", "data")):
    """Download the file registered under `name` in DATA_HUB and return its local path.

    A cached copy in `cache_dir` is reused when its SHA-1 matches the
    registered hash; otherwise the file is downloaded (again).

    Args:
        name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory for the downloaded file (created if missing).

    Returns:
        Path of the local file, ``cache_dir/<last component of url>``.

    Raises:
        AssertionError: If `name` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the downloaded content does not match the
            registered SHA-1 hash.
    """
    assert name in DATA_HUB, f"{name} is not existed in {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split("/")[-1])
    if os.path.exists(fname) and _sha1_of_file(fname) == sha1_hash:
        return fname  # Cache hit: the file on disk is intact.
    print(f"Downloading {fname} from {url}")
    # Write to a temporary name first so an interrupted or corrupt download
    # never ends up at the final cache path.
    tmp_name = fname + ".part"
    with requests.get(url, stream=True, verify=True, timeout=30) as r:
        # Fail loudly instead of saving an HTTP error page as the dataset.
        r.raise_for_status()
        with open(tmp_name, "wb") as f:
            # Stream in chunks rather than buffering the whole body in memory.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    actual_hash = _sha1_of_file(tmp_name)
    if actual_hash != sha1_hash:
        os.remove(tmp_name)
        raise RuntimeError(
            f"SHA-1 mismatch for {url}: expected {sha1_hash}, got {actual_hash}"
        )
    os.replace(tmp_name, fname)
    return fname

### Add download source ###

In [None]:
# Register the two Kaggle house-price CSV files as (url, sha1_hash) pairs.
DATA_HUB.update(
    {
        "kaggle_house_train": (
            DATA_URL + "kaggle_house_pred_train.csv",
            "585e9cc93e70b39160e7921475f9bcd7d31219ce",
        ),
        "kaggle_house_test": (
            DATA_URL + "kaggle_house_pred_test.csv",
            "fa19780a7b011d9b009e8bff8e99922a8ee2eb90",
        ),
    }
)

## Data Preprocessing ##

### Load dataset ###

In [None]:
# Fetch each CSV (or reuse the cached copy) and parse it into a DataFrame.
train_data = pd.read_csv(download(name="kaggle_house_train"))
test_data = pd.read_csv(download(name="kaggle_house_test"))

# Stack the train rows on top of the test rows so both get identical
# preprocessing. Column 0 is dropped from both frames and the last column is
# dropped from the training frame (presumably the id and the label column;
# verify against the CSV headers).
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]],
    axis=0,
)

### Standardize numeric features and fill NaN ###

In [None]:
# Select numeric columns by dtype family rather than by "not object", so that
# string/categorical columns stored with a non-object dtype are not treated
# as numeric.
numeric_features = all_features.select_dtypes(include="number").columns

# Standardize each numeric column to zero mean and unit (sample) standard
# deviation, then fill missing values with 0, i.e. the column mean after
# standardization. A constant column yields 0/0 = NaN and is also filled with 0.
numeric_block = all_features[numeric_features]
all_features[numeric_features] = (
    (numeric_block - numeric_block.mean()) / numeric_block.std()
).fillna(0)

### One-hot encoding ###

In [None]:
# Expand the categorical columns into indicator columns; dummy_na=True also
# adds an indicator column for missing values.
all_features = pd.get_dummies(data=all_features, dummy_na=True)

### To Tensor ###

In [None]:
# The first n_train rows of all_features came from the training CSV, the rest
# from the test CSV, so a positional split recovers the two sets.
n_train = len(train_data)
train_features = torch.tensor(
    all_features.iloc[:n_train].astype("float32").to_numpy(),
    dtype=torch.float32,
)
test_features = torch.tensor(
    all_features.iloc[n_train:].astype("float32").to_numpy(),
    dtype=torch.float32,
)
# Labels become a column vector of shape (n_train, 1).
train_labels = torch.tensor(
    train_data["SalePrice"].astype("float32").to_numpy(),
    dtype=torch.float32,
).unsqueeze(-1)

## Model ##

### Hyper params ###

In [None]:
# Model input width: number of feature columns after preprocessing.
num_inputs = train_features.size(-1)

# Optimizer settings (learning rate and weight decay handed to Adam).
lr = 0.5
wd = 0.5

# Training schedule: number of folds, epochs, and mini-batch size.
k = 10
num_epochs = 200
batch_size = 64

### Define the network ###

In [None]:
# A single linear layer mapping the num_inputs features to one output value.
net = nn.Sequential(nn.Linear(in_features=num_inputs, out_features=1))

### Loss function and optimizer ###

In [None]:
# Mean-squared-error training loss.
loss = nn.MSELoss()
# Adam over all parameters of `net`, with weight decay taken from `wd`.
optimizer = torch.optim.Adam(
    params=net.parameters(),
    lr=lr,
    weight_decay=wd,
)

### Iter ###

In [None]:
def init_iter(data_arrays, batch_size, shuffle):
    """Wrap tensors in a DataLoader that yields mini-batches.

    Args:
        data_arrays: Iterable of tensors that is unpacked into a
            TensorDataset.
        batch_size: Number of samples per mini-batch.
        shuffle: Whether the DataLoader shuffles the samples.

    Returns:
        A ``torch.utils.data.DataLoader`` over the given tensors.
    """
    dataset = torch.utils.data.TensorDataset(*data_arrays)
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )
    return loader

### Log RMSE ###

In [None]:
def log_rmse(net, features, labels, loss=loss):
    """Return sqrt(loss(log(prediction), log(labels))) as a Python float.

    Predictions are clamped to a minimum of 1 before the logarithm so that
    the log is always defined and never negative.

    Args:
        net: Callable model; ``net(features)`` produces the predictions.
        features: Input tensor passed to `net`.
        labels: Target tensor; its logarithm is taken as-is.
        loss: Loss applied to the two log tensors. Defaults to the
            module-level `loss`.

    Returns:
        The metric value as a float.
    """
    # Pure evaluation: skip autograd bookkeeping so no graph is built.
    with torch.no_grad():
        clipped_preds = torch.clamp(net(features), min=1.0)
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return float(rmse.item())

### Train ###

In [None]:
def train_once(net, train_features, train_labels, test_features, test_labels, optimizer, loss):
    """Run one pass over the training split, then score on the held-out split.

    The mini-batch size is read from the module-level `batch_size`.

    Args:
        net: Model to train (updated in place through `optimizer`).
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs used for scoring.
        test_labels: Held-out targets used for scoring.
        optimizer: Optimizer stepping the parameters of `net`.
        loss: Loss used both for training and inside `log_rmse`.

    Returns:
        The `log_rmse` of `net` on the held-out split after the pass.
    """
    batches = init_iter(
        (train_features, train_labels), batch_size=batch_size, shuffle=True
    )
    for inputs, targets in batches:
        optimizer.zero_grad()
        batch_loss = loss(net(inputs), targets)
        batch_loss.backward()
        optimizer.step()
    held_out_score = log_rmse(net, test_features, test_labels, loss)
    return held_out_score


def get_k_fold_data(k, i, features, labels):
    """Split `features`/`labels` into the training and held-out parts of fold `i`.

    The data is cut along dimension 0 into `k` contiguous folds of
    ``len(features) // k`` samples. The last fold additionally absorbs the
    remainder, so every sample is held out in exactly one fold even when the
    length is not divisible by `k`.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the held-out fold, ``0 <= i < k``.
        features: Tensor whose first dimension indexes samples.
        labels: Tensor with the same first-dimension length as `features`.

    Returns:
        Tuple ``(train_features, train_labels, test_features, test_labels)``.

    Raises:
        AssertionError: If `k` <= 1, `i` is out of range, the lengths differ,
            or there are fewer samples than folds.
    """
    assert k > 1
    assert 0 <= i < k, f"fold index {i} is out of range for k={k}"
    length = len(features)
    assert length == len(labels)
    fold_size = length // k
    assert fold_size > 0, f"cannot split {length} samples into {k} folds"

    start = i * fold_size
    # The last fold runs to the end so the remainder is not left unvalidated.
    stop = length if i == k - 1 else start + fold_size

    test_features = features[start:stop]
    test_labels = labels[start:stop]
    train_features = torch.cat((features[:start], features[stop:]), dim=0)
    train_labels = torch.cat((labels[:start], labels[stop:]), dim=0)
    return train_features, train_labels, test_features, test_labels


def train(net, features, labels, optimizer, loss, k, num_epochs):
    """Train `net` for `num_epochs` epochs, cycling through `k` folds per epoch.

    In every epoch each fold is used once as the held-out split: the model is
    trained for one pass on the remaining data and scored on the fold. The
    per-epoch value is the mean of the `k` fold scores; the running curve is
    re-plotted and the latest value printed after each epoch.

    NOTE(review): the same `net` and `optimizer` are reused for every fold, so
    a fold's held-out data has been trained on while other folds were held
    out. The reported score is therefore not an independent validation
    estimate; confirm this is intended.

    Args:
        net: Model to train (updated in place).
        features: Full training inputs.
        labels: Full training targets.
        optimizer: Optimizer stepping the parameters of `net`.
        loss: Loss function forwarded to `train_once`.
        k: Number of folds.
        num_epochs: Number of epochs.

    Returns:
        List with the mean fold score of each epoch.
    """
    print(f"Training in {k} folds:")
    loss_epoch = []
    for epoch in range(num_epochs):
        # One training pass plus one score per fold, in fold order.
        fold_scores = [
            train_once(
                net,
                *get_k_fold_data(k, i, features, labels),
                optimizer,
                loss,
            )
            for i in range(k)
        ]
        loss_epoch.append(sum(fold_scores) / len(fold_scores))

        # Redraw the curve over all epochs finished so far.
        plt.clf()
        plt.plot(range(1, len(loss_epoch) + 1), loss_epoch, color="blue")
        plt.xlabel("epoch")
        plt.ylabel("log rmse")
        plt.show()
        print(f"Loss in epoch {epoch+1} is {loss_epoch[-1]}")
    return loss_epoch

## Main ##

In [None]:
def main():
    """Run the k-fold training loop on the notebook-level model, data and settings."""
    train(
        net=net,
        features=train_features,
        labels=train_labels,
        optimizer=optimizer,
        loss=loss,
        k=k,
        num_epochs=num_epochs,
    )


# Start training only when this is executed as the main module.
if __name__ == "__main__":
    main()

## Debug ##