## Dataset download ##

### Import ###

In [142]:
%load_ext jupyter_black
%matplotlib inline


import hashlib
import os
import tarfile
import zipfile
import requests


import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


### Download source config ###

In [143]:
DATA_HUB = dict()
DATA_URL = "http://d2l-data.s3-accelerate.amazonaws.com/"

### Download ###

In [144]:
def download(name, cache_dir=os.path.join(".", "data")):
    assert name in DATA_HUB, f"{name} is not existed in {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split("/")[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, "rb") as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname
    print(f"Downloading {fname} from {url}")
    r = requests.get(url, stream=True, verify=True)
    with open(fname, "wb") as f:
        f.write(r.content)
    return fname

### Add download source ###

In [145]:
DATA_HUB["kaggle_house_train"] = (  # @save
    DATA_URL + "kaggle_house_pred_train.csv",
    "585e9cc93e70b39160e7921475f9bcd7d31219ce",
)

DATA_HUB["kaggle_house_test"] = (  # @save
    DATA_URL + "kaggle_house_pred_test.csv",
    "fa19780a7b011d9b009e8bff8e99922a8ee2eb90",
)

## Data Preprocessing ##

### Load dataset ###

In [146]:
train_data = pd.read_csv(download("kaggle_house_train"))
test_data = pd.read_csv(download("kaggle_house_test"))

all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

### Centering and fill NaN ###

In [147]:
numeric_features = all_features.dtypes[all_features.dtypes != "object"].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: ((x - x.mean()) / x.std())
)
all_features[numeric_features] = all_features[numeric_features].fillna(0)

### One hot ###

In [148]:
all_features = pd.get_dummies(all_features, dummy_na=True)

### To Tensor ###

In [149]:
n_train = train_data.shape[0]
train_features = torch.tensor(
    all_features[:n_train].astype("float32").values, dtype=torch.float32
)
test_features = torch.tensor(
    all_features[n_train:].astype("float32").values, dtype=torch.float32
)
train_labels = torch.tensor(
    train_data["SalePrice"].astype("float32").values, dtype=torch.float32
).reshape(-1, 1)

## Model ##

### Hyper params ###

In [150]:
num_inputs = train_features.shape[-1]
lr = 0.2
wd = 0.5
k = 10
num_epochs = 200
batch_size = 64

### Net Define ###

In [151]:
net = nn.Sequential(nn.Linear(num_inputs, 1))

### Function Define ###

In [152]:
loss = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=wd)

### Iter ###

In [153]:
def init_iter(data_arrays, batch_size, shuffle):
    temp = torch.utils.data.TensorDataset(*data_arrays)
    return torch.utils.data.DataLoader(temp, batch_size, shuffle)

### Log RMSE ###

In [154]:
def log_rmse(net, features, labels, loss=loss):
    temp = torch.clamp(net(features), 1, float("inf"))
    loss = torch.sqrt(loss(torch.log(temp), torch.log(labels)))
    return float(loss.item())

### Train ###

In [155]:
def train_1_round(
    net, train_features, train_labels, test_features, test_labels, optimizer, loss
):
    train_iter = init_iter(
        (train_features, train_labels), batch_size=batch_size, shuffle=True
    )
    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        test_loss = log_rmse(net, test_features, test_labels)
        ###print(f"      Loss(Log RMSE) in test dataset in {epoch+1}th round is {test_loss}")
    return test_loss


def get_k_fold_data(k, i, features, labels):
    assert k > 1
    lenth = len(features)
    assert lenth == len(labels)
    fold_size = lenth // k
    train_features = torch.cat(
        (features[: i * fold_size, :], features[(i + 1) * fold_size :, :]),
        dim=0,
    )
    train_labels = torch.cat(
        (labels[: i * fold_size, :], labels[(i + 1) * fold_size :, :]), dim=0
    )
    test_features = features[i * fold_size : (i + 1) * fold_size, :]
    test_labels = labels[i * fold_size : (i + 1) * fold_size, :]
    return train_features, train_labels, test_features, test_labels


def train(net, features, labels, optimizer, loss, k):
    print(f"Training in {k} folds:")
    for i in range(k):
        train_features, train_labels, test_features, test_labels = get_k_fold_data(
            k, i, features, labels
        )
        test_loss = train_1_round(
            net,
            train_features,
            train_labels,
            test_features,
            test_labels,
            optimizer,
            loss,
        )
        print(f"Loss(Log RMSE) in test dataset in {i+1}st fold is {test_loss}")
    return test_loss

## Main ##

In [None]:
def main():
    for i in range(5):
        print(
            f"Loss in big round{i+1} is {train(net,train_features,train_labels,optimizer,loss,k)}"
        )


if __name__ == "__main__":
    main()

Training in 10 folds:
Loss(Log RMSE) in test dataset in 1st fold is 1.5896515846252441
Loss(Log RMSE) in test dataset in 2st fold is 0.9664998054504395
Loss(Log RMSE) in test dataset in 3st fold is 0.5657472014427185
Loss(Log RMSE) in test dataset in 4st fold is 0.35001060366630554
Loss(Log RMSE) in test dataset in 5st fold is 0.24345116317272186
Loss(Log RMSE) in test dataset in 6st fold is 0.15192225575447083
Loss(Log RMSE) in test dataset in 7st fold is 0.15367907285690308
Loss(Log RMSE) in test dataset in 8st fold is 0.16232427954673767
Loss(Log RMSE) in test dataset in 9st fold is 0.20681971311569214
Loss(Log RMSE) in test dataset in 10st fold is 0.15162977576255798
Loss in big round1 is 0.15162977576255798
Training in 10 folds:
Loss(Log RMSE) in test dataset in 1st fold is 0.1620161086320877
Loss(Log RMSE) in test dataset in 2st fold is 0.13257470726966858
Loss(Log RMSE) in test dataset in 3st fold is 0.15466350317001343
Loss(Log RMSE) in test dataset in 4st fold is 0.89096856117

KeyboardInterrupt: 

## Debug ##

True