In [40]:
import hashlib
import os
import tarfile
import zipfile
import requests
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

# Registry of downloadable datasets: maps a dataset name to a (url, sha1_hex) tuple.
DATA_HUB = {}
# Base URL that registry entries prepend to their file name.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

In [41]:
# Register the train and test CSV files, each as (download URL, expected SHA-1 hex digest).
DATA_HUB.update({
    'kaggle_house_train': (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce',
    ),
    'kaggle_house_test': (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90',
    ),
})

In [42]:
def download(name: str, cache_dir: str = os.path.join("..", "data")) -> str:
    """Download a file registered in DATA_HUB and return its local path.

    If a file with the expected name already exists in ``cache_dir`` and its
    SHA-1 digest matches the registered one, the cached copy is reused and no
    network request is made. Otherwise the file is (re-)downloaded.

    Args:
        name: Key of the dataset in ``DATA_HUB``.
        cache_dir: Directory for the local copy; created if missing.

    Returns:
        Path of the local file.

    Raises:
        AssertionError: If ``name`` is not registered in ``DATA_HUB``.
        requests.HTTPError: If the server answers with an error status.

    Note:
        A freshly downloaded file is not checked against the registered
        digest; a corrupted download is only detected (and replaced) on the
        next call.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        # Hash the cached file in 1 MiB chunks so large files are never
        # loaded into memory at once.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'Downloading {fname} from {url}')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the cache file so an HTTP error page is never
        # stored as if it were the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Actually stream the body instead of buffering it via r.content.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname

In [43]:
def download_extract(name: str, folder=None) -> str:
    """Download a zip/tar archive registered in DATA_HUB and extract it.

    The archive is extracted into the directory that holds the downloaded
    file.

    Args:
        name: Key of the archive in ``DATA_HUB``.
        folder: Optional sub-directory name (relative to the extraction
            directory) to return instead of the default.

    Returns:
        ``<extraction dir>/<folder>`` when ``folder`` is given, otherwise the
        downloaded file's path with its last extension stripped.

    Raises:
        AssertionError: If the downloaded file is not a .zip/.tar/.gz file.
    """
    fname = download(name)
    base_dir: str = os.path.dirname(fname)
    # The extension must come from the downloaded file's path; the hub key
    # (e.g. 'kaggle_house_train') carries no extension at all.
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar file can be unzip'
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with archives from a trusted source.
    with fp:  # close the archive handle once extraction is done
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all() -> None:
    """Download every dataset registered in DATA_HUB using download()'s default cache directory."""
    for dataset_name in DATA_HUB.keys():
        download(dataset_name)

In [44]:
# Fetch the two registered CSV files (reusing cached copies when their digests match) and parse them.
train_data: pd.DataFrame = pd.read_csv(download('kaggle_house_train'))
test_data: pd.DataFrame = pd.read_csv(download('kaggle_house_test'))

In [45]:
# Stack the train and test feature columns vertically into one frame so both get identical preprocessing.
# The first column of each frame is dropped (presumably a row id -- verify), as is the last training
# column (presumably the SalePrice label, which is read from train_data when building the label tensor).
all_features: pd.DataFrame = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

In [46]:
# Select the numeric columns: every column whose dtype is not 'object'
# (object-dtype columns presumably hold the string/categorical features).
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize each numeric column: subtract its mean and divide by its standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / x.std())
# Replace missing values with 0; after standardization 0 is the column mean, so this is mean imputation.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [47]:
# One-hot encode the remaining (string) columns; dummy_na=True adds an extra indicator column for missing values.
# NOTE(review): the dtype of the indicator columns depends on the pandas version (bool by default on
# pandas >= 2.0) -- confirm that the later tensor conversion casts to a numeric dtype.
all_features = pd.get_dummies(all_features, dummy_na=True)
# Display the resulting (rows, columns) shape.
all_features.shape

(2919, 331)

In [48]:
# The first n_train rows of all_features come from the training CSV, the rest from the test CSV.
n_train = train_data.shape[0]
# Cast to float32 explicitly while converting to NumPy: when the frame mixes float columns with
# bool indicator columns (the get_dummies default on pandas >= 2.0), `.values` yields an object
# array that torch.tensor cannot convert.
train_features = torch.tensor(all_features[:n_train].to_numpy(dtype="float32"), dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].to_numpy(dtype="float32"), dtype=torch.float32)
# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data["SalePrice"].values.reshape(-1, 1), dtype=torch.float32)

In [None]:
# Mean-squared-error criterion, kept as a notebook-level global.
loss = nn.MSELoss()
# Number of feature columns per example (second dimension of train_features).
in_features = train_features.shape[1]

def get_net(in_features: int) -> nn.Module:
    """Build the model: a single linear layer mapping ``in_features`` inputs to one output.

    Args:
        in_features: Number of input features per example.

    Returns:
        An ``nn.Sequential`` that wraps one ``nn.Linear(in_features, 1)``.
    """
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)

In [None]:
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Model that is called on ``features`` to produce predictions.
        features: Input tensor passed to ``net``.
        labels: Target tensor; its logarithm is taken, so values must be positive.

    Returns:
        The log-RMSE as a Python float.
    """
    # Pure evaluation: only a float leaves this function, so skip autograd
    # bookkeeping for the forward pass.
    with torch.no_grad():
        # Clamp predictions to at least 1 so the logarithm is defined and non-negative.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        # `loss` is the notebook-level MSE criterion; its square root gives the RMSE.
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()

In [None]:
def train(net: nn.Module, num_epochs, learning_rate, weight_decay, batch_size, train_features, train_labels, test_features, test_labels = None) -> tuple[list, list]:
    """Train ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize in place.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Weight-decay coefficient passed to Adam.
        batch_size: Mini-batch size used by the training iterator.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs; only used when ``test_labels`` is given.
        test_labels: Held-out targets, or ``None`` to skip held-out evaluation.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE on the training data and on
        the held-out data. ``test_ls`` is empty when ``test_labels`` is ``None``.
    """
    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    optimizer: torch.optim.Adam = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    for _ in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            # Optimization uses the notebook-level `loss` on the raw predictions;
            # log-RMSE is only used for reporting.
            batch_loss: torch.Tensor = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()
        # Evaluate once per epoch on the full sets.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    # Return only after all epochs have run; returning inside the epoch loop
    # would stop training after the first epoch.
    return train_ls, test_ls

