In [62]:
%matplotlib inline
import pandas as pd
import torch
import numpy as np
from torch import nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset

train_data = pd.read_csv("/Users/WAIT/Projects/house_prices_regression/data/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/Users/WAIT/Projects/house_prices_regression/data/house-prices-advanced-regression-techniques/test.csv")

In [63]:
features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

In [65]:
#数据预处理
numeric_features_index = features.dtypes[features.dtypes != 'object'].index
features[numeric_features_index] = features[numeric_features_index].apply(
    lambda x: (x - x.mean()) / (x.std()))
features[numeric_features_index] = features[numeric_features_index].fillna(0)
features = pd.get_dummies(features, dummy_na=True)

n_train = train_data.shape[0]
train_features = torch.tensor(features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(
    train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)

In [66]:
#模型构建以及损失函数
loss = nn.MSELoss()
in_features = train_features.shape[1]

def get_net():
    net = nn.Sequential(
        nn.Linear(in_features,1))
    return net

def log_rmse(net, features, labels):
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds),
                           torch.log(labels)))
    return rmse.item()

In [89]:
#模型训练
def train(net, train_features, train_labels, 
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_dataset = TensorDataset(train_features, train_labels)
    train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr = learning_rate,
                                 weight_decay = weight_decay)
    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
    return train_ls, test_ls

In [95]:
train_losses, trained_model = train(
    get_net(), train_features, train_labels, 
    num_epochs=100, learning_rate=5, weight_decay=0, batch_size=64
)
print(train(
    get_net(), train_features, train_labels, 
    num_epochs=100, learning_rate=5, weight_decay=0, batch_size=64
))
print(f"最终训练的RMSE: {train_losses[-1]:.4f}")

([8.853338241577148, 8.66311264038086, 8.563430786132812, 8.506301879882812, 8.462475776672363, 8.434672355651855, 8.41757583618164, 8.408380508422852, 8.390048027038574, 8.390693664550781, 8.376922607421875, 8.36920166015625, 8.358766555786133, 8.352455139160156, 8.351505279541016, 8.353009223937988, 8.350590705871582, 8.34039306640625, 8.342693328857422, 8.33510971069336, 8.330342292785645, 8.330846786499023, 8.327282905578613, 8.329651832580566, 8.319137573242188, 8.308155059814453, 8.306154251098633, 8.309293746948242, 8.298196792602539, 8.303515434265137, 8.299491882324219, 8.307718276977539, 8.304859161376953, 8.306768417358398, 8.303330421447754, 8.301942825317383, 8.296956062316895, 8.295376777648926, 8.28995132446289, 8.283586502075195, 8.276979446411133, 8.268641471862793, 8.25793743133545, 8.264036178588867, 8.260812759399414, 8.256399154663086, 8.245905876159668, 8.248190879821777, 8.234000205993652, 8.22231674194336, 8.207897186279297, 8.203312873840332, 8.198394775390625,