In [12]:
import pandas as pd
import torch

# 1. 读取训练数据 与 测试数据
def read_data(train_path, test_path):
    """Load the train/test CSV files and convert them into model-ready tensors.

    The feature columns of both files are stacked into one frame, so every
    preprocessing statistic (missing-value counts, mean/std, one-hot
    categories) is computed over train and test together.

    Layout expected by the slicing below: the first column of each file is an
    identifier and the last column of the training file is the label. The
    label is read from the ``"SalePrice"`` column and the test identifiers
    from the ``"Id"`` column.

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test CSV file.

    Returns:
        A tuple ``(train_features, train_labels, test_features, test_ID)``:
        float32 feature matrices for train and test, float32 labels of shape
        ``(n_train, 1)`` and int32 test ids of shape ``(n_test, 1)``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    # Feature columns only: skip the leading id column in both frames and the
    # trailing label column of the training frame.
    data = pd.concat((train_df.iloc[:, 1:-1], test_df.iloc[:, 1:]))
    print("data.shape=", data.shape)

    # Drop every feature that is missing in more than a third of the combined
    # (train + test) rows.
    missing = data.isnull().sum()
    sparse_cols = missing[missing > data.shape[0] // 3].index
    data = data.drop(columns=sparse_cols)
    print("after delete nan features, data.shape=", data.shape)

    # Standardize numeric columns to zero mean / unit std, then replace the
    # remaining NaNs with 0 (the post-standardization mean).
    # is_numeric_dtype is used instead of comparing against 'object' so that
    # text columns stored with a dedicated string dtype are not mistaken for
    # numeric ones.
    numeric = [col for col, dtype in data.dtypes.items()
               if pd.api.types.is_numeric_dtype(dtype)]
    if numeric:
        data[numeric] = data[numeric].apply(lambda x: (x - x.mean()) / x.std())
        data[numeric] = data[numeric].fillna(0)

    # One-hot encode the categorical columns; NaN gets its own indicator
    # column, which considerably increases the number of features.
    data = pd.get_dummies(data, dummy_na=True)
    print("after one-hot operation, data.shape=", data.shape)

    # Cast to a single float dtype before leaving pandas: recent pandas emits
    # boolean dummy columns, and a float/bool mix would otherwise produce an
    # object array that torch.tensor cannot convert.
    features = data.astype("float32").to_numpy()
    train_size = train_df.shape[0]
    train_features = torch.tensor(features[:train_size], dtype=torch.float32)
    test_features = torch.tensor(features[train_size:], dtype=torch.float32)
    train_labels = torch.tensor(train_df["SalePrice"].values.reshape(-1, 1), dtype=torch.float32)
    test_ID = torch.tensor(test_df["Id"].values.reshape(-1, 1), dtype=torch.int32)
    return train_features, train_labels, test_features, test_ID


In [13]:
# Build the feature/label tensors from the raw train and test CSV files.
train_features, train_labels, test_features, test_ID = read_data(
    train_path="../data/train.csv",
    test_path="../data/test.csv",
)

data.shape= (2919, 79)
after delete nan features, data.shape= (2919, 74)
after one-hot operation, data.shape= (2919, 308)


In [14]:
# Display the shapes of the train features, train labels and test features.
tuple(t.shape for t in (train_features, train_labels, test_features))

(torch.Size([1460, 308]), torch.Size([1460, 1]), torch.Size([1459, 308]))

In [62]:
# 模型采用基本的线性模型
from torch import nn
from torch.utils import data
import math
# Define and initialize the model.
# Layer dimensions: one input per feature column, one output value.
input_size, output_size = train_features.shape[1], 1
def get_model(in_features=None, out_features=None):
    """Build a single-layer linear model with normally initialized weights.

    Args:
        in_features: Number of input features. When omitted, the
            notebook-level ``input_size`` is used.
        out_features: Number of outputs. When omitted, the notebook-level
            ``output_size`` is used.

    Returns:
        An ``nn.Sequential`` holding one ``nn.Linear`` layer. Its weight is
        drawn from a normal distribution with mean 0 and standard deviation
        ``sqrt(2 / (in_features + out_features))``; the bias is not touched
        here and keeps whatever ``nn.Linear`` assigns by default.
    """
    # Fall back to the notebook globals so existing `get_model()` calls keep
    # working unchanged.
    if in_features is None:
        in_features = input_size
    if out_features is None:
        out_features = output_size

    model = nn.Sequential(nn.Linear(in_features, out_features))
    delta = math.sqrt(2 / (in_features + out_features))

    def init_param(m):
        # isinstance (rather than an exact type comparison) also covers
        # subclasses of nn.Linear.
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0, std=delta)

    model.apply(init_param)
    return model
model = get_model()
# Mini-batch loader over the training tensors; shuffling is enabled.
batch_size = 100
train_iter = data.DataLoader(
    dataset=data.TensorDataset(train_features, train_labels),
    batch_size=batch_size,
    shuffle=True,
)

In [41]:
# Loss function and optimizer: mean squared error minimized with plain SGD.
loss = nn.MSELoss()
optim = torch.optim.SGD(model.parameters(), lr=0.005)
# Training loop: mini-batch gradient descent over `train_iter`.
epoch_num = 5000
for epoch in range(epoch_num):
    for X, y in train_iter:
        l = loss(model(X), y)
        optim.zero_grad()
        l.backward()
        optim.step()
    if epoch % 500 == 0:
        # Progress report on the full training set. This value is only
        # printed, so skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            val = loss(model(train_features), train_labels)
        print(f"in epoch {epoch}, loss = {val:.6f}")


in epoch 0, loss = 484838912.000000
in epoch 500, loss = 481690816.000000
in epoch 1000, loss = 481189056.000000
in epoch 1500, loss = 477411840.000000
in epoch 2000, loss = 475435008.000000
in epoch 2500, loss = 476512960.000000
in epoch 3000, loss = 472008128.000000
in epoch 3500, loss = 470398144.000000
in epoch 4000, loss = 468623712.000000
in epoch 4500, loss = 467889952.000000


In [42]:
# Predict on the test features; inference does not need an autograd graph.
with torch.no_grad():
    test_labels = model(test_features)

In [59]:
# Assemble the submission table column by column so the integer ids keep
# their own dtype instead of being concatenated with the float predictions
# and cast back to integers afterwards.
test_prdict = pd.DataFrame({
    "Id": test_ID.reshape(-1).numpy(),
    "SalePrice": test_labels.detach().reshape(-1).numpy(),
})
# Show the id dtype and the first rows as a sanity check.
test_prdict["Id"].dtype, test_prdict.head()

(dtype('int32'),
      Id      SalePrice
 0  1461  109414.078125
 1  1462  147857.984375
 2  1463  182682.906250
 3  1464  192471.359375
 4  1465  207182.656250)

In [60]:
# Write the predictions to disk without the DataFrame index column.
test_prdict.to_csv(path_or_buf="../output/linear_sgd_001.csv", index=False)

以上使用的技巧都很基本，唯一的trick在于直接舍弃了一部分缺失值过多（超过1/3）的feature；在模型基本结构、损失函数以及优化算法上并没有什么特别之处，训练时也只是采用基本的小批量随机梯度下降算法。提交到Kaggle后给出的结果是2817/3984，score=0.18530
使用RMSE评分.
我看到, 在教程中, 对数据的预处理是：缺失值视为特征的一部分，统统归一化、one-hot编码，也就是在特征选取上并没有什么特别的
在模型选择上，基本结构是线性模型
在损失函数上，加入了RMSE评估
在训练法则上采取了K-折交叉验证方法