In [185]:
import torch
from torch.utils import data
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

def filter_missing(cols, data, rate):
    data = data.drop(columns=cols, axis=1)
    missing = data.isnull().sum()
    missing = missing[missing>int(data.shape[0]*rate)]
    data = data.drop(columns=dict(missing).keys(), axis=1)
    return data

def data_preprocess(train_data, test_data, rate=1, pca=False, pca_dim=100):
    train_labels = train_data["SalePrice"]
    # 剔除不需要的列, 例如Id, SalePrice以及Nan值过多的列
    train_features = filter_missing(["Id", "SalePrice"], train_data, rate)
    test_features = filter_missing(["Id"], test_data, rate)
    # 拼接起来统一处理, 包括数值特征均值归一化, 0补充Nan值, 分类特征one-hot编码
    all_features = pd.concat([train_features, test_features], axis=0)
    numeric = [f for f in all_features.columns if all_features.dtypes[f] != 'object']
    all_features[numeric] = all_features[numeric].apply(lambda x: (x-x.mean())/x.std())
    all_features[numeric] = all_features[numeric].fillna(0)
    all_features = pd.get_dummies(all_features, dummy_na=True)
    train_size = train_data.shape[0]
    # 进行PCA降维, 可选
    feature = all_features.values
    if(pca):
        feature, floss = k_pca(feature, pca_dim)
        print(f"feature loss is {floss:.5f}")
    train_X = torch.tensor(feature[:train_size, :], dtype=torch.float32)
    test_X = torch.tensor(feature[train_size:, :], dtype=torch.float32)
    train_y = torch.tensor(train_labels.values, dtype=torch.float32)
    return train_X, train_y, test_X

def k_pca(X, k):
    X = (X-X.mean())/X.std()
    X = np.matrix(X)
    Sigma = X.T*X/X.shape[0]
    U, S, V = np.linalg.svd(Sigma)
    new_f = X*np.matrix(U[:, :k])
    f_loss = float(1-S[:k].sum()/S[:].sum())
    return new_f, f_loss

def rmsle(y_hat, y):
    inner = torch.pow((torch.log(y_hat+1)-torch.log(y+1)), 2).sum()/len(y)
    return math.sqrt(inner)

# 以后初始化就固定这个模式xv_init+apply
def get_model(n_in, n_out):
    model = nn.Sequential(nn.Linear(n_in, n_out))
    def xv_init(m):
        if type(m) == nn.Linear:
            n_in = m.weight.data.shape[1]
            n_out = m.weight.data.shape[0]
            std = math.sqrt(2/(n_in+n_out))
            nn.init.normal_(m.weight, std=std)
    model.apply(xv_init)
    return model

# MSELoss
def get_loss():
    return nn.MSELoss()

# optim的使用Adam(相对于SGD的优化版), 默认L2惩罚
def get_optim(param, lr=0.001, punish=0):
    return torch.optim.Adam(param, lr=lr, weight_decay=punish)

def shuffle(X, y):
    y = y.reshape(-1, 1)
    data = torch.concat([X, y], dim=1)
    data = data[torch.randperm(data.size(0))] # 按行打乱
    return data[:, :-1], data[:, -1]

def split(X, y, i, K=10):
    """X[m, d], y[m], 将X, y分成K块, 第i块作为valid, 其余的作为train集"""
    y = y.reshape(-1, 1)
    data = torch.concat([X, y], dim=1)
    chunks = torch.chunk(data, K)
    valid_ = chunks[i]
    train_chunk_list = [chunks[j] for j in range(K) if j!= i]
    train_ = torch.concat(train_chunk_list, dim=0)
    return train_[:, :-1], train_[:, -1], valid_[:, :-1], valid_[:, -1]

def train(train_features, train_labels, lr=0.001, punish=0, m=10, K=10, batch_size=100):
    features_num = train_features.shape[1]
    model = get_model(features_num, 1)
    loss = get_loss()
    optim = get_optim(model.parameters(), lr=lr, punish=punish)
    val_rmsle, val_rmse = [], []
    # 进行m次k折训练
    for i in range(m):
        # 对 feature 和 label 打乱, 注意对应不变
        trf, trl = shuffle(train_features, train_labels)
        # 使用chunk()函数
        for j in range(K):
            train_X, train_y, val_X, val_y = split(trf, trl, j, K)
            train_y = train_y.reshape(-1,1)
            val_y = val_y.reshape(-1,1)
            # 构造train_dataset, 进行小批量随机梯度下降训练
            train_dataset = data.TensorDataset(train_X, train_y)
            train_iter = data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
            for X, y in train_iter:
                l = loss(model(X), y)
                optim.zero_grad()
                l.backward()
                optim.step()
            # 计算验证集的rmsle损失
            val_rmsle.append(float(rmsle(model(val_X), val_y)))
            val_rmse.append(float(loss(model(val_X), val_y)))
        if i%10 == 0:
            print(f"第{i+1}次{K}折训练后, 验证集rmsle = {val_rmsle[-1]:.5f}")
    return model, val_rmsle, val_rmse

In [195]:
# 数据预处理参数
rate = 0.3 # 丢弃缺失值数量比例超过rate的特征
pca = False # 是否进行PCA算法降维
pca_dim=250 # 降低的维数
# 读取和预处理数据
train_X, train_y, test_X = data_preprocess(
    train_data, 
    test_data, 
    rate=rate,
    pca=pca,
    pca_dim=pca_dim
)
print(train_X.shape, train_y.shape, test_X.shape)
torch.isnan(train_X).any(), torch.isinf(train_X).any()

torch.Size([1460, 308]) torch.Size([1460]) torch.Size([1459, 308])


(tensor(False), tensor(False))

In [203]:
# 训练参数
lr = 0.75  # Adam优化函数学习率
K = 12    # K-折交叉验证的K值
m = 120   # 进行K-折交叉验证的次数
batch_size = 200  # 小批量梯度下降的批量大小
# 训练模型
model, val_rmsle, val_rmse = train(
    train_X, 
    train_y, 
    lr=lr, 
    K=K, 
    m=m, 
    batch_size=batch_size
)
float(rmsle(model(train_X), train_y))

第1次12折训练后, 验证集rmsle = 4.26949
第11次12折训练后, 验证集rmsle = 1.92701
第21次12折训练后, 验证集rmsle = 1.26815
第31次12折训练后, 验证集rmsle = 0.92468
第41次12折训练后, 验证集rmsle = 0.71285
第51次12折训练后, 验证集rmsle = 0.47799
第61次12折训练后, 验证集rmsle = 0.34223
第71次12折训练后, 验证集rmsle = 0.27649
第81次12折训练后, 验证集rmsle = 0.19938
第91次12折训练后, 验证集rmsle = 0.16316
第101次12折训练后, 验证集rmsle = 0.19451
第111次12折训练后, 验证集rmsle = 0.13861


21.26569747071597

In [201]:
# 保存模型和实验结果
# 进行预测
import time
t = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())
test_predict = model(test_X).detach()
test_Ids = torch.tensor(test_data["Id"].values, dtype=int)
test_Ids = test_Ids.reshape(-1, 1)
submmision = torch.concat([test_Ids, test_predict], dim=1).numpy()
submmision = pd.DataFrame(submmision, columns=["Id", "SalePrice"])
submmision["Id"] = submmision["Id"].astype(int)
submmision.to_csv(f"../output/{t}.csv", index=False)
model_name =  f"../model/dim{pca_dim}rate{rate}_lr{lr}_K{K}_m{m}_bs{batch_size}"
torch.save(model, model_name)