In [1]:
# Download the competition dataset archive with the Kaggle CLI.
# NOTE(review): the output saved with this cell records an SSL connection failure,
# so confirm the .zip archive is actually present before running the extraction cell.
!kaggle competitions download -c house-prices-advanced-regression-techniques

HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /kaggle-competitions-data/kaggle-v2/5407/868283/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1765562150&Signature=Dc7Iv3rP%2FD99rbQvKqFevVymu2xU9FzdOTTMCV8PNsP8ygqW8JsTnYhl%2Fsza2u7Gj7dn7O8D557i2zEfoLeaR7foQf79Hd3kKegl6aHX%2FOsyvnGoyt7HiX%2F%2FAUTX3Ys%2Bas9zUxDO8GpSM3LcIHQnlVkMCCYeza3iT66e%2BJ86iuAaklfrBcbQaTW5k4oEJNIGKXEv4n3ZNxqqZb7wjT9%2B8QebUt4yl33D2%2B2zZZ57458dWmd%2F%2BZ59p9LgaT%2FKTZ2JNKVLTViVw%2FS5q7C4%2FQx5SFLk5K1T7arX4zylTmwVyuFENmi2GYVcppwSAar%2BPlBTk54cN1zbiIP8shnX3%2BQ1Hg%3D%3D&response-content-disposition=attachment%3B+filename%3Dhouse-prices-advanced-regression-techniques.zip (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1147)')))


In [2]:
import zipfile
import os

In [3]:
# Extract the downloaded competition archive into a directory of the same name.
path = "house-prices-advanced-regression-techniques"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)
# Parent directory of `path`; this is the empty string for a bare relative name.
base_dir = os.path.dirname(path)
# The context manager closes the archive handle even if extraction raises,
# instead of leaving the file open for the lifetime of the kernel.
with zipfile.ZipFile("house-prices-advanced-regression-techniques.zip", 'r') as fp:
    fp.extractall(path)

In [4]:
# Load the training and test tables from the extracted archive.
import pandas as pd

train_data = pd.read_csv(f"{path}/train.csv")
test_data = pd.read_csv(f"{path}/test.csv")

# Report (rows, columns) for each table, training first.
for frame in (train_data, test_data):
    print(frame.shape)

# DataFrame.iloc selects by integer position (row/column numbers), not by column name.
# Preview the first four rows, showing the first four and the last three columns.
preview_columns = [0, 1, 2, 3, -3, -2, -1]
print(train_data.iloc[:4, preview_columns])

(1460, 81)
(1459, 80)
   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000


In [5]:
# --- Preprocessing: continuous features ---
# 1. Build one feature table from both CSVs. Column 0 (the Id) is dropped from both
#    frames because it carries no signal for training; the last training column
#    (the label) is dropped as well, so `1:-1` on train and `1:` on test select the
#    same set of columns.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# 2. Standardize every numeric feature to zero mean and unit variance:
#    (x - mean) / std. A common scale makes optimization easier (no feature dominates
#    the gradient purely because of its units) and, since we do not know in advance
#    which features matter, avoids penalizing one coefficient more than another.
#    NOTE: mean/std are computed over train and test rows together.
# Select columns by numeric kind rather than by "not object", so that string,
# categorical or boolean dtypes are never passed through the z-score below.
numeric_features_index = all_features.select_dtypes(include="number").columns
# DataFrame.apply calls the lambda once per column (x is a Series).
all_features[numeric_features_index] = all_features[numeric_features_index].apply(
    lambda x: (x - x.mean()) / x.std()
)

# 3. After standardization each numeric column has mean 0, so filling missing values
#    with 0 is the same as imputing the column mean.
all_features[numeric_features_index] = all_features[numeric_features_index].fillna(0)

# --- Preprocessing: discrete features ---
# One-hot encode the remaining non-numeric columns: every distinct value becomes its
# own 0/1 indicator column. dummy_na=True treats a missing value as a category of its
# own and adds an indicator column for it.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=int)

# Only the last expression of a cell is displayed, so the shape has to be printed
# explicitly; a bare `all_features.shape` here would be silently discarded.
print(all_features.shape)
all_features.dtypes

MSSubClass               float64
LotFrontage              float64
LotArea                  float64
OverallQual              float64
OverallCond              float64
                          ...   
SaleCondition_Alloca       int64
SaleCondition_Family       int64
SaleCondition_Normal       int64
SaleCondition_Partial      int64
SaleCondition_nan          int64
Length: 330, dtype: object

In [None]:
import torch

# Convert the pandas data to float32 tensors for training: take the underlying
# NumPy arrays and wrap them with torch.tensor.
n_train = len(train_data)

# all_features was built as [train rows, test rows], so the first n_train rows are
# the training features and everything after them is the test features.
train_rows = all_features.iloc[:n_train]
test_rows = all_features.iloc[n_train:]
train_features = torch.tensor(train_rows.values, dtype=torch.float32)
test_features = torch.tensor(test_rows.values, dtype=torch.float32)

# Labels: the SalePrice column reshaped into a single-column 2-D array.
sale_prices = train_data["SalePrice"].values.reshape(-1, 1)
train_labels = torch.tensor(sale_prices, dtype=torch.float32)

In [None]:
from torch import nn

# Baseline model setup.
# Width of the model input: the number of columns in the training feature tensor.
in_features = train_features.size(1)
# Mean squared error, shared as the module-level loss function.
loss = nn.MSELoss()

def get_net() -> nn.Module:
    """Build the baseline model: one linear layer producing a scalar output.

    The input width is read from the module-level ``in_features``.

    Returns:
        nn.Module: An ``nn.Sequential`` containing a single
        ``nn.Linear(in_features, 1)`` layer.
    """
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)

In [None]:
def log_rmse(net: nn.Module, features: torch.Tensor, labels: torch.Tensor) -> float:
    """Return the root-mean-squared error between log-predictions and log-labels.

    Predictions are clamped to a minimum of 1 before the logarithm is taken, so the
    log of a prediction is never negative or undefined. The labels are passed to
    ``torch.log`` unmodified, so they are assumed to be strictly positive.

    This is an evaluation metric only: it runs under ``torch.no_grad()`` so no
    autograd graph is built, and it computes the mean squared error itself rather
    than through the module-level ``loss`` object, so rebinding ``loss`` elsewhere
    cannot change what this metric measures.

    Args:
        net: Model that maps ``features`` to predictions.
        features: Input tensor passed to ``net``.
        labels: Target tensor; expected to have the same shape as ``net(features)``.

    Returns:
        float: ``sqrt(mean((log(clamp(pred, min=1)) - log(labels)) ** 2))``.
    """
    with torch.no_grad():
        # Clamp from below only; there is no upper bound on predictions.
        clipped_preds = torch.clamp(net(features), min=1.0)
        # Default reduction is "mean", matching nn.MSELoss().
        mse = nn.functional.mse_loss(torch.log(clipped_preds), torch.log(labels))
        return torch.sqrt(mse).item()