## 1.使用pandas 读取数据

In [3]:
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn

# Load the training and test splits of the house-price dataset from local CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './house-prices-advanced-regression-techniques/train.csv',
        './house-prices-advanced-regression-techniques/test.csv',
    )
)

# Report the (rows, columns) shape of each split, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(1460, 81)
(1459, 80)


## 2.查看前十个样本的前四个和最后两个特征，以及相应的房价（SalePrice）

In [21]:
# Preview the first 10 rows, restricted to the first four columns and the
# last three columns (the final one is the SalePrice label).
print(train_data.head(10).iloc[:, [0, 1, 2, 3, -3, -2, -1]])

   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000
4   5          60       RL         84.0       WD        Normal     250000
5   6          50       RL         85.0       WD        Normal     143000
6   7          20       RL         75.0       WD        Normal     307000
7   8          60       RL          NaN       WD        Normal     200000
8   9          50       RM         51.0       WD       Abnorml     129900
9  10         190       RL         50.0       WD        Normal     118000


## 3.第一个特征是 Id，先删除；同时去掉训练集的标签列 SalePrice，并把训练集和测试集的特征合并在一起

In [22]:
# Build one combined feature table by stacking the two splits row-wise:
#   - training split: columns 1:-1, i.e. without the first column (Id) and
#     without the last column (the SalePrice label);
#   - test split: columns 1:, i.e. without the first column (Id) only.
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]],
    axis=0,
)
# Show the first 10 rows of the first four and last three feature columns.
all_features.head(10).iloc[:, [0, 1, 2, 3, -3, -2, -1]]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,2008,WD,Normal
1,20,RL,80.0,9600,2007,WD,Normal
2,60,RL,68.0,11250,2008,WD,Normal
3,70,RL,60.0,9550,2006,WD,Abnorml
4,60,RL,84.0,14260,2008,WD,Normal
5,50,RL,85.0,14115,2009,WD,Normal
6,20,RL,75.0,10084,2007,WD,Normal
7,60,RL,,10382,2009,WD,Normal
8,50,RM,51.0,6120,2008,WD,Abnorml
9,190,RL,50.0,7420,2008,WD,Normal


## 4.先将数值特征重新缩放到零均值和单位方差来标准化数据，再将缺失值替换为 0（标准化后各特征的均值为 0，因此等价于用相应特征的平均值填充）

In [23]:
# Collect the names of the numeric feature columns.
# select_dtypes(include='number') is used instead of `dtypes != 'object'`
# because the latter treats every non-object dtype as numeric: text columns
# stored under a dedicated string dtype (or category / bool columns) would be
# selected too, and the arithmetic below would then fail on them.
numeric_features = all_features.select_dtypes(include='number').columns
print(numeric_features)
print(len(numeric_features))

# Standardize every numeric column to zero mean and unit standard deviation
# (z-score). mean() and std() skip NaN entries, so missing values stay NaN here.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std())
)
# After standardization each column has mean 0, so replacing NaN with 0 is
# equivalent to imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# Show the first 10 rows of the first four and last three feature columns.
all_features.iloc[0:10, [0, 1, 2, 3, -3, -2, -1]]

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')
36


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,YrSold,SaleType,SaleCondition
0,0.06732,RL,-0.184443,-0.217841,0.157619,WD,Normal
1,-0.873466,RL,0.458096,-0.072032,-0.602858,WD,Normal
2,0.06732,RL,-0.055935,0.137173,0.157619,WD,Normal
3,0.302516,RL,-0.398622,-0.078371,-1.363335,WD,Abnorml
4,0.06732,RL,0.629439,0.518814,0.157619,WD,Normal
5,-0.167877,RL,0.672275,0.50043,0.918095,WD,Normal
6,-0.873466,RL,0.243916,-0.010665,-0.602858,WD,Normal
7,0.06732,RL,0.0,0.027119,0.918095,WD,Normal
8,-0.167877,RM,-0.784145,-0.513264,0.157619,WD,Abnorml
9,3.124875,RL,-0.826981,-0.348436,0.157619,WD,Normal


## 5.处理离散值：用独热编码（one-hot encoding）替换它们

In [27]:
# One-hot encode the non-numeric columns. dummy_na=True adds an extra
# "<feature>_nan" indicator column per encoded feature for missing values.
# Multiplying by 1 turns the boolean indicator columns into 0/1 integers.
all_features = pd.get_dummies(all_features, dummy_na=True).mul(1)
print(all_features.shape)

# Show the first 10 rows of the first four and last three columns after encoding.
all_features.head(10).iloc[:, [0, 1, 2, 3, -3, -2, -1]]

(2919, 330)


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,0.06732,-0.184443,-0.217841,0.646073,1,0,0
1,-0.873466,0.458096,-0.072032,-0.063174,1,0,0
2,0.06732,-0.055935,0.137173,0.646073,1,0,0
3,0.302516,-0.398622,-0.078371,0.646073,0,0,0
4,0.06732,0.629439,0.518814,1.355319,1,0,0
5,-0.167877,0.672275,0.50043,-0.77242,1,0,0
6,-0.873466,0.243916,-0.010665,1.355319,1,0,0
7,0.06732,0.0,0.027119,0.646073,1,0,0
8,-0.167877,-0.784145,-0.513264,0.646073,0,0,0
9,3.124875,-0.826981,-0.348436,-0.77242,1,0,0


## 6.从pandas格式中提取numpy格式，并将其转换为张量表示

In [29]:
# Number of training rows. all_features was built by stacking the training
# rows first and the test rows after them, so this is the split offset.
n_train = len(train_data)
print(n_train)

# Split the combined feature table back into its two parts and convert each
# to a float32 tensor.
train_features = torch.tensor(
    all_features.iloc[:n_train].to_numpy(), dtype=torch.float32
)
test_features = torch.tensor(
    all_features.iloc[n_train:].to_numpy(), dtype=torch.float32
)

# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(
    train_data['SalePrice'].to_numpy().reshape(-1, 1), dtype=torch.float32
)


1460


## 7.定义损失函数和模型

In [32]:
# Mean-squared-error criterion (also used by log_rmse).
loss = nn.MSELoss()
# Model input width: the number of columns in the training feature tensor.
in_features = train_features.size(1)


def get_net():
    """Build a fresh linear regression model.

    Returns:
        An nn.Sequential containing a single nn.Linear(in_features, 1) layer.
    """
    return nn.Sequential(nn.Linear(in_features, 1))

## 8.误差度量：对数均方根误差（log RMSE）

In [33]:
def log_rmse(net,features,labels):
    """Return the RMSE between the log of the predictions and the log of the labels.

    Args:
        net: callable model; net(features) must return a tensor of predictions.
        features: input tensor passed to net.
        labels: target tensor; torch.log is applied to it, so values are
            expected to be positive.

    Returns:
        The metric as a Python float.
    """
    # This is a pure metric: the result leaves as a Python float, so no
    # gradient can flow through it. Skip autograd bookkeeping entirely.
    with torch.no_grad():
        # Clamp predictions to at least 1 so that the logarithm is
        # well-defined (and non-negative) even for predictions <= 0.
        clipped_preds = torch.clamp(net(features), min=1.0)
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()

## 9.训练模型