## 实战Kaggle比赛：房价预测

In [2]:
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sys
sys.path.append("..")
import d2lzh_pytorch as d2l

# Make float32 the default dtype for newly created floating-point tensors.
# torch.set_default_tensor_type is deprecated in recent PyTorch releases;
# torch.set_default_dtype is the supported way to set the default dtype.
torch.set_default_dtype(torch.float32)

**获取和读取数据集**

将数据从比赛中下载下来并用pandas读取

In [4]:
# Read the Kaggle house-price training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../../Datasets/kaggle_house/train.csv',
                     '../../Datasets/kaggle_house/test.csv')
)

In [5]:
# Dimensions of the training DataFrame as (rows, columns).
train_data.shape

(1460, 81)

In [6]:
# Dimensions of the test DataFrame as (rows, columns).
test_data.shape

(1459, 80)

查看前四个样本的前$4$个特征、后$2$个特征和标签：

In [7]:
# Preview the first four rows, showing the first four columns and the last three columns.
train_data.iloc[:4, [*range(4), -3, -2, -1]]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,WD,Normal,208500
1,2,20,RL,80.0,WD,Normal,181500
2,3,60,RL,68.0,WD,Normal,223500
3,4,70,RL,60.0,WD,Abnorml,140000


去掉$Id$列后，将训练集和测试集的$79$个特征按样本连结（训练集同时去掉最后一列标签$SalePrice$）。

In [11]:
# Stack the training rows on top of the test rows.
# Column 0 is skipped in both frames, and the last column of the training frame
# is skipped as well, so both pieces carry the same feature columns.
all_features = pd.concat(
    [
        train_data.iloc[:, 1:-1],
        test_data.iloc[:, 1:],
    ]
)
all_features

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


**预处理数据**

对连续数值的特征做标准化（standardization）：设该特征的均值为$\mu$，标准差为$\sigma$，将该特征的每个值先减去$\mu$再除以$\sigma$，得到标准化后的特征值。缺失的特征值用均值替代。

In [14]:
# Select the numeric columns by dtype. select_dtypes(include='number') keeps
# working when text columns are stored with a dedicated string dtype rather
# than 'object'; a `dtypes != 'object'` test would treat such columns as numeric.
numeric_features = all_features.select_dtypes(include='number').columns
# Standardize every numeric column: subtract its mean, then divide by its
# standard deviation. mean() and std() skip NaN, so missing entries stay NaN.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After standardization each column has mean 0, so filling the remaining
# missing values with 0 is equivalent to imputing them with the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
all_features

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.067320,RL,-0.202033,-0.217841,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,0.157619,WD,Normal
1,-0.873466,RL,0.501785,-0.072032,Pave,,Reg,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,-0.446848,-0.602858,WD,Normal
2,0.067320,RL,-0.061269,0.137173,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,0.157619,WD,Normal
3,0.302516,RL,-0.436639,-0.078371,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,-1.363335,WD,Abnorml
4,0.067320,RL,0.689469,0.518814,Pave,,IR1,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,2.131647,0.157619,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.419286,RM,-2.266564,-1.043758,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-0.078492,-1.363335,WD,Normal
1455,2.419286,RM,-2.266564,-1.049083,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-0.815205,-1.363335,WD,Abnorml
1456,-0.873466,RL,4.255477,1.246594,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,-1.363335,WD,Abnorml
1457,0.655311,RL,-0.342796,0.034599,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,MnPrv,Shed,1.144116,0.289865,-1.363335,WD,Normal


接下来将离散数值转成指示特征。举个例子，假设特征$MSZoning$里面有两个不同的离散值$RL$和$RM$，那么这一步转换将去掉$MSZoning$特征，并新加两个特征$MSZoning\_RL$和$MSZoning\_RM$，其值为$0$或$1$。如果一个样本原来在$MSZoning$里的值为$RL$，那么有$MSZoning\_RL=1$且$MSZoning\_RM=0$。

In [15]:
# dummy_na=True treats a missing value as a legitimate category and creates an
# indicator column for it as well.
# dtype=np.uint8 pins the indicator columns to numeric 0/1. Newer pandas
# releases default to bool, and a frame mixing float and bool columns converts
# to an object array that torch.tensor cannot consume.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=np.uint8)
all_features

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,0.067320,-0.202033,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.525112,0.580807,-0.29308,...,0,1,0,0,0,0,0,1,0,0
1,-0.873466,0.501785,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.572152,1.177910,-0.29308,...,0,1,0,0,0,0,0,1,0,0
2,0.067320,-0.061269,0.137173,0.646073,-0.507197,0.980053,0.848819,0.334770,0.097856,-0.29308,...,0,1,0,0,0,0,0,1,0,0
3,0.302516,-0.436639,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.572152,-0.494856,-0.29308,...,0,1,0,1,0,0,0,0,0,0
4,0.067320,0.689469,0.518814,1.355319,-0.507197,0.947040,0.753100,1.387248,0.468851,-0.29308,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.419286,-2.266564,-1.043758,-1.481667,1.289537,-0.043338,-0.682695,-0.572152,-0.969026,-0.29308,...,0,1,0,0,0,0,0,1,0,0
1455,2.419286,-2.266564,-1.049083,-1.481667,-0.507197,-0.043338,-0.682695,-0.572152,-0.415828,-0.29308,...,0,1,0,1,0,0,0,0,0,0
1456,-0.873466,4.255477,1.246594,-0.772420,1.289537,-0.373465,0.561660,-0.572152,1.717937,-0.29308,...,0,1,0,1,0,0,0,0,0,0
1457,0.655311,-0.342796,0.034599,-0.772420,-0.507197,0.682939,0.370221,-0.572152,-0.229233,-0.29308,...,0,1,0,0,0,0,0,1,0,0


最后，通过**values**属性得到**NumPy**格式的数据，并转成**Tensor**方便后面的训练。

In [17]:
# Number of training rows. all_features was built with the training rows first,
# so the first n_train rows are the training samples and the rest are test samples.
n_train = train_data.shape[0]
# Cast to float32 on the NumPy side before building the tensors: if the frame
# mixes float and bool columns, .values is an object array, which torch.tensor
# rejects; astype(np.float32) turns it into a plain numeric array first.
train_features = torch.tensor(
    all_features.iloc[:n_train].values.astype(np.float32), dtype=torch.float)
test_features = torch.tensor(
    all_features.iloc[n_train:].values.astype(np.float32), dtype=torch.float)
# NOTE(review): train_labels is a 1-D tensor of length n_train; confirm the
# training code reshapes it if the model output is 2-D (e.g. (n_train, 1)).
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float)

**训练模型**

使用一个基本的线性回归模型和平方损失函数来训练模型。