In [24]:
import os
import pandas as pd
import torch

### 2.2.1 读取数据集

In [19]:
# Create a tiny example dataset as a CSV file under ../data.
os.makedirs(os.path.join('..', 'data'), exist_ok=True)
data_file = os.path.join('..', 'data', 'house_tiny.csv')

with open(data_file, 'w') as f:
    # No spaces after the commas: pandas keeps them as part of the field, so a
    # header like 'NumRooms, Alley, Price' yields columns named ' Alley' and
    # ' Price' (and later ' Alley_Pave' / ' Alley_nan' after one-hot encoding).
    f.write('NumRooms,Alley,Price\n')  # column names
    f.write('NA,Pave,12750\n')  # each following line is one data sample
    f.write('2,NA,106000\n')
    f.write('4,NA,178100\n')
    f.write('NA,NA,140000\n')

In [21]:
# Load the CSV file at data_file into a DataFrame and print it.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms  Alley   Price
0       NaN   Pave   12750
1       2.0    NaN  106000
2       4.0    NaN  178100
3       NaN    NaN  140000


### 2.2.2 处理缺失值

In [22]:
# Split by column position with DataFrame.iloc: the first two columns become
# the inputs, everything from the third column onward becomes the outputs.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2:]
# Fill missing entries with the column mean; numeric_only=True restricts the
# mean to the numeric columns, so non-numeric columns are left untouched here.
inputs = inputs.fillna(value=inputs.mean(numeric_only=True))
print(inputs)
print(outputs)

   NumRooms  Alley
0       3.0   Pave
1       2.0    NaN
2       4.0    NaN
3       3.0    NaN
    Price
0   12750
1  106000
2  178100
3  140000


In [23]:
# One-hot encode the non-numeric columns; dummy_na=True adds a separate
# indicator column for missing values.
inputs = pd.get_dummies(data=inputs, dummy_na=True)
print(inputs)

   NumRooms   Alley_Pave   Alley_nan
0       3.0            1           0
1       2.0            0           1
2       4.0            0           1
3       3.0            0           1


### 2.2.3 转换成张量格式

In [25]:
# Convert the preprocessed frames to tensors. to_numpy(dtype=float) produces
# float64 arrays, so the target dtype is stated explicitly: torch.tensor with
# dtype=torch.float32 gives float32 tensors without relying on the legacy
# torch.Tensor constructor's implicit cast to the default dtype.
X = torch.tensor(inputs.to_numpy(dtype=float), dtype=torch.float32)
y = torch.tensor(outputs.to_numpy(dtype=float), dtype=torch.float32)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]]),
 tensor([[ 12750.],
         [106000.],
         [178100.],
         [140000.]]))