In [20]:
import torch
import numpy as np
import pandas as pd
import sklearn.preprocessing as preprocessing
print(torch.__version__)

1.3.1


### 读取数据并对数据进行预处理

In [30]:
def load_preprocess_data():
    """Load the Kaggle house-price CSV files and convert them to model-ready tensors.

    Pipeline:
      1. Read ``train.csv`` and ``test.csv``.
      2. Drop the first column of both frames (presumably the ``Id`` column --
         TODO confirm) and the last column of the training frame (the label
         column; the labels themselves are read from ``SalePrice``).
      3. Concatenate train and test features so both receive identical
         preprocessing and end up with the same columns.
      4. Standardize every non-object column, then replace missing values with 0
         (after standardization 0 is the column mean).
      5. One-hot encode the remaining columns; ``dummy_na=True`` adds an
         indicator column for missing values.
      6. Convert to ``float32`` tensors on the GPU when one is available,
         otherwise on the CPU.

    Returns:
        tuple: ``(train_features, train_labels, test_features)`` where
        ``train_labels`` has shape ``(num_train, 1)``.
    """
    train_data_path = "../data/kaggle_house/train.csv"
    test_data_path = "../data/kaggle_house/test.csv"
    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)

    # The training frame carries one extra trailing column (the label), hence 1:-1.
    train_part = train_data.iloc[:, 1:-1]
    test_part = test_data.iloc[:, 1:]
    all_features = pd.concat((train_part, test_part))

    # Non-object dtypes are treated as numeric features.
    numeric_cols = all_features.dtypes[all_features.dtypes != 'object'].index
    scaler = preprocessing.StandardScaler()
    # StandardScaler ignores NaN while fitting and keeps it in the output,
    # so missing values are filled only after scaling.
    all_features[numeric_cols] = scaler.fit_transform(all_features[numeric_cols])
    all_features[numeric_cols] = all_features[numeric_cols].fillna(0)

    all_features = pd.get_dummies(all_features, dummy_na=True)
    # DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
    all_features.info()
    print(train_data.SalePrice[:10])

    # Fall back to the CPU instead of crashing when no CUDA device is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_train = train_data.shape[0]
    # Cast explicitly: recent pandas versions emit bool dummy columns, which would
    # otherwise turn ``.values`` into an object array that torch cannot convert.
    feature_matrix = all_features.astype("float32").values

    train_features = torch.tensor(feature_matrix[:n_train], dtype=torch.float, device=device)
    print(train_features)
    train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float,
                                device=device).view(-1, 1)
    print(train_labels)
    test_features = torch.tensor(feature_matrix[n_train:], dtype=torch.float, device=device)
    print(test_features)
    return train_features, train_labels, test_features

In [31]:
# Run the preprocessing pipeline and keep the resulting tensors as notebook-level variables.
train_features, train_labels, test_features = load_preprocess_data()
# Show the first ten labels; train_labels is a column tensor of SalePrice values.
print(train_labels[:10])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Columns: 331 entries, MSSubClass to SaleCondition_nan
dtypes: float64(36), uint8(295)
memory usage: 1.6 MB
None
0    208500
1    181500
2    223500
3    140000
4    250000
5    143000
6    307000
7    200000
8    129900
9    118000
Name: SalePrice, dtype: int64
tensor([[ 0.0673, -0.1845, -0.2179,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8736,  0.4582, -0.0720,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.0673, -0.0559,  0.1372,  ...,  1.0000,  0.0000,  0.0000],
        ...,
        [ 0.3026, -0.1416, -0.1428,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8736, -0.0559, -0.0572,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8736,  0.2440, -0.0293,  ...,  1.0000,  0.0000,  0.0000]],
       device='cuda:0')
tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]], device='cuda:0')
tensor([[-0.8736,  0.4582,  0.1844,  ...,  1.0000,  0.0000,  

### 定义模型

In [32]:
class LinearModel(torch.nn.Module):
    """A single fully connected layer mapping ``num_input`` features to ``num_output`` outputs."""

    def __init__(self, num_input, num_output):
        super().__init__()
        # The affine layer is stored under the attribute name ``linear``.
        self.linear = torch.nn.Linear(in_features=num_input, out_features=num_output)

    def forward(self, x):
        """Apply the affine layer to ``x`` and return the result."""
        return self.linear(x)

### 定义损失函数

In [33]:
# Module-level mean-squared-error criterion (default settings) used by the training code below.
loss = torch.nn.MSELoss()

### 对数均方根误差

In [34]:
def log_rmse(pre_labels, labels):
    """Root-mean-squared error between the logarithms of predictions and labels.

    Predictions below 1.0 are clamped to 1.0 before taking the logarithm so the
    result stays finite for zero or negative predictions.

    The value is ``sqrt(mean((log(pred) - log(label)) ** 2))``. No extra factor
    of 2 is applied: mean-squared error is already the plain mean of squared
    differences, so doubling it would inflate the metric by ``sqrt(2)``.

    The computation is self-contained (it does not rely on a module-level
    criterion) and runs on whatever device the inputs live on.

    Args:
        pre_labels: Tensor of predicted values.
        labels: Tensor of strictly positive target values, same shape as
            ``pre_labels``.

    Returns:
        float: the log-RMSE as a Python number.
    """
    # Metric only -- no gradients are needed.
    with torch.no_grad():
        clipped = torch.clamp(pre_labels, min=1.0)
        mse = torch.nn.functional.mse_loss(clipped.log(), labels.log())
        return torch.sqrt(mse).item()

### 训练模型

In [47]:
def training_loop(net, nepochs, train_data, train_labels, test_data, test_labels,
                  batch_size=64, lr=5, weight_decay=1):
    """Train ``net`` with Adam on mini-batches and report log-RMSE on both splits.

    Uses the module-level ``loss`` criterion for optimisation and ``log_rmse``
    for reporting. Both metrics are computed once, after the last epoch, on the
    complete training set and the complete held-out set (not on the final
    mini-batch only).

    Args:
        net: Model to optimise; its parameters must be on the same device as the data.
        nepochs: Number of passes over the training data. With ``nepochs == 0``
            the metrics describe the untrained model.
        train_data: Training feature tensor.
        train_labels: Training target tensor.
        test_data: Held-out feature tensor.
        test_labels: Held-out target tensor.
        batch_size: Mini-batch size.
        lr: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.

    Returns:
        tuple: ``(train_ls, test_ls)`` -- final log-RMSE on the training and
        held-out data, as Python floats.
    """
    dataset = torch.utils.data.TensorDataset(train_data, train_labels)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True, pin_memory=False)
    net = net.float()
    optimizer = torch.optim.Adam(params=net.parameters(), lr=lr, weight_decay=weight_decay)
    for _ in range(nepochs):
        for X, y in train_iter:
            l = loss(net(X), y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
    # Only the final metrics are returned, so evaluate once after training and
    # without building an autograd graph.
    with torch.no_grad():
        train_ls = log_rmse(net(train_data), train_labels)
        test_ls = log_rmse(net(test_data), test_labels)
    # Reports the index of the last completed epoch (nepochs - 1).
    print('epoch: %d, train_ls: %2f, test_ls: %2f' % (nepochs - 1, train_ls, test_ls))
    return train_ls, test_ls

### 得到第i个K折交叉验证集数据

In [48]:
def get_k_fold_data(k, i, X, y):
    """Return the training and validation split for fold ``i`` of ``k``-fold cross-validation.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows
    each; the last fold additionally receives the remainder rows so that no
    sample is dropped when the row count is not divisible by ``k``.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the fold used for validation, ``0 <= i < k``.
        X: 2-D feature tensor (rows are samples).
        y: Label tensor whose first dimension matches ``X``.

    Returns:
        tuple: ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: if ``k <= 1``.
        ValueError: if ``i`` is outside ``[0, k)``.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError("fold index i must satisfy 0 <= i < k, got i=%d, k=%d" % (i, k))
    n_rows = X.shape[0]
    fold_size = n_rows // k
    X_train_parts, y_train_parts = [], []
    X_valid, y_valid = None, None
    for j in range(k):
        # The final fold runs to the end of the data to absorb the remainder.
        stop = (j + 1) * fold_size if j < k - 1 else n_rows
        idx = slice(j * fold_size, stop)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_train_parts.append(X_part)
            y_train_parts.append(y_part)
    # Concatenate once instead of growing the tensors inside the loop.
    X_train = torch.cat(X_train_parts, dim=0)
    y_train = torch.cat(y_train_parts, dim=0)
    return X_train, y_train, X_valid, y_valid

### K折交叉验证

In [49]:
def get_net(feature_num):
    """Create a one-output linear layer whose parameters are drawn from N(0, 0.01^2).

    Args:
        feature_num: Number of input features.

    Returns:
        torch.nn.Linear: layer with ``feature_num`` inputs and 1 output; both
        weight and bias are re-initialised from a normal distribution.
    """
    model = torch.nn.Linear(feature_num, 1)
    for _name, tensor in model.named_parameters():
        torch.nn.init.normal_(tensor, mean=0, std=0.01)
    return model

def k_fold(k, X, y, num_epochs=100, batch_size=64):
    """Run ``k``-fold cross-validation and return the averaged log-RMSE values.

    For every fold a fresh model is built with ``get_net``, placed on the same
    device as ``X``, trained with ``training_loop`` and evaluated on the
    held-out fold.

    Args:
        k: Number of folds.
        X: Feature tensor (rows are samples).
        y: Label tensor aligned with ``X``.
        num_epochs: Training epochs per fold.
        batch_size: Mini-batch size passed to ``training_loop``.

    Returns:
        tuple: ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds.
    """
    train_l_sum, valid_l_sum = 0.0, 0.0
    device = X.device
    for i in range(k):
        X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)

        net = get_net(X.shape[1])
        if device.type == "cuda":
            print("将模型迁移到cuda")
        # Keep the model on the data's device; this replaces a DataParallel
        # wrapper that was created and immediately discarded.
        net = net.to(device)
        # NOTE(review): this overrides the std=0.01 initialisation done inside
        # get_net with std=0.1. Kept to preserve the effective starting point --
        # confirm which scale is intended and drop the other.
        for param in net.parameters():
            torch.nn.init.normal_(param, mean=0, std=0.1)
        train_ls, valid_ls = training_loop(net, num_epochs,
                                           X_train, y_train, X_valid, y_valid,
                                           batch_size=batch_size)
        train_l_sum += train_ls
        valid_l_sum += valid_ls
    return train_l_sum / k, valid_l_sum / k

In [50]:
if __name__ == "__main__":
    # Re-run the preprocessing to obtain feature/label tensors for cross-validation.
    X, y, test_features = load_preprocess_data()
    print(y[:10])
    # 5-fold cross-validation; k_fold returns the metrics averaged over the folds.
    train_l, valid_l = k_fold(5, X, y)
    print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (5, train_l, valid_l))
    
    

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Columns: 331 entries, MSSubClass to SaleCondition_nan
dtypes: float64(36), uint8(295)
memory usage: 1.6 MB
None
0    208500
1    181500
2    223500
3    140000
4    250000
5    143000
6    307000
7    200000
8    129900
9    118000
Name: SalePrice, dtype: int64
tensor([[ 0.0673, -0.1845, -0.2179,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8736,  0.4582, -0.0720,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.0673, -0.0559,  0.1372,  ...,  1.0000,  0.0000,  0.0000],
        ...,
        [ 0.3026, -0.1416, -0.1428,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8736, -0.0559, -0.0572,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8736,  0.2440, -0.0293,  ...,  1.0000,  0.0000,  0.0000]],
       device='cuda:0')
tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]], device='cuda:0')
tensor([[-0.8736,  0.4582,  0.1844,  ...,  1.0000,  0.0000,  