In [2]:
import torch
import pandas as pd
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import numpy as np
from torch.utils import data
import wandb

class MLP(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.layer1 = nn.Linear(in_features,256)
        self.layer2 = nn.Linear(256,64)
        self.out = nn.Linear(64,1)

    def forward(self, X):
        X = F.relu(self.layer1(X))
        X = F.relu(self.layer2(X))
        return self.out(X)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
criterion = nn.MSELoss()
def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Construct a PyTorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

def log_rmse(net, features, labels):
    # 为了在取对数时进一步稳定该值，将小于1的值设置为1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(criterion(torch.log(clipped_preds),
                           torch.log(labels)))
    return rmse.item()

In [4]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
print("train_data and test_data shape",train_data.shape,test_data.shape)

train_data and test_data shape (47439, 41) (31626, 40)


In [5]:
redundant_cols = ['Address', 'Summary', 'City', 'State']
for c in redundant_cols:
    del test_data[c], train_data[c]

# 数据预处理
large_vel_cols = ['Lot', 'Total interior livable area', 'Tax assessed value', 'Annual tax amount', 'Listed Price', 'Last Sold Price']
for c in large_vel_cols:
    train_data[c] = np.log(train_data[c]+1)
    test_data[c] = np.log(test_data[c]+1)

In [6]:
# 删除部分特征(ID,Address,summary)
all_features = pd.concat((train_data, test_data))

In [7]:
# 查询数字列 ->缺失数据赋0 -> 归一化
numeric_features = all_features.dtypes[all_features.dtypes == 'float64'].index
all_features = all_features.fillna(method='bfill', axis=0).fillna(0)
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))

  all_features = all_features.fillna(method='bfill', axis=0).fillna(0)


In [8]:
for in_object in all_features.dtypes[all_features.dtypes=='object'].index:
    print(in_object.ljust(20),len(all_features[in_object].unique()))

Type                 174
Heating              2658
Cooling              909
Parking              9911
Bedrooms             277
Region               1258
Elementary School    3567
Middle School        809
High School          921
Flooring             1738
Heating features     1761
Cooling features     594
Appliances included  11289
Laundry features     3029
Parking features     9693
Listed On            2815
Last Sold On         6948


In [9]:
features = list(numeric_features)
# 加上类别数相对较少的Type
features.extend(['Type','Bedrooms'])
all_features = all_features[features]

In [10]:
print('before one hot code',all_features.shape)
all_features = pd.get_dummies(all_features,dummy_na=True)
print('after one hot code',all_features.shape)

non_float_cols = all_features.dtypes[all_features.dtypes!='float'].index
non_bool_cols = all_features.dtypes[all_features.dtypes!='bool'].index
non_complex_cols = all_features.dtypes[all_features.dtypes!='complex'].index
non_int_cols = all_features.dtypes[all_features.dtypes!='int'].index
non_numeric_cols = non_float_cols.intersection(non_bool_cols).intersection(non_complex_cols).intersection(non_int_cols)
print(non_numeric_cols)

before one hot code (79065, 20)
after one hot code (79065, 471)
Index([], dtype='object')


In [11]:
all_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79065 entries, 0 to 31625
Columns: 471 entries, Sold Price to Bedrooms_nan
dtypes: bool(453), float64(18)
memory usage: 45.6 MB


In [12]:
# 将 'bool' 列转换为 'int' 类型
all_features = all_features.astype({col: 'int' for col in all_features.columns if all_features[col].dtype == 'bool'})

n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
print('train feature shape:', train_features.shape)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
print('test feature shape:', test_features.shape)
train_labels = torch.tensor(train_data['Sold Price'].values.reshape(-1, 1), dtype=torch.float)
print('train label shape:', train_labels.shape)

train feature shape: torch.Size([47439, 471])
test feature shape: torch.Size([31626, 471])
train label shape: torch.Size([47439, 1])


In [13]:
# print(all_features.dtypes)

In [14]:
in_features = train_features.shape[1]
net = MLP(in_features).to(device)

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    wandb.watch(net)
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)
    # 这里使用的是Adam优化算法
    optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate, weight_decay = weight_decay)
    for epoch in tqdm(range(num_epochs)):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = net(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
        record_loss = log_rmse(net.to('cpu'), train_features, train_labels)
        wandb.log({'loss': record_loss,'epoch': epoch})
        train_ls.append(record_loss)
        if (epoch%NUM_SAVE==0 and epoch!=0) or (epoch==num_epochs-1):
            torch.save(net.state_dict(),'checkpoint_'+str(epoch))
            print('save checkpoints on:', epoch, 'rmse loss value is:', record_loss)
        del X, y
        net.to(device)
    wandb.finish()
    return train_ls, test_ls

# 初始化wandb 进行记录
num_epochs, lr, weight_decay, batch_size = 500, 0.005, 0.05, 256
wandb.init(project="kaggle_predict",
           config={ "learning_rate": lr,
                    "weight_decay": weight_decay,
                    "batch_size": batch_size,
                    "total_run": num_epochs,
                    "network": net}
          )
print("network:",net)

[34m[1mwandb[0m: Currently logged in as: [33mhangyudai[0m. Use [1m`wandb login --relogin`[0m to force relogin


network: MLP(
  (layer1): Linear(in_features=471, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


In [15]:
NUM_SAVE = 50  # 每隔 10 个 epoch 保存一次模型的检查点

train_ls, valid_ls = train(net, train_features,train_labels,None,None, num_epochs, lr, weight_decay, batch_size)

# 使用现有训练好的net
net.to(device)
# 将网络应用于测试集。
preds = net(test_features).detach().numpy()

# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

 10%|██████████▊                                                                                               | 51/500 [01:07<09:43,  1.30s/it]

save checkpoints on: 50 rmse loss value is: 0.017087794840335846


 20%|█████████████████████▏                                                                                   | 101/500 [02:15<09:07,  1.37s/it]

save checkpoints on: 100 rmse loss value is: 0.006886149290949106


 30%|███████████████████████████████▋                                                                         | 151/500 [03:22<07:34,  1.30s/it]

save checkpoints on: 150 rmse loss value is: 0.007267992477864027


 40%|██████████████████████████████████████████▏                                                              | 201/500 [04:28<06:37,  1.33s/it]

save checkpoints on: 200 rmse loss value is: 0.005585211329162121


 50%|████████████████████████████████████████████████████▋                                                    | 251/500 [05:36<05:36,  1.35s/it]

save checkpoints on: 250 rmse loss value is: 0.005913734436035156


 60%|███████████████████████████████████████████████████████████████▏                                         | 301/500 [06:42<04:21,  1.31s/it]

save checkpoints on: 300 rmse loss value is: 0.0027881860733032227


 70%|█████████████████████████████████████████████████████████████████████████▋                               | 351/500 [07:50<03:19,  1.34s/it]

save checkpoints on: 350 rmse loss value is: 0.0037927052471786737


 80%|████████████████████████████████████████████████████████████████████████████████████▏                    | 401/500 [08:56<02:12,  1.34s/it]

save checkpoints on: 400 rmse loss value is: 0.002963532693684101


 90%|██████████████████████████████████████████████████████████████████████████████████████████████▋          | 451/500 [10:03<01:04,  1.32s/it]

save checkpoints on: 450 rmse loss value is: 0.004835514351725578


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [11:08<00:00,  1.34s/it]

save checkpoints on: 499 rmse loss value is: 0.005292905494570732





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,499.0
loss,0.00529
