In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch import nn

In [3]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [4]:
print(train)

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ... PoolArea PoolQC  Fe

In [5]:
print(test)

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0     1461          20       RH         80.0    11622   Pave   NaN      Reg   
1     1462          20       RL         81.0    14267   Pave   NaN      IR1   
2     1463          60       RL         74.0    13830   Pave   NaN      IR1   
3     1464          60       RL         78.0     9978   Pave   NaN      IR1   
4     1465         120       RL         43.0     5005   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1454  2915         160       RM         21.0     1936   Pave   NaN      Reg   
1455  2916         160       RM         21.0     1894   Pave   NaN      Reg   
1456  2917          20       RL        160.0    20000   Pave   NaN      Reg   
1457  2918          85       RL         62.0    10441   Pave   NaN      Reg   
1458  2919          60       RL         74.0     9627   Pave   NaN      Reg   

     LandContour Utilities  ... ScreenPorch PoolAre

In [6]:
# pd.concat是将数据框连接在一起，默认是将第二个数据框连在第一个的下方，即按行方向连接，如果想要按列方向连接，需要设置axis=1
features = pd.concat((train.drop(columns=['Id', 'SalePrice']), test.drop(columns=['Id'])))
# 这里用到了布尔索引，其实布尔索引就是一串布尔数组，True就取，False就不取，features.dtypes返回序列，所以利用不等式能创建布尔索引
numeric_features = features.dtypes[features.dtypes != 'object'].index
# apply是对列表中的每个元素都应用括号里的函数，比如a = [1, 2, 3]，a.apply(lambda x: x + 1)得到的是[2, 3, 4]
features[numeric_features] = features[numeric_features].apply(lambda x: (x - x.mean()) / x.std())
features[numeric_features] = features[numeric_features].fillna(0)
# 自动进行独热编码，对于nan值也要单独生成一列
features = pd.get_dummies(features, dummy_na=True)

train_features = features[:len(train)].copy()
test_features = features[len(train):].copy()

In [7]:
print(train_features)

      MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  \
0       0.067320    -0.184443 -0.217841     0.646073    -0.507197   1.046078   
1      -0.873466     0.458096 -0.072032    -0.063174     2.187904   0.154737   
2       0.067320    -0.055935  0.137173     0.646073    -0.507197   0.980053   
3       0.302516    -0.398622 -0.078371     0.646073    -0.507197  -1.859033   
4       0.067320     0.629439  0.518814     1.355319    -0.507197   0.947040   
...          ...          ...       ...          ...          ...        ...   
1455    0.067320    -0.312950 -0.285421    -0.063174    -0.507197   0.914028   
1456   -0.873466     0.672275  0.381246    -0.063174     0.391170   0.220763   
1457    0.302516    -0.141607 -0.142781     0.646073     3.086271  -1.000704   
1458   -0.873466    -0.055935 -0.057197    -0.772420     0.391170  -0.703591   
1459   -0.873466     0.243916 -0.029303    -0.772420     0.391170  -0.208401   

      YearRemodAdd  MasVnrArea  BsmtFin

In [8]:
print(test_features)

      MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  \
0      -0.873466     0.458096  0.184340    -0.772420     0.391170  -0.340452   
1      -0.873466     0.500932  0.519702    -0.063174     0.391170  -0.439490   
2       0.067320     0.201080  0.464294    -0.772420    -0.507197   0.848003   
3       0.067320     0.372424 -0.024105    -0.063174     0.391170   0.881015   
4       1.478499    -1.126832 -0.654636     1.355319    -0.507197   0.682939   
...          ...          ...       ...          ...          ...        ...   
1454    2.419286    -2.069222 -1.043758    -1.481667     1.289537  -0.043338   
1455    2.419286    -2.069222 -1.049083    -1.481667    -0.507197  -0.043338   
1456   -0.873466     3.884968  1.246594    -0.772420     1.289537  -0.373465   
1457    0.655311    -0.312950  0.034599    -0.772420    -0.507197   0.682939   
1458    0.067320     0.201080 -0.068608     0.646073    -0.507197   0.715952   

      YearRemodAdd  MasVnrArea  BsmtFin

In [28]:
label = pd.DataFrame(train['SalePrice'])
labeldata = torch.tensor(label.values.astype(float), dtype=torch.float32)
labelmean = labeldata.mean()
labelstd = labeldata.std()
labeldata = (labeldata -labelmean) / labelstd
print(labelmean, labelstd)

tensor(180921.1875) tensor(79442.5000)


In [10]:
device = 'cuda:1'
batch_size = 1
epoches = 100
k = 10

In [11]:
class PriceTrainSet(Dataset):
    def __init__(self, device, inputs:pd.DataFrame, targets:torch.Tensor):
        super(PriceTrainSet, self).__init__()
        # 转换而来的ndarry是object型，需要astype手动转为float
        self.x = torch.tensor(inputs.values.astype(float), dtype=torch.float32).to(device)
        self.y = targets.clone().to(device)
        # print(self.x.shape, self.y.shape)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)

In [31]:
class PriceTestSet(Dataset):
    def __init__(self, device, inputs:pd.DataFrame):
        super(PriceTestSet, self).__init__()
        self.x = torch.tensor(inputs.values.astype(float), dtype=torch.float32).to(device)

    def __getitem__(self, index):
        return self.x[index]
    
    def __len__(self):
        return len(self.x)

In [13]:
traindatas = []
trainlabel = []
blocksize = len(train_features) // k
for i in range(k):
    traindatas.append(train_features[i * blocksize:(i + 1) * blocksize])
    trainlabel.append(labeldata[i * blocksize:(i + 1) * blocksize])

In [32]:
testset = PriceTestSet(device, test_features)
testloader = DataLoader(testset, batch_size=batch_size)

In [15]:
class Model(nn.Module):
    def __init__(self, inputs, outputs, hiddens, layers, dropout=0.1):
        super(Model, self).__init__()
        self.net = nn.Sequential()
        self.net.add_module(
            'input', 
            nn.Sequential(
                nn.Linear(inputs, hiddens), 
                nn.ReLU(), 
                nn.Dropout(dropout)
            )
        )
        
        for _ in range(layers - 2):
            self.net.add_module(
                'layer' + str(_ + 1), 
                nn.Sequential(
                    nn.Linear(hiddens, hiddens), 
                    nn.ReLU(), 
                    nn.Dropout(dropout)
                )
            )
        
        self.net.add_module('output', nn.Linear(hiddens, outputs))
    
    def forward(self, x):
        return self.net(x)

In [16]:
class Log:
    def __init__(self, filename):
        self.file = open(filename, 'w')
    
    def log(self, text):
        self.file.write(text)
        self.file.write('\n')

In [17]:
class Trainer:
    def __init__(self, model, optimizer, criterion, log=None):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.log = log

    def train(self, num_epoch, trainloader, testloader, logargs=None):
        for i in range(num_epoch):
            loss = self.train_epoch(trainloader)
            print(f'train {i} epoch, loss: {loss}')
        if testloader is not None:
            testloss = self.test(testloader)
            print(f'test loss: {testloss}')
            if self.log is not None:
                self.log.log(logargs + ':' + str(testloss))

    def train_epoch(self, dataloader):
        total_loss = 0.0
        for x, y in dataloader:
            total_loss += self.train_step(x, y)
        return total_loss / len(dataloader)

    def train_step(self, inputs, targets):
        self.model.train()
        outputs = self.model(inputs)
        # print(outputs.shape, targets.shape)
        loss = self.criterion(outputs, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        #print(outputs, targets)
        return loss.item()
    
    def test(self, dataloader):
        total_loss = 0.0
        for x, y in dataloader:
            total_loss += self.test_step(x, y)
        return total_loss / len(dataloader)
    
    def test_step(self, inputs, targets):
        self.model.eval()
        outputs = self.model(inputs)
        loss = self.criterion(outputs, targets)
        return loss.item()

In [18]:
hidden_layers = [512, 1024, 2048]
lrs = [0.001]
layers = [3, 4, 5]

In [19]:
criterion = nn.MSELoss()
log = Log('log.txt')

In [20]:
inputs = 330
outputs = 1
i = 0
for hidden in hidden_layers:
        for layer in layers:
            for lr in lrs:
                print(f'training hidden_layers:{hidden}, layers:{layers}, lr:{lr}')
                model = Model(inputs, outputs, hidden, layer).to(device)
                optimizer = torch.optim.SGD(model.parameters(), lr)
                trainer = Trainer(model, optimizer, criterion, log)
                
                trainset = PriceTrainSet(
                     device, 
                     pd.concat([traindatas[j] for j in range(k) if j != i]), 
                     torch.concat([trainlabel[j] for j in range(k) if j != i])
                )
                valset = PriceTrainSet(device, traindatas[i], trainlabel[i])
                trainloader = DataLoader(trainset, batch_size=batch_size)
                valloader = DataLoader(valset, batch_size=batch_size)

                trainer.train(epoches, trainloader, valloader, f'hidden {hidden}, layer {layer}, lr {lr} ')
                i += 1

training hidden_layers:512, layers:[3, 4, 5], lr:0.001
train 0 epoch, loss: 0.43110285147711885
train 1 epoch, loss: 0.20739736834250275


KeyboardInterrupt: 

In [27]:
inputs = 330
outputs = 1
hidden = 512
model = Model(inputs, outputs, hidden, 3).to(device)
optimizer = torch.optim.SGD(model.parameters(), 0.001)
trainer = Trainer(model, optimizer, criterion)

trainset = PriceTrainSet(device, train_features, labeldata)
trainloader = DataLoader(trainset, batch_size=batch_size)

trainer.train(epoches, trainloader, None)

train 0 epoch, loss: 0.41652966200746516
train 1 epoch, loss: 0.18197963772997783
train 2 epoch, loss: 0.15816598282159552
train 3 epoch, loss: 0.13899329997684856
train 4 epoch, loss: 0.13052959575597803
train 5 epoch, loss: 0.11095228148575922
train 6 epoch, loss: 0.10040900867153454
train 7 epoch, loss: 0.09772185669629355
train 8 epoch, loss: 0.0819954177907814
train 9 epoch, loss: 0.08070270076151485
train 10 epoch, loss: 0.07446312772768537
train 11 epoch, loss: 0.07293954647976514
train 12 epoch, loss: 0.0701028553866953
train 13 epoch, loss: 0.06294925551713829
train 14 epoch, loss: 0.05090567452231701
train 15 epoch, loss: 0.056723945541290045
train 16 epoch, loss: 0.049949312264562576
train 17 epoch, loss: 0.04631707196461671
train 18 epoch, loss: 0.04718162522574634
train 19 epoch, loss: 0.04447514331666519
train 20 epoch, loss: 0.039323063952116585
train 21 epoch, loss: 0.04057228676375422
train 22 epoch, loss: 0.040443237532734495
train 23 epoch, loss: 0.037809606851508894

In [40]:
id = []
y = []
mean = labelmean.item()
std = labelstd.item()
for i in range(len(testset)):
    id.append(i + 1461)    
    y_hat = model(testset[i])
    y.append(y_hat.item() * std + mean)
df = pd.DataFrame(id, columns=['Id'])
df['SalePrice'] = y
df.to_csv('res.csv', index=False)