In [None]:
import torch
from torch import nn

In [None]:
class Net(nn.Module):
    """Fully connected regression network mapping a feature vector to one scalar.

    The hidden widths are 1024, 1024, 1024, 1024, 512, 512, 256 and 64. Every
    hidden layer is followed by ``LeakyReLU``; a ``Dropout`` layer follows each
    activation except the one after the 64-unit layer. The output layer is a
    single linear unit with no activation.

    Args:
        n_features: width of the input vectors.
        dropout: probability passed to every ``nn.Dropout`` layer.
    """

    def __init__(self, n_features, dropout=.5):
        super().__init__()
        self.n_features = n_features

        # Hidden widths whose activation is followed by dropout.
        widths_with_dropout = (1024, 1024, 1024, 1024, 512, 512, 256)
        # Final hidden width; no dropout is applied after it.
        last_hidden = 64

        stack = []
        fan_in = n_features
        for width in widths_with_dropout:
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.LeakyReLU())
            stack.append(nn.Dropout(dropout))
            fan_in = width
        stack.append(nn.Linear(fan_in, last_hidden))
        stack.append(nn.LeakyReLU())
        stack.append(nn.Linear(last_hidden, 1))

        # Sequential registers modules in list order, so sub-module indices
        # (and the state_dict keys derived from them) run from 0 to 23.
        self.regressor = nn.Sequential(*stack)

    def forward(self, inputs):
        """Feed ``inputs`` through the regressor stack and return its output."""
        return self.regressor(inputs)

In [None]:
from torch.utils import data

class Dataset(data.Dataset):
    """Index-aligned wrapper around features ``x`` and optional targets ``y``.

    Without ``y`` an item is just ``x[idx]``; with ``y`` an item is the pair
    ``(x[idx], y[idx])``. The length is always ``len(x)``.
    """

    def __init__(self, x, y=None):
        self.x, self.y = x, y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sample = self.x[idx]
        if self.y is not None:
            return sample, self.y[idx]
        return sample

In [None]:
import torch.optim as optim
import torch.nn.functional as F

def training(model, train, valid, n_epoch, lr, device, model_dir, weight_decay=0):
    """Fit ``model`` with Adam on an MSE objective, checkpointing the best epoch.

    Each epoch runs one optimisation pass over ``train`` followed by a
    gradient-free pass over ``valid``. Whenever the validation loss is lower
    than the best seen so far (the first epoch always counts), the whole model
    object is written with ``torch.save`` to ``<model_dir>/ckpt.model``.

    Args:
        model: ``nn.Module`` to optimise. This function does not move it, so it
            should already be on ``device``.
        train: sized iterable yielding ``(x, y)`` batches used for optimisation.
        valid: sized iterable yielding ``(x, y)`` batches used for validation.
        n_epoch: number of passes over ``train``.
        lr: Adam learning rate.
        device: device every batch is moved to.
        model_dir: checkpoint directory; created if it does not exist.
        weight_decay: weight decay forwarded to Adam.

    Returns:
        ``(loss_train_history, loss_valid_history)``: two lists with one entry
        per epoch, each the mean of that epoch's per-batch losses.
    """
    # Function-scope import: the notebook only imports `os` in a later cell.
    import os

    # Create the checkpoint directory up front so a missing folder is handled
    # before any training time is spent, not at the first torch.save call.
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_path = "{}/ckpt.model".format(model_dir)

    loss_train_history, loss_valid_history = [], []
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))

    model.train()  # training mode so layers such as dropout are active
    criterion = nn.MSELoss()  # regression objective: mean squared error
    t_batch = len(train)
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay, amsgrad=True)

    # None means "no validation result yet", so the first epoch always saves.
    best_loss = None
    for epoch in range(n_epoch):
        # ---- training pass ----
        total_loss = 0
        for i, (x, y) in enumerate(train):
            # Inputs and targets are cast to float32 on the target device.
            x = x.to(device, dtype=torch.float)
            y = y.to(device, dtype=torch.float)
            optimizer.zero_grad()  # gradients accumulate across backward() calls
            outputs = model(x)
            loss = criterion(outputs, y)
            current_loss = loss.item()
            loss.backward()
            optimizer.step()
            total_loss += current_loss
            # '\r' rewrites the same console line for a compact progress display.
            print('[ Epoch{}: {}/{} ] loss:{:.3f} '.format(
                epoch+1, i+1, t_batch, current_loss), end='\r')
        loss_train_history.append(total_loss/t_batch)
        print('\nTrain | Loss:{:.5f}'.format(total_loss/t_batch))

        # ---- validation pass ----
        model.eval()  # inference mode: dropout is disabled
        with torch.no_grad():
            # No gradients are produced here, so the optimizer is not touched.
            total_loss = 0
            for x, y in valid:
                x = x.to(device, dtype=torch.float)
                y = y.to(device, dtype=torch.float)
                outputs = model(x)
                loss = criterion(outputs, y)
                total_loss += loss.item()

            loss_valid_history.append(total_loss/v_batch)
            print("Valid | Loss:{:.5f} ".format(total_loss/v_batch))
            if best_loss is None or total_loss < best_loss:
                # Best validation result so far: keep this model for prediction.
                # torch.save pickles the whole module object, not a state_dict.
                best_loss = total_loss
                torch.save(model, checkpoint_path)
                print('saving model with loss {:.3f}'.format(total_loss/v_batch))
        print('-----------------------------------------------')
        model.train()  # back to training mode after validation
    return loss_train_history, loss_valid_history

In [None]:
from HTML import config as Config
from HTML.dataset import *
from HTML.config import ratio, nominal, ordinal, meaningless
from HTML.preprocessing import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw training table from the path configured in HTML.config.
df_train = pd.read_csv(Config.train_path)
df_train.head()

In [None]:
# Shape before preprocessing, for comparison with the shape printed after it.
df_train.shape

In [None]:
# NOTE(review): this rebinds df_train to the output of the project-local
# preprocessing(); re-running the cell would preprocess the already processed
# frame again - confirm that is harmless.
df_train = preprocessing(df_train)

In [None]:
df_train.shape

In [None]:
# Regression target: the 'adr' column, restricted to rows with adr < 5000.
# The same row filter is applied to the feature matrix when x_adr is built.
y_adr = df_train[df_train['adr'] < 5000]['adr'].to_numpy()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder for the categorical columns. handle_unknown='ignore' makes
# categories unseen during fit encode as all zeros at transform time.
# NOTE(review): scikit-learn renamed `sparse` to `sparse_output` in 1.2 and
# removed the old keyword in 1.4 - confirm the installed version accepts it.
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
# Categorical inputs: every nominal/ordinal column except the excluded names,
# in sorted column order, cast to str before encoding.
categories = df_train[sorted(list(set(nominal+ordinal) - {'company', 'adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].astype(str).to_numpy()
# Fit the encoder on the training rows and encode them in one step.
cat = enc.fit_transform(categories)

In [None]:
cat.shape

In [None]:
# Numeric inputs: every remaining column (not nominal/ordinal, not excluded),
# again in sorted column order.
num = df_train[sorted(list(set(df_train.columns) - set(nominal+ordinal) - {'company','adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].to_numpy()

In [None]:
num.shape

In [None]:
# Feature matrix = one-hot block followed by the numeric block, column-wise.
tmp = np.concatenate((cat, num), axis=1)
# Keep only the rows with adr < 5000 so the features line up with y_adr.
x_adr = tmp[df_train['adr'] < 5000,:].copy()

In [None]:
# Sanity check: both arrays should report the same number of rows.
print('adr:', x_adr.shape, y_adr.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the split.
x_train_adr, x_valid_adr, y_train_adr, y_valid_adr = train_test_split(x_adr, y_adr, test_size=.2, random_state=1126)

In [None]:
# Disabled experiment: per-sample normalisation of the features.
# from sklearn.preprocessing import Normalizer

# transformer = Normalizer().fit(x_train_adr)
# x_train_adr = transformer.transform(x_train_adr)
# x_valid_adr = transformer.transform(x_valid_adr)

In [None]:
print('train:', x_train_adr.shape, y_train_adr.shape)
print('valid:', x_valid_adr.shape, y_valid_adr.shape)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

In [None]:
# Build float32 tensors: the model and the MSE loss work on floats, and an
# integer tensor type such as torch.LongTensor cannot hold the fractional part
# of the `adr` targets or of any non-integer feature column.
x_train = torch.as_tensor(x_train_adr, dtype=torch.float32)
# Targets are reshaped to (n_samples, 1) to match the model's single output unit.
y_train = torch.as_tensor(y_train_adr, dtype=torch.float32).view((-1,1))
x_valid = torch.as_tensor(x_valid_adr, dtype=torch.float32)
y_valid = torch.as_tensor(y_valid_adr, dtype=torch.float32).view((-1,1))

In [None]:
# Sanity check of the tensor shapes; targets were reshaped to (n_samples, 1).
print('train:', x_train.shape, y_train.shape)
print('valid:', x_valid.shape, y_valid.shape)

In [None]:
# Pair features with targets so each item is an (x, y) tuple.
train_dataset = Dataset(x_train, y_train)
valid_dataset = Dataset(x_valid, y_valid)

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 64

In [None]:
# Training batches are reshuffled on every pass; validation keeps row order.
# Both loaders fetch batches with 8 worker processes.
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                           batch_size = batch_size,
                                           shuffle = True,
                                           num_workers = 8)
valid_loader = torch.utils.data.DataLoader(dataset = valid_dataset,
                                           batch_size = batch_size,
                                           shuffle = False,
                                           num_workers = 8)

In [None]:
import random

# Seed Python, NumPy and PyTorch (CPU and every CUDA device) so weight
# initialisation and batch shuffling are repeatable.
seed = 1126
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Training hyper-parameters.
# NOTE(review): the results note further down records lr = .001 while this
# cell uses .0001 - confirm which run the recorded losses belong to.
n_epoch = 5
lr = .0001
# Directory training() writes the best checkpoint into (relative to the cwd).
model_dir = '../models'

# Input width is taken from the feature tensor; dropout=0 disables dropout.
model = Net(n_features=x_train.shape[1], dropout=0)
model = model.to(device)
loss_train_history, loss_valid_history = training(model, train_loader, valid_loader,
                                                  n_epoch, lr, device, model_dir)

In [None]:
# Learning curves: mean training and validation loss per epoch.
# plt.figure(figsize=(12,8))
# The x axis runs over epoch numbers 1..n_epoch (labelled 'Iter Time' below).
plt.plot(np.arange(1,n_epoch+1), loss_train_history)
plt.plot(np.arange(1,n_epoch+1), loss_valid_history)
plt.title('DNN - Loss')
plt.xlabel('Iter Time')
plt.ylabel('Loss')
# Legend order matches the order of the two plot calls above.
plt.legend(['train', 'validation'])
plt.show()

n_epoch = 5

lr = .001

Train | Loss:779.06220

Valid | Loss:669.05400

In [None]:
import os
model_dir = '../models'
# SECURITY NOTE: torch.load unpickles the file, and this checkpoint is a whole
# pickled model object (training() calls torch.save(model, ...)), so only load
# checkpoints produced by this notebook.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects whole-model pickles - confirm against the installed version.
# map_location places the restored model on the current `device`, so a
# checkpoint written on a GPU still loads on a CPU-only kernel.
model = torch.load(os.path.join(model_dir, 'ckpt.model'), map_location=device)

In [None]:
def testing(model, test_loader, device, test=True):
    model.eval()
    result = []
    with torch.no_grad():
        for i, item in enumerate(test_loader):
            if test:
                x = item
            else:
                (x, _) = item
            x = x.to(device, dtype=torch.float)
            result += model(x).float().tolist()
    return result

In [None]:
# Predict through a non-shuffled DataLoader. Iterating train_dataset directly
# would run the network on one sample at a time, and train_loader shuffles, so
# its batch order would not line up with y_train_adr.
eval_train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                                batch_size = batch_size,
                                                shuffle = False)
pred_train = testing(model, eval_train_loader, device, test=False)
# testing() yields one single-element list per row here; flatten to shape
# (n_samples,) so the array subtracts element-wise from y_train_adr.
pred_train = np.array(pred_train).ravel()

In [None]:
# Mean absolute error on the training split.
np.mean(np.abs(pred_train - y_train_adr))

In [None]:
# Mean squared error on the training split.
np.mean((pred_train - y_train_adr) ** 2)

In [None]:
# Predicted (x axis) versus actual (y axis) adr; low alpha shows point density.
plt.scatter(pred_train, y_train_adr, alpha=.1)
plt.show()

In [None]:
# Use the existing valid_loader (shuffle = False) so predictions are computed
# in batches and stay in the same row order as y_valid_adr; iterating
# valid_dataset directly would run the network on one sample at a time.
pred_valid = testing(model, valid_loader, device, test=False)
# testing() yields one single-element list per row here; flatten to shape
# (n_samples,) so the array subtracts element-wise from y_valid_adr.
pred_valid = np.array(pred_valid).ravel()

In [None]:
# Mean absolute error on the validation split.
np.mean(np.abs(pred_valid - y_valid_adr))

In [None]:
# Mean squared error on the validation split.
np.mean((pred_valid - y_valid_adr) ** 2)

In [None]:
# Predicted (x axis) versus actual (y axis) adr; low alpha shows point density.
plt.scatter(pred_valid, y_valid_adr, alpha=.1)
plt.show()

In [None]:
# Load the raw test table from the path configured in HTML.config.
# NOTE(review): df_train was passed through preprocessing() before feature
# extraction but df_test is not - confirm the test columns and values match
# what the encoder and the model were fitted on.
df_test = pd.read_csv(Config.test_path)
df_test.head()

In [None]:
df_test.shape

In [None]:
# Same categorical column selection as for training, cast to str.
categories = df_test[sorted(list(set(nominal+ordinal) - {'company', 'adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].astype(str).to_numpy()
# Reuse the encoder fitted on the training data (transform only, no refit).
cat = enc.transform(categories)

In [None]:
cat.shape

In [None]:
# Numeric columns are derived from df_test.columns, so the feature width only
# matches training if both frames expose the same non-excluded columns.
num = df_test[sorted(list(set(df_test.columns) - set(nominal+ordinal) - {'company', 'adr', 'reservation_status', 'reservation_status_date', 'is_canceled', 'ID', 'index'}))].to_numpy()

In [None]:
num.shape

In [None]:
# Feature matrix = one-hot block followed by the numeric block, column-wise.
tmp = np.concatenate((cat, num), axis=1)
x_test = tmp.copy()

In [None]:
# float32 keeps fractional feature values; an integer tensor type such as
# torch.LongTensor cannot hold them. as_tensor also accepts an existing tensor,
# so re-running this cell is harmless.
x_test = torch.as_tensor(x_test, dtype=torch.float32)

In [None]:
# No targets are passed, so each dataset item is just x[idx].
test_dataset = Dataset(x_test)

In [None]:
# shuffle = False keeps the batches in the row order of x_test.
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                          batch_size = 128,
                                          shuffle = False,
                                          num_workers = 8)

In [None]:
# test=True (the default) tells testing() that each batch is the input alone.
pred = testing(model, test_loader, device)
# Assuming the loaded model is the Net defined earlier (one output unit), the
# array is (n_rows, 1) before the transpose and (1, n_rows) after it.
pred = np.array(pred).T