In [1]:
import numpy as np
import torch
from sklearn import ensemble, metrics
import pandas as pd
import csv
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

In [2]:
def GetDateLabel(ls):
    """Build a '/'-separated key from exactly three date components.

    Callers in this file pass (year, month, day). Every component is
    converted with ``str`` before joining, so ``[2015, 7, 1]`` becomes
    ``'2015/7/1'``.

    Raises:
        AssertionError: if ``ls`` does not contain exactly three items.
    """
    assert len(ls) == 3
    return '/'.join(str(part) for part in ls)

def GetDateLabelCvt(ip):
    """Convert a '-'-separated date string into a GetDateLabel key.

    Each '-'-delimited field is parsed with ``int`` (so zero padding such as
    '07' collapses to 7) and the resulting numbers are passed to
    ``GetDateLabel``, which requires exactly three of them.
    """
    return GetDateLabel(list(map(int, ip.split('-'))))
    
class HotelBooking(Dataset):
    """Map-style dataset that yields one sample per distinct arrival date.

    Rows of the booking CSV are grouped by arrival date, and the dates are
    ordered by first appearance in the file. Sample ``idx`` contains the rows
    of up to three dates that precede date ``idx`` in that ordering, paired
    with the label of date ``idx``. The first date has no predecessors, so it
    uses its own rows instead.

    NOTE(review): apart from idx == 0, the rows of the target date itself are
    never part of its sample -- confirm this is the intended window.
    NOTE(review): the date order is the CSV's row order, presumably
    chronological -- verify against the preprocessing step.
    """

    def __init__(self, fn, fn_label):
        """Load the booking rows and the per-date labels.

        Args:
            fn: path of the booking CSV; it must contain the columns
                'arrival_date_year', 'arrival_date_month' and
                'arrival_date_day_of_month'.
            fn_label: path of the label CSV with the columns 'arrival_date'
                ('-'-separated fields) and 'label'.
        """
        data = pd.read_csv(fn)
        data_list = ['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']

        # Walk the three date columns once and collect the positional row
        # numbers of every date; dict insertion order keeps the dates in
        # order of first appearance.
        data_dict = {}
        date_columns = [data[key].to_numpy() for key in data_list]
        for idx, parts in enumerate(zip(*date_columns)):
            data_dict.setdefault(GetDateLabel(parts), []).append(idx)

        data_label = pd.read_csv(fn_label)
        dates = data_label['arrival_date'].to_numpy()
        labels = data_label['label'].to_numpy()
        # Label dates are re-keyed into the same 'y/m/d' format as data_dict.
        self.label_dict = {GetDateLabelCvt(date): label for date, label in zip(dates, labels)}

        self.data = data
        self.data_dict = data_dict
        self.date_list = list(data_dict)
        # Kept for backward compatibility: position -> date key.
        self.get_datelist_idx = dict(enumerate(self.date_list))
        self.len = len(self.date_list)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the date at position ``idx``.

        ``features`` stacks one row per booking of the (up to three)
        preceding dates, or of the date itself when ``idx`` is 0. The first
        column of every row is dropped (presumably an index/ID column --
        TODO confirm). ``label`` is the entry of the label file for the date
        at ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is out of range. Raising IndexError (not
                KeyError) lets plain ``for sample in dataset`` terminate.
            KeyError: if the date at ``idx`` has no entry in the label file.
        """
        if idx < 0:
            idx += self.len
        if not 0 <= idx < self.len:
            raise IndexError('HotelBooking index out of range')

        # The first date has no history, so it falls back to its own rows.
        window = range(max(0, idx - 3), idx) if idx > 0 else range(1)
        index_list = []
        for i in window:
            index_list += self.data_dict[self.date_list[i]]

        # Rows are converted one by one so torch's dtype inference sees the
        # same per-row values it always has.
        tensor_list = [
            torch.tensor([list(self.data.iloc[index])[1:]])
            for index in index_list
        ]
        return torch.cat(tensor_list, 0), self.label_dict[self.date_list[idx]]

    def __len__(self):
        """Return the number of distinct arrival dates."""
        return self.len

In [3]:
class net(nn.Module):
    """Sequence classifier: a stacked LSTM followed by a three-layer MLP head.

    The input is a batch-first sequence ``(batch, seq_len, input_size)``.
    Only the LSTM output of the last time step is fed to the head, which
    returns one unnormalised score per class.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden width (also the head's input width).
        num_layers: number of stacked LSTM layers.
        num_classes: number of output scores.
        fc_sizes: widths of the two hidden layers of the head.
        dropout: dropout probability used after each hidden linear layer.

    The defaults reproduce the originally hard-coded architecture, and the
    sub-module names and order are unchanged, so existing ``state_dict``
    checkpoints of the default model still load.
    """

    def __init__(self, input_size=245, hidden_size=1000, num_layers=10,
                 num_classes=10, fc_sizes=(500, 50), dropout=0.5):
        super(net, self).__init__()
        self.lstm = nn.LSTM(
            batch_first=True,
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers
        )
        fc1, fc2 = fc_sizes
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, fc1),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(fc1, fc2),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(fc2, num_classes)
        )

    def forward(self, data):
        """Return class scores of shape ``(batch, num_classes)``.

        ``data`` is cast to float32 first, so integer or double inputs are
        accepted.
        """
        data, _ = self.lstm(data.float())
        # Keep only the output at the final time step of every sequence.
        data = data[:, -1]
        return self.fc(data)

In [4]:
def test():
    """Return the accuracy of the global ``model`` on the global ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to whatever device the model's parameters live on, so
    the function works for CPU and CUDA models alike.

    Note: in this notebook ``loader`` wraps the training set, so the value
    returned is training accuracy, not held-out accuracy.

    Returns:
        float: correctly classified samples divided by
        ``len(loader.dataset)``; 0.0 when the dataset is empty.
    """
    model.eval()
    device = next(model.parameters()).device
    n_batches = len(loader)
    report_every = max(1, n_batches // 10)
    # Plain int accumulator: stays valid even if the loader yields nothing.
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, label) in enumerate(loader):
            data, label = data.to(device), label.to(device).long()
            output = model(data)
            _, pred = torch.max(output, 1)
            correct += (pred == label).sum().item()
            if batch_idx % report_every == 0:
                print('Test progress: {}/{}'.format(batch_idx+1, n_batches))
    n_samples = len(loader.dataset)
    return correct / n_samples if n_samples else 0.0

def train(n_epoch = 100):
    """Train the global ``model`` and checkpoint the best epoch.

    Every epoch runs one pass over the global ``loader`` with cross-entropy
    loss and the global ``optimizer``, then calls ``test()``. Whenever the
    accuracy improves, a checkpoint is written to 'models/best.pth'.

    The checkpoint is a dict with the keys 'epoch', 'accuracy', 'model'
    (``model.state_dict()``) and 'optimizer' (``optimizer.state_dict()``).
    It replaces the earlier ``th.saveModel(...)`` call, which referenced a
    name that is never imported and passed the class ``net`` instead of the
    trained instance.

    Args:
        n_epoch: number of passes over ``loader``.
    """
    import os  # local import so the block does not depend on the import cell

    checkpoint_path = 'models/best.pth'
    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device
    n_batches = len(loader)
    report_every = max(1, n_batches // 20)

    best_acc = 0
    for epoch in range(n_epoch):
        model.train()
        for batch_idx, (data, label) in enumerate(loader):
            model.zero_grad()
            data, label = data.to(device), label.to(device).long()
            output = model(data)
            loss = F.cross_entropy(output, label)
            loss.backward()
            optimizer.step()
            if batch_idx % report_every == 0:
                print('Epoch: {}/{}\tProgress: {:.2f}% ({}/{})\tLoss: {:.4f}'.format(epoch+1, n_epoch,
                      (batch_idx+1)*100./n_batches, batch_idx+1, n_batches, loss.item()))

        acc = test()
        print('*****Accuracy: {:.4f}'.format(acc),'\n')
        if acc > best_acc:
            best_acc = acc
            # torch.save does not create missing directories.
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            torch.save({
                'epoch': epoch + 1,
                'accuracy': acc,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, checkpoint_path)

In [5]:
# Experiment configuration: input files and Adam hyper-parameters.
TRAIN_CSV = 'data/preprocessed/train_processed.csv'
LABEL_CSV = 'data/label.csv'
LEARNING_RATE = 0.0001
ADAM_BETAS = (0.5, 0.999)

# Globals consumed by train() and test().
model = net().cuda()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, betas=ADAM_BETAS)
train_set = HotelBooking(TRAIN_CSV, LABEL_CSV)
# One sample per batch: each sample stacks a data-dependent number of rows.
loader = DataLoader(train_set, batch_size=1, shuffle=True, num_workers=0)

In [None]:
# Start training with train()'s default number of epochs; progress lines
# are printed as the run proceeds.
train()

Epoch: 1/100	Progress: 0.16% (1/640)	Loss: 2.2599
Epoch: 1/100	Progress: 5.16% (33/640)	Loss: 2.2329
Epoch: 1/100	Progress: 10.16% (65/640)	Loss: 1.9877
Epoch: 1/100	Progress: 15.16% (97/640)	Loss: 2.8252
Epoch: 1/100	Progress: 20.16% (129/640)	Loss: 2.0381
Epoch: 1/100	Progress: 25.16% (161/640)	Loss: 2.4029
