# library import

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
torch.manual_seed(1015)
# define 'device' to upload tensor in gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 데이터 불러오기

In [2]:
train = pd.read_csv("./open_data/train.1.csv", encoding = 'utf-8')
train.head()

Unnamed: 0,date,prize,num_competition,submission,isHoliday,사용자,세션,신규방문자,페이지뷰
0,2018-09-09,850.0,1.0,71.0,1,281,266,73,1826
1,2018-09-10,850.0,1.0,74.0,0,264,247,51,2092
2,2018-09-11,850.0,1.5,94.0,0,329,310,58,1998
3,2018-09-12,850.0,1.5,93.0,0,300,287,45,2595
4,2018-09-13,850.0,1.5,94.0,0,378,344,50,3845


In [3]:
train.shape

(792, 9)

# 데이터 수정

In [4]:
# scaling
mini = train.iloc[:,1:].min()
size = train.iloc[:,1:].max() - train.iloc[:,1:].min()
train.iloc[:,1:] = (train.iloc[:,1:] -  mini) / size # 모든 수를 1 이하로 scaling
input_window = 49
output_window = 21

window_x = np.zeros((train.shape[0] - (input_window + output_window), input_window, 8))
window_y = np.zeros((train.shape[0] - (input_window + output_window), output_window, 8))

for start in range(train.shape[0] - (input_window + output_window)):
    end = start + input_window    
    window_x[start,:, :] = train.iloc[start : end                , 1: ].values
    window_y[start,:, :] = train.iloc[end   : end + output_window, 1: ].values
# [--------------\-------\--]
# 현재행 ~ 현재행 + 30일 을 x로 줄때 
# 현재행 + 30 ~ 현재행 + 37을 y로 본다 

# model 생성

In [5]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size = input_size,
                            hidden_size = hidden_size,
                            num_layers = 3,
                            batch_first = True,
                            dropout = 0.3,
                            bidirectional = True,)
        self.time_fc = nn.Linear(hidden_size*2, 8)
    
    def forward(self, x_time):
#         print(x_time.shape)
        out_time, _ = self.lstm(x_time)
#         print(out_time.shape)
#         print(out_time[:,-7:, :].shape)
        out_time = self.time_fc(out_time[:,-21:, :])
#         print(out_time.shape,'\n')
        return out_time.view(-1,21,8)
    
model = LSTM(input_size = 8, hidden_size = 60).to(device)

# 학습

In [6]:
window_x.shape

(722, 49, 8)

In [7]:
window_y.shape

(722, 21, 8)

In [8]:
window_x = torch.tensor(window_x).float().to(device)
window_y = torch.tensor(window_y).float().to(device)

# Train model
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)
criterion = nn.MSELoss(size_average = True)
num_epochs  = 800
train_error = []
for t in range(num_epochs):
    model.train()
    optimizer.zero_grad()
        
    train_pred = model(window_x)
#     print('x.shape: ', train_pred.shape)
#     print('y.shape: ', window_y.shape)
    loss = criterion(train_pred, window_y) ### trend
    train_error.append(loss)
    loss.backward()
    optimizer.step()
    if t % 10 == 0 and t !=0:
        print(f"{t} Epochs train MSE: {loss.item():1.5f}")
        #0.00038 lr=1e-3

10 Epochs train MSE: 0.03643
20 Epochs train MSE: 0.02677
30 Epochs train MSE: 0.01454
40 Epochs train MSE: 0.01310
50 Epochs train MSE: 0.01211
60 Epochs train MSE: 0.01131
70 Epochs train MSE: 0.01107
80 Epochs train MSE: 0.01010
90 Epochs train MSE: 0.01290
100 Epochs train MSE: 0.01072
110 Epochs train MSE: 0.00983
120 Epochs train MSE: 0.00897
130 Epochs train MSE: 0.00935
140 Epochs train MSE: 0.00838
150 Epochs train MSE: 0.00776
160 Epochs train MSE: 0.00718
170 Epochs train MSE: 0.00937
180 Epochs train MSE: 0.00852
190 Epochs train MSE: 0.00791
200 Epochs train MSE: 0.00724
210 Epochs train MSE: 0.00689
220 Epochs train MSE: 0.00647
230 Epochs train MSE: 0.00615
240 Epochs train MSE: 0.00565
250 Epochs train MSE: 0.01018
260 Epochs train MSE: 0.00756
270 Epochs train MSE: 0.00659
280 Epochs train MSE: 0.00580
290 Epochs train MSE: 0.00528
300 Epochs train MSE: 0.00486
310 Epochs train MSE: 0.00429
320 Epochs train MSE: 0.00378
330 Epochs train MSE: 0.00368
340 Epochs train MS

# 예측

In [9]:
submission = pd.read_csv("./open_data/submission.csv", encoding = 'euc-kr') 

submission.insert(1, 'd', 0)
submission.insert(1, 'c', 0)
submission.insert(1, 'b', 0)
submission.insert(1, 'a', 0)
submission.head()

Unnamed: 0,DateTime,a,b,c,d,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,0,0,0,0,0,0,0,0
1,2020-11-10,0,0,0,0,0,0,0,0
2,2020-11-11,0,0,0,0,0,0,0,0
3,2020-11-12,0,0,0,0,0,0,0,0
4,2020-11-13,0,0,0,0,0,0,0,0


In [None]:
size, mini

In [None]:
## input_window = 30
## output_window = 7

last_month = torch.tensor(window_x[-1,:,:][np.newaxis,...]).float().to(device) #[1, 30, 4]

for start in range((len(submission) - output_window)//7 + 2):
    print(last_month.shape)
    start = start * 7
    next_week = model(last_month)
    last_month = torch.cat((last_month[:,-23:], next_week), axis = 1)
    
    pred_week = next_week.cpu().detach().numpy().reshape(output_window,8)
    
    pred_week = pred_week * size.values + mini.values
    pred_week = pred_week.astype(int)

    if start/7 == (len(submission) - output_window)//7 + 1:
        submission.iloc[start :, 1:] = pred_week[-submission.iloc[start :, 1:].shape[0]:,:]
    else:
        submission.iloc[start : start + output_window, 1:] = pred_week
submission

# 예측파일 저장

In [11]:
del submission['a']
del submission['b']
del submission['c']
del submission['d']

In [12]:
submission.to_csv('submission.csv', index = False, encoding = 'euc-kr')

# 미래에 어떤 날이 휴일인지는 모름 -> 휴일데이터가 의미 있을까?

In [10]:
## input_window = 49
## output_window = 21

last_7week = torch.tensor(window_x[-1,:,:][np.newaxis,...]).float().to(device) # [1, 49, 4]

for start in range((len(submission) - output_window)//21 + 2):
#     print(last_7week.shape)
    start = start * 21
    next_3week = model(last_7week)
    last_7week = torch.cat([last_7week[:, -28:], next_3week], axis = 1)
    
    pred_3week = next_3week.cpu().detach().numpy().reshape(output_window,8)
    pred_3week = pred_3week * size.values + mini.values # 정규화 복원
    pred_3week = pred_3week.astype(int)
    
    if start/21 == (len(submission) - output_window)//21 + 1:
        submission.iloc[start :, 1:] = pred_3week[-submission.iloc[start :, 1:].shape[0]:,:]
    else:
        submission.iloc[start : start + output_window, 1:] = pred_3week
submission

Unnamed: 0,DateTime,a,b,c,d,사용자,세션,신규방문자,페이지뷰
0,2020-11-09,2212,3,183,0,1998,1978,601,40088
1,2020-11-10,3126,3,203,0,2676,2624,738,59318
2,2020-11-11,4541,4,219,0,3370,3394,1073,85239
3,2020-11-12,4701,4,190,0,2641,2605,714,63100
4,2020-11-13,4564,4,204,0,2761,2751,752,61921
...,...,...,...,...,...,...,...,...,...
56,2021-01-04,2494,3,225,0,2646,2619,652,54975
57,2021-01-05,2459,3,227,0,2366,2352,588,48710
58,2021-01-06,2350,3,199,0,2076,2050,503,42173
59,2021-01-07,2514,3,197,0,1983,1966,456,40759


In [None]:
## input_window = 60
## output_window = 30

last_2month = torch.tensor(window_x[-1,:,:][np.newaxis,...]).float().to(device) # [1, 60, 4]

for start in range((len(submission) - output_window)//30 + 2):
    start = start * 30
    next_month = model(last_2month)
    last_2month = torch.cat([last_2month[:, -30:, :], next_month], axis = 1)
    
    pred_month = next_month.cpu().detach().numpy().reshape(output_window,8)
    pred_month = pred_month * size.values + mini.values # 정규화 복원
    pred_month = pred_month.astype(int)
    
    if start/30 == (len(submission) - output_window)//30 + 1:
        submission.iloc[start :, 1:] = pred_month[-submission.iloc[start :, 1:].shape[0]:,:]
    else:
        submission.iloc[start : start + output_window, 1:] = pred_month
submission