In [1]:
import random
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

# 데이터 셋을 전반적으로 다 보여주는 line
pd.options.display.max_rows = 2000

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
CFG = {
    'X_RANGE':3, # 예측하려는 시점을 X_RANGE 만큼의 이전 일자의 데이터로부터 예측 및 학습
    'EPOCHS':100,
    'LEARNING_RATE':1e-2,
    'BATCH_SIZE':128,
    'SEED':42
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [5]:
train_df = pd.read_csv('./train.csv')

In [6]:
train_df

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20180101,0.592,0.368,0.58,0.162
1,20180102,0.84,0.614,1.034,0.26
2,20180103,0.828,0.576,0.952,0.288
3,20180104,0.792,0.542,0.914,0.292
4,20180105,0.818,0.602,0.994,0.308
5,20180106,0.618,0.522,0.696,0.228
6,20180107,0.598,0.478,0.606,0.238
7,20180108,0.742,0.528,0.848,0.202
8,20180109,0.652,0.488,0.838,0.206
9,20180110,0.5,0.406,0.636,0.16


In [7]:
# val_df = pd.concat([train_df.iloc[730-CFG["X_RANGE"]:1097].reset_index().drop(columns='index'), train_df.iloc[1380:].reset_index().drop(columns='index')], axis = 0,
#                     ignore_index = True)# 20210101 ~ 20211231
# 위의 1096-CFG['X_RANGE']의 이유는 20210101을 예측하기 위해서는 이전 X_RANGE만큼의 일수가 필요하므로
# train_df = pd.concat([train_df.iloc[0:731].reset_index().drop(columns='index'), train_df.iloc[1096:].reset_index().drop(columns='index')], axis = 0,
#                     ignore_index = True)
val_df = train_df.iloc[1096-CFG["X_RANGE"]:].reset_index().drop(columns='index') # 20210101 ~ 20211231
# # 위의 1096-CFG['X_RANGE']의 이유는 20210101을 예측하기 위해서는 이전 X_RANGE만큼의 일수가 필요하므로
train_df = train_df.iloc[:1096].reset_index().drop(columns='index') # 20180101 ~ 20201231

In [8]:
def get_x_y_data(df, infer=False):
    # x_range : x일전까지의 데이터를 통해 예측
    x_data = []
    y_data = []
    for i in tqdm(range(CFG['X_RANGE'], len(df))):
        x_data.append(np.array(df.loc[i-CFG['X_RANGE']:i-1, ['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구']]).astype(float))
        y_data.append(np.array(df.loc[i, ['광진구', '동대문구', '성동구', '중랑구']]).astype(float))
    if infer:
        return x_data
    else:
        return x_data, y_data

In [9]:
def make_ymd_feature(df):
    df['일시'] = df['일시'].astype(str)
    df['년'] = df['일시'].str[:4].astype(int)/1000.
    df['월'] = df['일시'].str[4:6].astype(int)/12.
    df['일'] = df['일시'].str[6:8].astype(int)/31.
    df = df[['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구']]
    return df

In [10]:
train_df = make_ymd_feature(train_df)
val_df = make_ymd_feature(val_df)

In [11]:
train_x, train_y = get_x_y_data(train_df)

  0%|          | 0/1093 [00:00<?, ?it/s]

In [12]:
val_x, val_y = get_x_y_data(val_df)

  0%|          | 0/365 [00:00<?, ?it/s]

In [13]:
len(train_x)

1093

In [14]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        else:
            return torch.Tensor(self.X[index])

    def __len__(self):
        return len(self.X)

In [15]:
train_dataset = CustomDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_x, val_y)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [16]:
len(train_x[0][1])

7

In [17]:
# RNN parameters
layers = 3
input_size = 7
hidden_size = 128
len(train_x)

1093

In [18]:
# LSTM과 CNN모델을 결합한 ConvLSTM을 활용한다
# 이에 대해서는 추후에 구축해보기
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = layers
        # batch_size = batch_size, in_channels, out_channels
        # convolution망을 통과 시키도록 한다.
        # 구성을 왜 이렇게 했는지 다시 check할 것.
        # 우선 out_channels 수랑 lstm의 hidden_size와 맞춰준다
        # 왜 input_channel이 3인지
        self.conv = nn.Conv1d(in_channels = 3, out_channels = hidden_size, kernel_size = 1)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size = hidden_size, num_layers = layers, batch_first=True)
        # lstm의 output이 multioutput_reg에 들어가게 된다
        self.multioutput_reg = nn.Sequential(
            nn.Linear(hidden_size, 128, bias = True), 
            nn.BatchNorm1d(128),
            nn.Softplus(),
        )
        self.fc = torch.nn.Linear(128, 4)
        torch.nn.init.xavier_uniform(self.fc.weight)
    def forward(self, x):
        x = self.conv(x)
        h0 = torch.zeros(self.num_layers, x.size()[0], self.hidden_size).to(device) # lstm의 output
        c0 = torch.zeros(self.num_layers, x.size()[0], self.hidden_size).to(device) # lstm의 hidden_state information
        hidden, _ = self.lstm(x, (h0, c0)) 
        output = self.multioutput_reg(hidden[:,-1,:])
        output = self.fc(output)
        return output

In [19]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.L1Loss().to(device)
    
    best_loss = 9999999
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in iter(train_loader):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        _train_loss = np.mean(train_loss)
        
        val_mae = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{_train_loss:.5f}] Val MAE : [{val_mae:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_mae)
            
        if best_loss > val_mae:
            best_loss = val_mae
            best_model = model 
            print(epoch)
    return best_model

In [20]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    with torch.no_grad():
        for X, Y in iter(val_loader):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    
    _val_loss = np.mean(val_loss)
    return _val_loss    

In [21]:
model = Model()

In [22]:
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# factor: val_loss가 향상되지 않고 정체되어 있으면 learnng_rate를 factor배 감소
# patience : 4 -> 4 epoch동안 val_loss가 향상되지 않으면 학습률을 factor만큼 감소
# 학습률 개선을 동적으로 지원한다.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5,threshold_mode='abs',min_lr=1e-9, verbose=True)
best_model = train(model, optimizer, train_loader, val_loader, scheduler, device) # 2018.01.01 ~ 2020.12.31 까지 데이터로 학습

Epoch : [1] Train Loss : [2.41119] Val MAE : [2.27507]
1
Epoch : [2] Train Loss : [1.17900] Val MAE : [1.59403]
2
Epoch : [3] Train Loss : [0.98875] Val MAE : [1.65710]
Epoch : [4] Train Loss : [0.95399] Val MAE : [2.59758]
Epoch : [5] Train Loss : [0.93443] Val MAE : [1.90326]
Epoch : [6] Train Loss : [0.91103] Val MAE : [2.20304]
Epoch : [7] Train Loss : [0.90244] Val MAE : [1.52378]
7
Epoch : [8] Train Loss : [0.87853] Val MAE : [1.68285]
Epoch : [9] Train Loss : [0.86978] Val MAE : [1.67353]
Epoch : [10] Train Loss : [0.90793] Val MAE : [1.60612]
Epoch 00010: reducing learning rate of group 0 to 5.0000e-03.
Epoch : [11] Train Loss : [0.87182] Val MAE : [1.47539]
11
Epoch : [12] Train Loss : [0.88104] Val MAE : [1.52184]
Epoch : [13] Train Loss : [0.84487] Val MAE : [1.47111]
13
Epoch : [14] Train Loss : [0.85718] Val MAE : [1.60772]
Epoch : [15] Train Loss : [0.84110] Val MAE : [1.45327]
15
Epoch : [16] Train Loss : [0.84252] Val MAE : [1.53970]
Epoch 00016: reducing learning rate 

In [23]:
torch.save(best_model, "model.pth")

In [24]:
new_model = torch.load("model.pth")

In [25]:
model = new_model

In [26]:
test_df = pd.read_csv('./sample_submission.csv')
test_df = make_ymd_feature(test_df)
test_df = pd.concat([val_df[CFG['X_RANGE']*(-1):], test_df]).reset_index().drop(columns='index')

In [27]:
# test
def inference(model, df, device):
    model.to(device)
    model.eval()
    for i in tqdm(range(CFG['X_RANGE'], len(df))):
        X = torch.Tensor(np.array(df.loc[i-CFG['X_RANGE']:i-1, ['년', '월', '일', '광진구', '동대문구', '성동구', '중랑구']]).astype(float)).unsqueeze(0)
        X = X.to(device)
        with torch.no_grad():
            model_pred = model(X)[0]
        
        model_pred = model_pred.cpu().numpy()
        df.loc[i, ['광진구', '동대문구', '성동구', '중랑구']] = model_pred
    return df.loc[CFG['X_RANGE']:, ['광진구', '동대문구', '성동구', '중랑구']].reset_index().drop(columns=['index'])

In [28]:
preds = inference(best_model, test_df, device)

  0%|          | 0/334 [00:00<?, ?it/s]

In [29]:
submit = pd.read_csv('./sample_submission.csv')
submit['광진구'] = preds['광진구'].round(3)
submit['동대문구'] = preds['동대문구'].round(3)
submit['성동구'] = preds['성동구'].round(3)
submit['중랑구'] = preds['중랑구'].round(3)

In [30]:
submit.to_csv('./submit.csv', index=False)

In [31]:
preds

Unnamed: 0,광진구,동대문구,성동구,중랑구
0,3.60497,2.873492,3.119942,2.07307
1,3.896139,2.806161,3.333431,2.006437
2,3.860773,2.710603,3.317329,1.925887
3,3.882995,2.610957,3.308363,1.858145
4,3.926699,2.535614,3.329509,1.80453
5,3.948055,2.471756,3.342642,1.756416
6,3.966238,2.413902,3.35403,1.714295
7,3.987298,2.366025,3.369178,1.679923
8,4.009151,2.327461,3.386809,1.652247
9,4.032196,2.296435,3.406332,1.630231


In [32]:
submit

Unnamed: 0,일시,광진구,동대문구,성동구,중랑구
0,20220101,3.605,2.873,3.12,2.073
1,20220102,3.896,2.806,3.333,2.006
2,20220103,3.861,2.711,3.317,1.926
3,20220104,3.883,2.611,3.308,1.858
4,20220105,3.927,2.536,3.33,1.805
5,20220106,3.948,2.472,3.343,1.756
6,20220107,3.966,2.414,3.354,1.714
7,20220108,3.987,2.366,3.369,1.68
8,20220109,4.009,2.327,3.387,1.652
9,20220110,4.032,2.296,3.406,1.63


In [None]:
# 모든 행이 표시되도록 max_columns의 수를 지정
pd.options.display.max_rows = 334
submit
# 3.870 2.904 3.467 2.189
# mae값 1.39까지 떨어짐. ㅅ

In [None]:
check_res = preds

In [None]:
np.unique(check_res)

In [None]:
train_df.iloc[1080:1300]

In [None]:
torch.zeros(2,2)

In [None]:
X.size()

In [None]:
check_res

In [None]:
val_df.iloc[:]

In [45]:
avg_time = [5, 10, 15]
exp = 60*10
rev = [[] for _ in range(len(avg_time))]
for i in range(500, 2100, 100):
    for j in range(len(avg_time)):
        out = exp // avg_time[j]
        x = out * i // 1000 * 240 * 30
        rev[j].append(x)

In [46]:
res = pd.DataFrame(rev)

In [48]:
res = res.transpose()

In [49]:
res

Unnamed: 0,0,1,2
0,432000,216000,144000
1,518400,259200,172800
2,604800,302400,201600
3,691200,345600,230400
4,777600,388800,259200
5,864000,432000,288000
6,950400,475200,316800
7,1036800,518400,345600
8,1123200,561600,374400
9,1209600,604800,403200


In [50]:
res.columns = ["5초", "10초", "15초"]

In [55]:
res.index = [str(i) + "명"  for i in range(500, 2100, 100)]

In [56]:
res

Unnamed: 0,5초,10초,15초
500명,432000,216000,144000
600명,518400,259200,172800
700명,604800,302400,201600
800명,691200,345600,230400
900명,777600,388800,259200
1000명,864000,432000,288000
1100명,950400,475200,316800
1200명,1036800,518400,345600
1300명,1123200,561600,374400
1400명,1209600,604800,403200
