In [1]:
# 라이브러리 임포트
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch import autograd
from torch.utils import data
from torch.optim import Adam
from torchvision import datasets, transforms
import torchvision.models as models

In [2]:
# 난수 생성기가 항상 일정한 값을 출력하게 하기 위해 seed 고정
random_seed = 2021
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
np.random.seed(random_seed)

## EDA

In [3]:
DATASET_PATH = os.path.join('/USER/team_project2/data')

In [4]:
df = pd.read_csv(os.path.join(DATASET_PATH, 'train.csv'))
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3279 entries, 0 to 3278
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   날짜      3279 non-null   int64
 1   시간      3279 non-null   int64
 2   10      3279 non-null   int64
 3   100     3279 non-null   int64
 4   101     3279 non-null   int64
 5   120     3279 non-null   int64
 6   121     3279 non-null   int64
 7   140     3279 non-null   int64
 8   150     3279 non-null   int64
 9   160     3279 non-null   int64
 10  200     3279 non-null   int64
 11  201     3279 non-null   int64
 12  251     3279 non-null   int64
 13  270     3279 non-null   int64
 14  300     3279 non-null   int64
 15  301     3279 non-null   int64
 16  351     3279 non-null   int64
 17  352     3279 non-null   int64
 18  370     3279 non-null   int64
 19  400     3279 non-null   int64
 20  450     3279 non-null   int64
 21  500     3279 non-null   int64
 22  550     3279 non-null   int64
 23  600     3279 

날짜      0
시간      0
10      0
100     0
101     0
120     0
121     0
140     0
150     0
160     0
200     0
201     0
251     0
270     0
300     0
301     0
351     0
352     0
370     0
400     0
450     0
500     0
550     0
600     0
650     0
652     0
1000    0
1020    0
1040    0
1100    0
1200    0
1510    0
2510    0
3000    0
4510    0
5510    0
6000    0
dtype: int64

In [5]:
df['날짜'] = df['날짜'].astype('str')
df['날짜']=pd.to_datetime(df['날짜'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3279 entries, 0 to 3278
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      3279 non-null   datetime64[ns]
 1   시간      3279 non-null   int64         
 2   10      3279 non-null   int64         
 3   100     3279 non-null   int64         
 4   101     3279 non-null   int64         
 5   120     3279 non-null   int64         
 6   121     3279 non-null   int64         
 7   140     3279 non-null   int64         
 8   150     3279 non-null   int64         
 9   160     3279 non-null   int64         
 10  200     3279 non-null   int64         
 11  201     3279 non-null   int64         
 12  251     3279 non-null   int64         
 13  270     3279 non-null   int64         
 14  300     3279 non-null   int64         
 15  301     3279 non-null   int64         
 16  351     3279 non-null   int64         
 17  352     3279 non-null   int64         
 18  370     

## 1월 25일 EDA 

In [6]:
df[df["날짜"]=='2020-01-25']

Unnamed: 0,날짜,시간,10,100,101,120,121,140,150,160,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
576,2020-01-25,0,78918,15252,2015,3481,1559,1271,41370,1096,...,946,2348,6568,4042,2280,2949,435,2384,1378,2419
577,2020-01-25,1,56988,9845,1303,2563,937,918,28828,727,...,566,1505,3786,2490,1900,1880,246,1492,723,1322
578,2020-01-25,2,45033,6968,1118,1832,628,560,23713,482,...,344,1009,2901,1857,1499,1562,152,1014,519,1131
579,2020-01-25,3,48757,6851,978,1225,586,547,24618,485,...,365,1069,2922,1496,1480,1589,174,940,566,1104
580,2020-01-25,4,82117,11056,1321,1965,605,620,39464,924,...,518,1493,3880,1809,1971,2207,280,1225,727,1850
581,2020-01-25,5,163837,32668,2194,4148,1289,1265,82869,2475,...,1440,4630,7980,3233,3632,5512,829,2959,2469,5354
582,2020-01-25,6,290499,80941,4092,11037,3533,2830,124766,6671,...,7728,12317,17637,8931,5568,12662,2683,10154,7640,15221
583,2020-01-25,7,392142,115097,6873,23348,5609,4158,163012,10105,...,13001,20546,35230,17090,7936,20526,4952,22753,12833,24072
584,2020-01-25,8,407343,109184,10162,29993,8688,5427,174557,8760,...,13387,17841,39060,18696,10172,22841,5278,26899,11739,22605
585,2020-01-25,9,436012,126653,18920,35787,14836,9364,228456,8198,...,12104,16513,35720,17151,14885,30707,6390,20802,10447,21886


In [9]:
mean_pd = []
for i in range(24):
    zero = df[df['시간']==int(i)]
    mean = zero.mean().astype('int')
    mean_pd.append(mean)
mean_df = pd.DataFrame(mean_pd)
mean_df # train data 도로별 시간대별 평균 dataframe

  mean = zero.mean().astype('int')


Unnamed: 0,시간,10,100,101,120,121,140,150,160,200,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
0,0,80173,14637,1191,3398,1061,940,29371,986,2293,...,788,2785,7360,5138,1285,3703,487,2171,1538,2276
1,1,56958,9971,832,2504,693,696,20529,675,1752,...,506,1902,4772,3432,1048,2752,344,1497,1077,1644
2,2,45276,8235,720,2136,496,581,15929,515,1504,...,375,1490,3715,2521,879,2331,284,1235,902,1384
3,3,44243,9517,754,2271,479,590,15785,523,1558,...,364,1545,3823,2271,871,2392,279,1407,912,1631
4,4,60451,15585,1103,3506,702,778,23621,811,1994,...,547,2368,6315,3254,1102,3204,433,2276,1450,2682
5,5,113564,28735,2178,5852,1787,1353,50237,1635,3422,...,1300,5398,16769,8857,1622,5214,806,4621,3200,5715
6,6,198701,52821,4494,9972,4753,2598,81141,3147,6295,...,3642,11239,30466,15553,2458,8905,1696,11140,6775,11762
7,7,266732,80249,7204,15757,8476,4533,103681,4907,10043,...,7535,15056,38800,19477,3433,15621,3510,19508,10979,18907
8,8,286127,84341,8747,20428,9633,5673,115630,5710,11888,...,7345,14841,37783,19289,4463,18831,4287,19047,11475,19187
9,9,297369,90012,9778,23424,9208,6155,130898,6020,14400,...,7024,14773,36863,18192,5875,19282,3279,16985,11164,19415


In [10]:
mean_df.insert(0, '날짜', '2020-01-25')
mean_df['날짜']=pd.to_datetime(mean_df['날짜'])

In [11]:
mean_df

Unnamed: 0,날짜,시간,10,100,101,120,121,140,150,160,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
0,2020-01-25,0,80173,14637,1191,3398,1061,940,29371,986,...,788,2785,7360,5138,1285,3703,487,2171,1538,2276
1,2020-01-25,1,56958,9971,832,2504,693,696,20529,675,...,506,1902,4772,3432,1048,2752,344,1497,1077,1644
2,2020-01-25,2,45276,8235,720,2136,496,581,15929,515,...,375,1490,3715,2521,879,2331,284,1235,902,1384
3,2020-01-25,3,44243,9517,754,2271,479,590,15785,523,...,364,1545,3823,2271,871,2392,279,1407,912,1631
4,2020-01-25,4,60451,15585,1103,3506,702,778,23621,811,...,547,2368,6315,3254,1102,3204,433,2276,1450,2682
5,2020-01-25,5,113564,28735,2178,5852,1787,1353,50237,1635,...,1300,5398,16769,8857,1622,5214,806,4621,3200,5715
6,2020-01-25,6,198701,52821,4494,9972,4753,2598,81141,3147,...,3642,11239,30466,15553,2458,8905,1696,11140,6775,11762
7,2020-01-25,7,266732,80249,7204,15757,8476,4533,103681,4907,...,7535,15056,38800,19477,3433,15621,3510,19508,10979,18907
8,2020-01-25,8,286127,84341,8747,20428,9633,5673,115630,5710,...,7345,14841,37783,19289,4463,18831,4287,19047,11475,19187
9,2020-01-25,9,297369,90012,9778,23424,9208,6155,130898,6020,...,7024,14773,36863,18192,5875,19282,3279,16985,11164,19415


In [12]:
for i in range(24):
    df.loc[576+i] = mean_df.loc[i]

In [13]:
df[df["날짜"]=='2020-01-25']

Unnamed: 0,날짜,시간,10,100,101,120,121,140,150,160,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
576,2020-01-25,0,80173,14637,1191,3398,1061,940,29371,986,...,788,2785,7360,5138,1285,3703,487,2171,1538,2276
577,2020-01-25,1,56958,9971,832,2504,693,696,20529,675,...,506,1902,4772,3432,1048,2752,344,1497,1077,1644
578,2020-01-25,2,45276,8235,720,2136,496,581,15929,515,...,375,1490,3715,2521,879,2331,284,1235,902,1384
579,2020-01-25,3,44243,9517,754,2271,479,590,15785,523,...,364,1545,3823,2271,871,2392,279,1407,912,1631
580,2020-01-25,4,60451,15585,1103,3506,702,778,23621,811,...,547,2368,6315,3254,1102,3204,433,2276,1450,2682
581,2020-01-25,5,113564,28735,2178,5852,1787,1353,50237,1635,...,1300,5398,16769,8857,1622,5214,806,4621,3200,5715
582,2020-01-25,6,198701,52821,4494,9972,4753,2598,81141,3147,...,3642,11239,30466,15553,2458,8905,1696,11140,6775,11762
583,2020-01-25,7,266732,80249,7204,15757,8476,4533,103681,4907,...,7535,15056,38800,19477,3433,15621,3510,19508,10979,18907
584,2020-01-25,8,286127,84341,8747,20428,9633,5673,115630,5710,...,7345,14841,37783,19289,4463,18831,4287,19047,11475,19187
585,2020-01-25,9,297369,90012,9778,23424,9208,6155,130898,6020,...,7024,14773,36863,18192,5875,19282,3279,16985,11164,19415


In [13]:
## 3월 30일 EDA

In [14]:
df[df["날짜"]=='2020-03-30']

Unnamed: 0,날짜,시간,10,100,101,120,121,140,150,160,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
2125,2020-03-30,0,38243,7403,585,1695,426,351,13476,430,...,330,1257,2574,2011,521,1662,139,950,761,1130
2126,2020-03-30,1,20378,3927,313,1060,315,223,7095,287,...,149,746,1220,1040,348,980,83,602,358,685


In [19]:
df = df.drop(index=2125)

In [21]:
df = df.drop(index=2126)

In [23]:
df

Unnamed: 0,날짜,시간,10,100,101,120,121,140,150,160,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
0,2020-01-01,0,83247,19128,2611,5161,1588,892,32263,1636,...,1311,3482,11299,7072,1176,3810,748,3920,2133,3799
1,2020-01-01,1,89309,19027,3337,5502,1650,1043,35609,1644,...,1162,3849,13180,8771,1283,3763,782,3483,2057,4010
2,2020-01-01,2,66611,14710,2970,4631,1044,921,26821,1104,...,768,2299,7986,5426,1536,3229,491,2634,1526,3388
3,2020-01-01,3,53290,13753,2270,4242,1021,790,21322,909,...,632,1716,5703,3156,1104,2882,431,2488,1268,3686
4,2020-01-01,4,52095,17615,2406,3689,1840,922,22711,1354,...,875,2421,5816,2933,1206,2433,499,2952,1927,5608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3274,2020-05-17,19,311727,101285,10085,30637,10060,8749,148935,6801,...,6726,15431,25597,14292,9300,22238,3786,16936,10729,20194
3275,2020-05-17,20,305354,91426,8607,26021,8095,7198,136503,6147,...,5501,15378,24661,14747,8239,20604,3203,15018,9767,17962
3276,2020-05-17,21,306008,75113,6325,19933,5711,4494,129412,5134,...,4216,12558,22781,14081,6392,17937,2447,12403,7825,14031
3277,2020-05-17,22,237447,49498,4209,12145,3891,2718,96698,3526,...,2578,8870,16640,11066,4427,11955,1495,7507,5387,8889


In [32]:
df.to_csv('eda_train.csv', index=False)

In [40]:
pd.read_csv(os.path.join(DATASET_PATH, 'train.csv'))

Unnamed: 0,날짜,시간,10,100,101,120,121,140,150,160,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
0,20200101,0,83247,19128,2611,5161,1588,892,32263,1636,...,1311,3482,11299,7072,1176,3810,748,3920,2133,3799
1,20200101,1,89309,19027,3337,5502,1650,1043,35609,1644,...,1162,3849,13180,8771,1283,3763,782,3483,2057,4010
2,20200101,2,66611,14710,2970,4631,1044,921,26821,1104,...,768,2299,7986,5426,1536,3229,491,2634,1526,3388
3,20200101,3,53290,13753,2270,4242,1021,790,21322,909,...,632,1716,5703,3156,1104,2882,431,2488,1268,3686
4,20200101,4,52095,17615,2406,3689,1840,922,22711,1354,...,875,2421,5816,2933,1206,2433,499,2952,1927,5608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3274,20200517,19,311727,101285,10085,30637,10060,8749,148935,6801,...,6726,15431,25597,14292,9300,22238,3786,16936,10729,20194
3275,20200517,20,305354,91426,8607,26021,8095,7198,136503,6147,...,5501,15378,24661,14747,8239,20604,3203,15018,9767,17962
3276,20200517,21,306008,75113,6325,19933,5711,4494,129412,5134,...,4216,12558,22781,14081,6392,17937,2447,12403,7825,14031
3277,20200517,22,237447,49498,4209,12145,3891,2718,96698,3526,...,2578,8870,16640,11066,4427,11955,1495,7507,5387,8889


In [14]:
pd.read_csv(os.path.join(DATASET_PATH, 'validate.csv'))

Unnamed: 0,날짜,시간,10,100,101,120,121,140,150,160,...,1020,1040,1100,1200,1510,2510,3000,4510,5510,6000
0,20200511,0,77968,14429,1233,4021,981,881,28672,1064,...,637,2604,5239,4168,1155,3596,337,2262,1608,2337
1,20200511,1,48679,9136,823,2618,654,572,17722,672,...,353,1870,3359,2558,1002,2157,257,1425,1018,1810
2,20200511,2,33773,8199,578,2188,392,502,14464,579,...,345,1499,2646,2022,876,1959,232,1155,927,1530
3,20200511,3,41511,9986,726,2817,555,646,17793,650,...,390,1730,3398,1967,912,2462,281,1477,959,1882
4,20200511,4,78680,19509,1463,4720,825,1088,35125,997,...,679,2958,7369,4120,1569,4568,577,3155,1871,3656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,20200524,19,314226,98345,10625,28618,8316,6684,141675,6619,...,8254,16118,23304,14082,8447,21694,2180,15746,10903,21014
332,20200524,20,300001,87871,8226,22706,6981,5743,142933,6295,...,5225,15297,21919,14526,7332,19732,1990,14096,10028,17787
333,20200524,21,304150,71126,6002,18317,4939,3779,133110,4781,...,4072,12685,21135,14403,5443,16967,1359,11670,7963,14041
334,20200524,22,236751,44947,3575,11455,3135,2536,98582,3267,...,2489,8093,14427,10914,3861,11397,859,7270,5194,8230


In [36]:
df1 = pd.read_csv(os.path.join(DATASET_PATH, 'eda_train.csv'))

In [37]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3279 entries, 0 to 3278
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   날짜      3279 non-null   int64
 1   시간      3279 non-null   int64
 2   10      3279 non-null   int64
 3   100     3279 non-null   int64
 4   101     3279 non-null   int64
 5   120     3279 non-null   int64
 6   121     3279 non-null   int64
 7   140     3279 non-null   int64
 8   150     3279 non-null   int64
 9   160     3279 non-null   int64
 10  200     3279 non-null   int64
 11  201     3279 non-null   int64
 12  251     3279 non-null   int64
 13  270     3279 non-null   int64
 14  300     3279 non-null   int64
 15  301     3279 non-null   int64
 16  351     3279 non-null   int64
 17  352     3279 non-null   int64
 18  370     3279 non-null   int64
 19  400     3279 non-null   int64
 20  450     3279 non-null   int64
 21  500     3279 non-null   int64
 22  550     3279 non-null   int64
 23  600     3279 

## Dataloader
* 한 칼럼에 대한 7일(168행) 데이터를 input_data, 뒤따르는 7일 데이터를 output_data로 반환합니다.
* 도로별 차이를 두지 않고 모든 도로를 동일한 타입의 데이터로 취급합니다.
* 모든 csv 파일의 마지막 168행은 예측해야하는 값이므로 input으로 들어가지 않습니다.

In [41]:
class CustomDataset(data.Dataset):      # torch.utils.data.Dataset 클래스의 상속 클래스 CustomDataset class 생성. 상속 클래스 생성시 __init__, __getitem__, __len__함수는 기본적으로 정의해줘야 함.
    
    def __init__(self, root, seq_len, batch_size=64, phase='eda_train'):      # 데이터 로드 단계에 사용될 여러 변수들을 'self.변수명'의 형태로 지정해두는 함수
        
        self.root = root      # CustomDataset 객체 생성 시 데이터 경로 앞부분(공통 부분)을 root로 입력받아 저장
        self.phase = phase      # CustomDataset 객체 생성 시 데이터 경로 뒷부분(train/validate/test)을 phase로 입력받아 저장
        self.label_path = os.path.join(self.root, self.phase + '.csv')      # 데이터 전체 경로 생성
        df = pd.read_csv(self.label_path)      # 생성한 데이터 전체 경로로부터 데이터 로드
        
        self.seq_len = seq_len * 24      # 일 단위 기간을 입력 받은 후 시간 단위 기간으로 변환하여 저장
        self.batch_size = batch_size
        self.labels = {}
        
        timestamps = [(i, j) for (i, j) in zip(list(df['날짜']), list(df['시간']))]      # 날짜와 시간 정보가 튜플로 들어 있는 리스트 생성
        categories = df.columns.values.tolist()[2:]      # 도로명 column list 생성

        input_data = []
        output_data = []

        for t in range(len(timestamps)):
            temp_input_data = []
            temp_output_data = []
            for col in categories:
                road = df[col].tolist()
                inp = [float(i) for i in road[t:t+self.seq_len]]      # input 데이터 시계열 구간 설정
                outp = [float(j) for j in road[t+self.seq_len:t+2*self.seq_len]]      # output 데이터 시계열 구간 설정
                temp_input_data.append(inp) 
                temp_output_data.append(outp)
            input_data.append(temp_input_data)
            output_data.append(temp_output_data)
            
# input_data : [[첫번째 input 기간 동안의 첫번째 도로의 통행량 list, ..., 첫번째 input 기간 동안의 35번째 도로의 통행량 list], ...,
#               [마지막 input 기간 동안의 첫번째 도로의 통행량 list, ..., 마지막 input 기간 동안의 35번째 도로의 통행량 list]]
# output_data : [[첫번째 output 기간 동안의 첫번째 도로의 통행량 list, ..., 첫번째 output 기간 동안의 35번째 도로의 통행량 list], ...,
#                [마지막 output 기간 동안의 첫번째 도로의 통행량 list, ..., 마지막 output 기간 동안의 35번째 도로의 통행량 list]]
        
        self.labels['timestamp'] = timestamps
        self.labels['category'] = categories
        self.labels['input'] = input_data
        self.labels['output'] = output_data

    def __getitem__(self, index):      # index를 가지고 데이터를 하나씩 불러올 수 있게 하는 함수

#         데이터 내 index가 부여되는 형태

#                 | road_1    road_2    ...  road_35
#                -------------------------------------
#         time_1  | index_0   index_1   ...  index_34
#         time_2  | index_35  index_36  ...  index_69

        row = index // 35      # index를 35(도로수)로 나눈 몫  ex) 71//35 -> 2
        col = index % 35      # index를 35(도로수)로 나눈 나머지  ex) 71%35 -> 1

        timestamp = self.labels['timestamp'][row]      # (날짜, 시간) 튜플이 들어있는 list에서 row번째 시점에 해당하는 튜플
        category = self.labels['category'][col]      # 도로명 column list에서 col번째 도로에 해당하는 element
        
        input_data = torch.tensor(self.labels['input'][row][col])      # input_data list에서, row번째 시점의 col번째 도로 교통량 정보

        if self.phase != 'test':
            output_data = torch.tensor(self.labels['output'][row][col])
        else:
            output_data = []

        return timestamp, category, (input_data, output_data)

    def __len__(self):      # getitem 함수를 통해 데이터를 불러오려면,전체 index 길이를 알아야 한다.
        return (len(self.labels['timestamp']) - (self.seq_len * 2) + 1) * 35      # 특정 시점이 아닌 특정 기간을 하나의 data 단위로 설정하면, 전체 샘플 수는 감소함을 반영 


def data_loader(root, phase='eda_train', batch_size=64, seq_len=7, drop_last=False):
    if phase == 'eda_train':
        shuffle = True
    else:
        shuffle = False

    dataset = CustomDataset(root, seq_len, batch_size, phase)
    dataloader = data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    return dataloader

## Model

In [42]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class GRUNet(nn.Module):
    def __init__(self,
                 input_size=168,      # input 길이는 168시간(7일 X 24시간)
                 hidden_size=1024,
                 output_size=168,      # output 길이는 168시간(7일 X 24시간)
                 batch_size=64,
                 num_layers=3,
                 dropout=0,
                 batch_first=False):      # batch_first(default=False) : 배치 차원을 첫번째 차원으로 하여 데이터를 불러올 것인지 여부

        super(GRUNet, self).__init__()

        self.hidden_size = hidden_size
        
        ##### Layer 1
        self.gru1 = nn.GRU(input_size,
                             hidden_size,
                             dropout=0.2,
                             num_layers=num_layers)

        ##### Layer 2
        self.gru2 = nn.GRU(hidden_size, 
                             hidden_size,
                             dropout=0.2,
                             num_layers=num_layers)

        ##### Finalize
        self.linear = nn.Linear(hidden_size, 
                                output_size)
        
        self.activation = nn.LeakyReLU(0.2)

        
    def forward(self, x, h_in):

        h_in = nn.Parameter(h_in.type(dtype), requires_grad=True)      # gradient descent로 업데이트 되는(requires_grad=True), h_in 이라는 이름의 파라미터 생성 
              
        # Layer 1
        gru_out, (h_1) = self.gru1(x, (h_in))
        gru_out = self.activation(gru_out)

        # Layer2
        gru_out, (h_2) = self.gru2(gru_out, (h_1))
        gru_out = self.activation(gru_out)

        # Final
        predictions = self.linear(gru_out)
        
        return predictions, (h_2)

In [43]:
# 결과 파일과 모델 가중치 파일 저장을 위해 log 디렉토리 생성. 중요한 파일이 덮어씌워지지 않도록 주의
os.makedirs('log', exist_ok=True)


def save_model(model_name, model, optimizer):
    state = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(state, os.path.join('log', model_name + '.pth'))
    print('model saved\n')
    return os.path.join('log', model_name + '.pth')


def load_model(model_name, model, optimizer=None):
    state = torch.load(os.path.join(model_name))
    model.load_state_dict(state['model'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optimizer'])
    print('model loaded')

## Hyperparameters

In [44]:
dtype = torch.float
model_name = 'sequential'

batch_size = 64
num_epochs = 5
val_epoch = 1
base_lr = 0.01
seq_len = 7

input_size = seq_len * 24
output_size = input_size
hidden_size = 1024
num_layers = 6

## Training Setting

In [45]:
# model
model = GRUNet(input_size=input_size,
                hidden_size=hidden_size,
                output_size=output_size,
                batch_size=batch_size,
                num_layers=num_layers)
model = model.to(device)

# loss function
criterion = nn.MSELoss()

# optimizer
optimizer = Adam(model.parameters(), lr=base_lr)      # optimizer로는 Adam이 가장 무난합니다. Adam을 쓰면 learning_rate를 따로 지정해주지 않아도 알아서 조정됩니다.

In [46]:
print(model)

GRUNet(
  (gru1): GRU(168, 1024, num_layers=6, dropout=0.2)
  (gru2): GRU(1024, 1024, num_layers=6, dropout=0.2)
  (linear): Linear(in_features=1024, out_features=168, bias=True)
  (activation): LeakyReLU(negative_slope=0.2)
)


In [47]:
# get data loader
train_dataloader = data_loader(root=DATASET_PATH,
                               phase='eda_train',
                               batch_size=batch_size,
                               seq_len=seq_len,
                               drop_last=True)

validate_dataloader = data_loader(root=DATASET_PATH,
                                  phase='validate',
                                  batch_size=1,
                                  seq_len=seq_len,
                                  drop_last=True)

## Train

In [50]:
train_batch_loss = 0.0
train_epoch_loss = 0.0

valid_epoch_loss = 0.0
valid_min_epoch_loss = np.inf

for epoch in range(num_epochs):

    model.train()      # 모델을 train mode로 전환. train mode일 때만 적용되어야 하는 drop out 등이 적용될 수 있게 하기 위함 

    for iter_, sample in enumerate(train_dataloader):      # enumerate 함수를 통해 train_dataloader에서 'batch의 index'와 'batch'를 순서대로 호출

        (h_in) = (torch.zeros(num_layers, batch_size, hidden_size, requires_grad=True).to(device))

        _, _, (input_data, output_data) = sample      # train_dataloader에서 불러온 sample은 [[날짜, 시간], [도로], [[input_data],[output_data]]]로 구성됨. 학습에는 [[input_data], [output_data]]만 사용
        
        input_data = input_data.unsqueeze(0).to(device)
        output_data = output_data.unsqueeze(0).to(device)

        pred, (h_in) = model(input_data, h_in)
        
        loss = criterion(pred, output_data)

        model.zero_grad()    # 파라미터 업데이트는 batch 단위로 이루어지고, 매 batch마다 gradient를 초기화해주어야 함 
        loss.backward()      # backpropagation
        optimizer.step()      # 파라미터 업데이트
        
        train_batch_loss += loss.item()
        train_epoch_loss += loss.item()

        if iter_ % 400 == 399:      # 400개의 batch마다 training Loss 출력
            print('Train Epoch: {:2} | Batch: {:4} | Loss: {:1.2f}'.format(epoch, iter_+1, train_batch_loss/400))
            train_batch_loss = 0
            
    train_epoch_loss = 0.0

    
    model.eval()      # 모델을 eval mode로 전환. eval mode에서 적용되면 안되는 drop out 등이 적용되지 않게 하기 위함

    with torch.no_grad():      # validation / test set에 대해서는 weight 및 bias의 update, 즉, gradient descent가 일어나지 않도록 no_grad()를 선언
        (h_in) = (torch.zeros(num_layers, 1, hidden_size, requires_grad=False).to(device))

        for iter_, sample in enumerate(validate_dataloader):      # enumerate 함수를 통해 validate_dataloader에서 'batch의 index'와 'batch'를 순서대로 호출

            _, _, (input_data, output_data) = sample      # validate_dataloader에서 불러온 sample은 [[날짜, 시간], [도로], [[input_data],[output_data]]]로 구성됨. validation에는 [[input_data], [output_data]]만 사용

            input_data = input_data.unsqueeze(0).to(device)
            output_data = output_data.unsqueeze(0).to(device)

            pred, (h_in) = model(input_data, h_in)
            loss = criterion(pred, output_data)
            valid_epoch_loss += loss.item()

        print('\nValid Epoch: {:2} | Loss: {:1.2f}'.format(epoch, valid_epoch_loss/len(validate_dataloader)))

        if valid_epoch_loss < valid_min_epoch_loss:
            save_model('best', model, optimizer)
            valid_min_epoch_loss = valid_epoch_loss

        valid_epoch_loss = 0.0

Train Epoch:  0 | Batch:  400 | Loss: 3470802932.64
Train Epoch:  0 | Batch:  800 | Loss: 3493427348.16
Train Epoch:  0 | Batch: 1200 | Loss: 3345491140.56
Train Epoch:  0 | Batch: 1600 | Loss: 3159013200.24

Valid Epoch:  0 | Loss: 3440334601.77
model saved

Train Epoch:  1 | Batch:  400 | Loss: 3301797406.24
Train Epoch:  1 | Batch:  800 | Loss: 3258711148.96
Train Epoch:  1 | Batch: 1200 | Loss: 3006377111.52
Train Epoch:  1 | Batch: 1600 | Loss: 2945751039.28

Valid Epoch:  1 | Loss: 3212638869.37
model saved

Train Epoch:  2 | Batch:  400 | Loss: 2991071858.56
Train Epoch:  2 | Batch:  800 | Loss: 2876412939.04
Train Epoch:  2 | Batch: 1200 | Loss: 3057506843.36
Train Epoch:  2 | Batch: 1600 | Loss: 2938861360.08

Valid Epoch:  2 | Loss: 3057958689.14
model saved

Train Epoch:  3 | Batch:  400 | Loss: 2903686828.48
Train Epoch:  3 | Batch:  800 | Loss: 2829990079.04
Train Epoch:  3 | Batch: 1200 | Loss: 2835070550.72
Train Epoch:  3 | Batch: 1600 | Loss: 2854894001.12

Valid Epoch

## Inference

In [51]:
dtype = torch.float
seq_len = 7

input_size = seq_len * 24
hidden_size = 1024
output_size = input_size
batch_size = 1
num_layers = 6

In [52]:
test_dataloader = data_loader(root=DATASET_PATH,
                              phase='test',
                              batch_size=batch_size,
                              seq_len=seq_len,
                              drop_last=True)

In [53]:
model = GRUNet(input_size=input_size,
                hidden_size=hidden_size,
                output_size=output_size,
                batch_size=batch_size,
                num_layers=num_layers)

# model
model_name = 'log/best.pth'

load_model(model_name, model)
model = model.to(device)

model loaded


In [54]:
submission_file_path = os.path.join(DATASET_PATH, 'sample_submission.csv')
submission_table = pd.read_csv(submission_file_path)

In [57]:
(h_in) = (torch.zeros(num_layers, 1, hidden_size, requires_grad=False).to(device))

for iter_, sample in enumerate(test_dataloader):

    timestamp, category, (input_data, output_data) = sample
    input_data = input_data.unsqueeze(0).to(device)

    pred, (h_in) = model(input_data, h_in)

    for i, (t, h) in enumerate(zip(timestamp[0], timestamp[1])):
        for cat, row in zip(category, pred[0]):
            cat = f'{cat}'
            submission_table[cat] = row.tolist()

In [58]:
submission_table.to_csv('gru prediction.csv', index=False)