# Environment

In [1]:
import random
import os
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import torch

from utils.tools import dotdict
from exp.exp_informer import Exp_Informer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def seed_everything(seed: int = 302):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), then configures cuDNN for reproducible runs.

    Args:
        seed: Seed value applied to every generator. Defaults to 302.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Exported for child processes; it does not change hash randomization of
    # the interpreter that is already running.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (silently ignored when
    # CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # The cuDNN autotuner may select different kernels from run to run, so it
    # must be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False  # type: ignore

In [3]:
# Seed all RNGs with the function's default seed before training or prediction runs.
seed_everything()

# Prediction with Informer

In [4]:
# Train one Informer model per station listed in loc_match.pkl, then forecast
# the station's test file window by window, writing each forecast back into
# the test CSV and collecting the PM2.5 forecasts in `submit`.
#
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'loc_match.pkl' when it comes from a trusted source.
with open('loc_match.pkl', 'rb') as f:
    loc_match = pickle.load(f)

# One array of PM2.5 forecasts is appended per prediction window.
submit = list()

for pm_loc in loc_match:
    df_len = pd.read_csv('data/{}test.csv'.format(pm_loc))
    DAYS = len(df_len)  # row count of the station's test file (divided by 24 below)
    
    args = dotdict()

    args.model = 'informer'

    args.data = 'custom'
    args.root_path = './data/'
    args.data_path = '{}train.csv'.format(pm_loc)
    args.features = 'M'
    args.target = 'PM2.5'
    args.freq = 'h'
    # NOTE(review): this is an absolute path at the filesystem root; confirm
    # that './checkpoints/' was not intended.
    args.checkpoints = '/checkpoints/'

    args.seq_len = 24*2 # 2 days of input
    args.label_len = 24*2 
    args.pred_len = 24*3 # 3 days of forecast

    args.enc_in = 6 # number of features
    args.dec_in = 6
    args.c_out = 6
    args.d_model = 512
    args.n_heads = 8
    args.e_layers = 2
    args.d_layers = 1
    args.d_ff = 2048
    args.factor = 5
    args.padding = 0
    args.distil = True
    args.dropout = 0.05
    args.attn = 'prob'
    args.embed = 'timeF'
    args.activation = 'gelu'
    args.output_attention = True
    args.do_predict = True
    args.mix = True
    args.cols = ['기온', '풍향', '풍속', '강수량', '습도', 'PM2.5']
    args.num_workers = 0
    args.itr = 2
    args.train_epochs = 6
    args.batch_size = 32
    args.patience = 3
    args.learning_rate = 0.0001
    args.des = 'test'
    args.loss = 'mae'
    args.lradj = 'type1'
    args.use_amp = False
    args.inverse = True

    args.gpu = 0
    args.use_gpu = True 
    args.use_multi_gpu = False

    Exp = Exp_Informer

    # The setting string does not depend on args.data_path, so it is built
    # once and reused for both training and every prediction window.
    setting = '{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_at{}_fc{}_eb{}_dt{}_mx{}_{}'.format(args.model, args.data, args.features, 
                    args.seq_len, args.label_len, args.pred_len,
                    args.d_model, args.n_heads, args.e_layers, args.d_layers, args.d_ff, args.attn, args.factor, 
                    args.embed, args.distil, args.mix, args.des)

    exp = Exp(args)

    print('>>>>>>> start training : {}>>>>>>>'.format(setting))
    exp.train(setting)

    # The training file is never modified below, so read it once per station.
    df_train = pd.read_csv('data/{}train.csv'.format(pm_loc))

    # n advances in steps of 5 (days); each pass overwrites rows
    # 24*(n-3):24*n of the test file with the 72-step forecast.
    # NOTE(review): the first pass reads '{pm_loc}input.csv', which this loop
    # only writes at the end of a pass — it must already exist on disk.
    n = 5
    while n <= DAYS/24-2:
        args.data_path = '{}input.csv'.format(pm_loc)

        # A fresh experiment is built so predict() uses the updated data_path.
        exp = Exp(args)

        print('>>>>>>> start predicting : {}>>>>>>>'.format(setting))
        exp.predict(setting, load=True)
        
        preds = np.load('results/{}/real_prediction.npy'.format(setting))
        # (pred_len, c_out) -> (c_out, pred_len): one row per feature, in the
        # order the original code assumed (same order as args.cols).
        preds = preds.reshape(args.pred_len, args.c_out).transpose()
        submit.append(preds[-1])  # last row is taken as the PM2.5 forecast
        
        df_test = pd.read_csv('data/{}test.csv'.format(pm_loc))
        
        # Positional assignment through .iloc avoids chained indexing, which
        # raises SettingWithCopyWarning and does nothing under copy-on-write.
        rows = slice(24*(n-3), 24*n)
        for col, values in zip(args.cols, preds):
            df_test.iloc[rows, df_test.columns.get_loc(col)] = values
        # index=False: otherwise every read/write round trip of this file
        # adds another unnamed index column.
        df_test.to_csv('data/{}test.csv'.format(pm_loc), index=False)

        df_input = pd.concat([df_train, df_test[:24*n]], axis=0)
        df_input.to_csv('data/{}input.csv'.format(pm_loc), index=False)
        
        n += 5
        
    print('{}의 미세먼지 농도 예측 완료'.format(pm_loc))
    print('현재 submit list의 길이: {}'.format(len(submit)))
        

Use GPU: cuda:0
>>>>>>> start training : informer_custom_ftM_sl48_ll48_pl72_dm512_nh8_el2_dl1_df2048_atprob_fc5_ebtimeF_dtTrue_mxTrue_test>>>>>>>
train 24425
val 3437
test 6941
	iters: 100, epoch: 1 | loss: 0.0901906
	speed: 0.1382s/iter; left time: 618.9997s
	iters: 200, epoch: 1 | loss: 0.0798544
	speed: 0.0878s/iter; left time: 384.3679s
	iters: 300, epoch: 1 | loss: 0.0779420
	speed: 0.0874s/iter; left time: 373.9260s
	iters: 400, epoch: 1 | loss: 0.0792247
	speed: 0.0874s/iter; left time: 365.3958s
	iters: 500, epoch: 1 | loss: 0.0717297
	speed: 0.0878s/iter; left time: 357.9946s
	iters: 600, epoch: 1 | loss: 0.0774816
	speed: 0.0878s/iter; left time: 349.2181s
	iters: 700, epoch: 1 | loss: 0.0681644
	speed: 0.0878s/iter; left time: 340.5576s
Epoch: 1 cost time: 72.01441764831543
Epoch: 1, Steps: 763 | Train Loss: 0.0810388 Vali Loss: 0.0895959 Test Loss: 0.0735548
Validation loss decreased (inf --> 0.089596).  Saving model ...
Updating learning rate to 0.0001
	iters: 100, epoch: 

In [6]:
# Collapse the list of per-window forecasts into a single 1-D vector.
submit = np.array(submit).reshape(-1)
submit.shape

(78336,)

In [7]:
# Load the sample submission and set its PM2.5 column to the flattened forecasts.
submit_file = pd.read_csv('../answer_sample.csv').assign(**{'PM2.5': submit})
print(submit_file)
# NOTE(review): written with pandas' default index=True, so the output file
# carries an extra unnamed index column — confirm the expected submission format.
submit_file.to_csv('answer_informer2.csv')

       연도           일시  측정소     PM2.5
0       4  01-03 00:00   공주  0.059269
1       4  01-03 01:00   공주  0.058087
2       4  01-03 02:00   공주  0.052829
3       4  01-03 03:00   공주  0.056648
4       4  01-03 04:00   공주  0.056360
...    ..          ...  ...       ...
78331   4  11-16 19:00  홍성읍  0.023447
78332   4  11-16 20:00  홍성읍  0.023654
78333   4  11-16 21:00  홍성읍  0.023486
78334   4  11-16 22:00  홍성읍  0.023020
78335   4  11-16 23:00  홍성읍  0.022022

[78336 rows x 4 columns]


# Public Score

- 1st: 11.54386454 ← 결측치를 ffill로만 처리하여, 처음부터 NA였던 결측치는 채워지지 않음
- 2nd: 13.17899375 ← 강수량의 결측치는 0으로, 나머지 feature의 결측치는 평균값으로 채운 뒤 진행