In [21]:
import numpy as np
import pandas as pd
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import torch.nn as nn

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Number of rows in the Gongju test CSV. Despite the name, this is a row count;
# later code divides it by 24, which suggests hourly rows (TODO confirm).
PRED_DAYS = pd.read_csv('data/공주test.csv').shape[0]

cuda:0


# Regression with LSTM

In [38]:
class LSTM(nn.Module):
    """LSTM regressor: an ``nn.LSTM`` stack followed by dropout and a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per time step.
        device: Device on which ``reset_hidden_state`` allocates its state.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, device):
        super().__init__()
        self.device = device
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)

        self.fc = nn.Sequential(nn.Dropout(0.3),
                                nn.Linear(hidden_size, output_size))

    def reset_hidden_state(self):
        """Store a fresh zero (h, c) pair on ``self.hidden``.

        The state is allocated on ``self.device`` (the device passed to the
        constructor) rather than on a notebook-global variable. ``forward``
        builds its own initial state and does not read ``self.hidden``.
        """
        state_shape = (self.num_layers, self.hidden_size)
        self.hidden = (torch.zeros(state_shape, device=self.device),
                       torch.zeros(state_shape, device=self.device))

    def forward(self, x):
        """Run the LSTM from a zero initial state and apply the linear head.

        Args:
            x: Either a 2-D tensor ``(seq_len, input_size)``, which ``nn.LSTM``
                treats as a single unbatched sequence, or a 3-D tensor
                ``(batch, seq_len, input_size)`` (``batch_first=True``).

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``output_size``.
        """
        # nn.LSTM expects (num_layers, hidden) initial states for unbatched
        # input and (num_layers, batch, hidden) for batched input.
        if x.dim() == 3:
            state_shape = (self.num_layers, x.size(0), self.hidden_size)
        else:
            state_shape = (self.num_layers, self.hidden_size)

        # Allocate the state next to the input so no global `device` is needed.
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        output, _ = self.lstm(x, (h0, c0))
        return self.fc(output)

In [39]:
class Optimization:
    """Training, validation and prediction helper around a model.

    Args:
        flag: Identifier used in the checkpoint file name
            (``weights/<flag>_bestmodel.pth``).
        model: The ``nn.Module`` to train.
        loss_fn: Callable taking ``(prediction, target)`` and returning a loss.
        optimizer: Optimizer already bound to ``model``'s parameters.
        learning_rate: Stored on ``self.lr``; the optimizer's own learning
            rate is what actually drives the updates.
    """

    def __init__(self, flag, model, loss_fn, optimizer, learning_rate):
        self.flag = flag
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.lr = learning_rate
        self.train_losses = []
        self.val_losses = []
        # `verbose` is deprecated on LR schedulers, so learning-rate changes
        # are reported explicitly inside `train` instead.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='min', factor=0.1, patience=10,
            threshold_mode='abs', min_lr=1e-8)

    def train_step(self, x, y):
        """Run one optimisation step on a batch and return the loss as a float."""
        self.model.train()

        yhat = self.model(x)

        # PyTorch losses follow the (input, target) convention.
        loss = self.loss_fn(yhat, y)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def train(self, train_loader, vali_loader, epochs=50):
        """Train for ``epochs`` epochs, checkpointing the best validation loss.

        Per-epoch mean losses are appended to ``self.train_losses`` and
        ``self.val_losses``. Whenever the validation loss improves, the
        model's ``state_dict`` is saved to ``weights/<flag>_bestmodel.pth``.
        """
        import os  # local import: only needed to create the checkpoint folder

        checkpoint_path = 'weights/{}_bestmodel.pth'.format(self.flag)
        # Make sure the target folder exists so torch.save cannot fail on it.
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

        # Models without a `reset_hidden_state` method are supported too.
        reset_hidden = getattr(self.model, 'reset_hidden_state', None)

        min_loss = float('inf')
        for epoch in range(1, epochs + 1):
            batch_losses = []
            for train_x, train_y in train_loader:
                if reset_hidden is not None:
                    reset_hidden()
                batch_losses.append(self.train_step(train_x, train_y))
            training_loss = np.mean(batch_losses)
            self.train_losses.append(training_loss)

            self.model.eval()
            with torch.no_grad():
                batch_val_losses = [
                    self.loss_fn(self.model(vali_x), vali_y).item()
                    for vali_x, vali_y in vali_loader
                ]
            validation_loss = np.mean(batch_val_losses)
            self.val_losses.append(validation_loss)

            print('EPOCH [{}/{}]: Training Loss: {}, Validation Loss: {}'.format(epoch, epochs, training_loss, validation_loss))

            # Report scheduler-driven learning-rate changes explicitly.
            lr_before = [group['lr'] for group in self.optimizer.param_groups]
            self.scheduler.step(validation_loss)
            lr_after = [group['lr'] for group in self.optimizer.param_groups]
            if lr_after != lr_before:
                print('Learning rate changed: {} -> {}'.format(lr_before, lr_after))

            if min_loss > validation_loss:
                min_loss = validation_loss
                torch.save(self.model.state_dict(), checkpoint_path)
                print('MODEL SAVED')

    def predict(self, test_loader):
        """Return the model's predictions for every batch as one flat array.

        Batches are concatenated along the first axis, so a smaller final
        batch is handled correctly. An empty loader yields an empty array.
        """
        self.model.eval()
        preds = []
        with torch.no_grad():
            for test_x in test_loader:
                preds.append(self.model(test_x).detach().cpu().numpy())

        if not preds:
            return np.array([])

        return np.concatenate(preds, axis=0).flatten()

In [48]:
# Hyperparameters. They are defined before the loop so that `batch_size`
# exists when the DataLoaders are built (Restart & Run All safe).
hidden_size = 128
num_layers = 1
output_size = 1
batch_size = 32
learning_rate = 1e-05

submit = list()

for pm_loc in loc_match:
    # Load the training data for this location.
    df = pd.read_csv('data/{}train.csv'.format(pm_loc))
    x = df[cols].to_numpy(dtype=np.float32)
    y = df['PM2.5'].to_numpy(dtype=np.float32).reshape(-1, 1)
    LEN = x.shape[0]
    train_split = int(LEN*0.75)  # first 75% of rows train, the rest validate

    train_x = torch.tensor(x[:train_split], dtype=torch.float32, device=device)
    train_y = torch.tensor(y[:train_split], dtype=torch.float32, device=device)
    vali_x = torch.tensor(x[train_split:], dtype=torch.float32, device=device)
    vali_y = torch.tensor(y[train_split:], dtype=torch.float32, device=device)
    train = TensorDataset(train_x, train_y)
    vali = TensorDataset(vali_x, vali_y)
    # NOTE(review): each batch is a 2-D (rows, features) tensor, which nn.LSTM
    # treats as one unbatched sequence; with shuffle=True the rows of that
    # "sequence" are in random order -- confirm this is intended.
    train_loader = DataLoader(train, batch_size, shuffle=True)
    vali_loader = DataLoader(vali, batch_size, shuffle=False)

    # Derive the feature count from the data instead of hard-coding it.
    input_size = x.shape[1]
    loss_fn = nn.MSELoss()

    lstm = LSTM(input_size, hidden_size, num_layers, output_size, device).to(device)
    # NOTE(review): weight_decay=0.1 is a strong L2 penalty and may contribute
    # to near-constant predictions -- worth revisiting.
    optimizer = Adam(lstm.parameters(), lr=learning_rate, weight_decay=0.1)
    opt = Optimization('{}'.format(loc_map[pm_loc]), lstm, loss_fn, optimizer, learning_rate)

    print('>>>>>>>>>>> {} 미세먼지 농도 훈련 시작 >>>>>>>>>>>'.format(pm_loc))
    opt.train(train_loader, vali_loader, epochs=50)

    # Restore the best checkpoint. load_state_dict updates the module in place
    # and returns a key-report tuple, so its result must not replace `lstm`.
    PATH = 'weights/{}_bestmodel.pth'.format(loc_map[pm_loc])
    state_dict = torch.load(PATH, map_location=device)
    lstm.load_state_dict(state_dict)

    # Load the test features and predict them in a single batch.
    df = pd.read_csv('data/{}test.csv'.format(pm_loc))
    test_x = torch.tensor(df[cols].to_numpy(dtype=np.float32), dtype=torch.float32, device=device)
    pred_loader = DataLoader(test_x, test_x.shape[0], shuffle=False)

    preds = opt.predict(pred_loader)
    print('>>>>>>>>>>> {} 미세먼지 농도 예측 완료 >>>>>>>>>>>'.format(pm_loc))

    # Out of every block of 5*24 predictions keep the last 3*24 values
    # (indices 24*(n-3) .. 24*n for n = 5, 10, ...).
    n = 5
    while n <= PRED_DAYS/24-2:
        submit.append(preds[24*(n-3):24*n])
        n += 5

>>>>>>>>>>> 공주 미세먼지 농도 훈련 시작 >>>>>>>>>>>
EPOCH [1/50]: Training Loss: 0.024690177821307252, Validation Loss: 0.014413039624031607
MODEL SAVED
EPOCH [2/50]: Training Loss: 0.012395076920763293, Validation Loss: 0.009205375323423127
MODEL SAVED
EPOCH [3/50]: Training Loss: 0.009147752267297204, Validation Loss: 0.007600890557160797
MODEL SAVED
EPOCH [4/50]: Training Loss: 0.008003412788380345, Validation Loss: 0.00704442511119771
MODEL SAVED
EPOCH [5/50]: Training Loss: 0.007604070890688059, Validation Loss: 0.006805752833660847
MODEL SAVED
EPOCH [6/50]: Training Loss: 0.007376417514687934, Validation Loss: 0.0067178904866828605
MODEL SAVED
EPOCH [7/50]: Training Loss: 0.00729375638984316, Validation Loss: 0.006658175431773361
MODEL SAVED
EPOCH [8/50]: Training Loss: 0.007245564657102572, Validation Loss: 0.0066617420358527244
EPOCH [9/50]: Training Loss: 0.007246491122099179, Validation Loss: 0.006671365076972573
EPOCH [10/50]: Training Loss: 0.007239827181217136, Validation Loss: 0.006

# submit

In [None]:
import matplotlib.pyplot as plt

# Plot at most the first 17 prediction chunks. Slicing avoids an IndexError
# when fewer than 17 chunks were collected.
for i, chunk in enumerate(submit[:17]):
    fig, ax = plt.subplots(figsize=(30, 6))
    ax.plot(chunk)
    ax.set(title='Prediction chunk {}'.format(i),
           xlabel='Step within chunk',
           ylabel='Predicted PM2.5')

In [50]:
# Collapse the collected prediction chunks into one flat 1-D array.
submit = np.array(submit).reshape(-1)

In [51]:
# Load the submission template and fill its PM2.5 column with the flattened
# predictions, then display the resulting frame.
answer_ensemble = pd.read_csv('../answer_sample.csv').assign(**{'PM2.5': submit})
answer_ensemble

Unnamed: 0,연도,일시,측정소,PM2.5
0,4,01-03 00:00,공주,0.085710
1,4,01-03 01:00,공주,0.085710
2,4,01-03 02:00,공주,0.085710
3,4,01-03 03:00,공주,0.085710
4,4,01-03 04:00,공주,0.085710
...,...,...,...,...
78331,4,11-16 19:00,홍성읍,0.092140
78332,4,11-16 20:00,홍성읍,0.092141
78333,4,11-16 21:00,홍성읍,0.092141
78334,4,11-16 22:00,홍성읍,0.092142


In [52]:
# Write the filled-in submission to disk. pandas also writes the row index by
# default (index=True).
answer_ensemble.to_csv(path_or_buf='answer_NLinear_LSTM3.csv')

# Conclusion

- Public Score: 
    - 1st: 10.71912645 <- hidden_size = 32
    - 2nd: 10.81779746 <- 평균과 강수량의 경우 0으로 결측치 처리, hidden_size = 16
    - 3rd: 10.65019972 <- hidden_size=128, score는 가장 좋게 나왔으나, 예측한 미세먼지 농도는 심정지 그래프처럼 거의 일직선(상수에 가까운 값)으로 나옴

- LSTM은 역시나 hyperparameter tuning이 중요하다.
- MAE Score가 가장 좋았던 LSTM 모델의 경우 예측한 미세먼지 농도가 거의 동일한 값으로 통일
- 그럼에도 불구하고 점수가 가장 좋았던 이유는, 모델이 훈련 시 MSE를 가장 작게 만드는 (평균에 가까운) 값 하나로 과도하게 수렴했기 때문인 것 같다.
- 후일 LSTM으로 프로젝트를 진행하게 된다면, 충분한 시간을 두고 grid search를 통해 여러 hyperparameter 조합의 성능을 탐색한 뒤 선정하는 것이 중요한 것 같다.