# PyTorch와 RNN, LSTM

# RNN 모델 구성

# torch.nn.RNN
torch.nn.RNN(args, *kwargs)
- input_size: Input 크기(feature 수)
- hidden_size: hidden state 크기
  - DNN의 hidden layer의 사이즈라고 생각하면 된다. 즉, 노드 수
- num_layers: 순환 레이서 수(Default: 1)
  - 레이어 수, Multi-layer로 LSTM을 구성할 수 있지만 4개 초과부터는 Gradient Vanishing 이슈가 있다.
- nonlinearity: 비선형 활성화함수 설정, 'tanh' 또는 'relu'(Default: 'tanh')
- bias: bias 값 활성화 여부 설정(Default: True)
- batch_first: True일 시, Output 사이즈를 (batch, seq, feature)로 출력
  - Default: (seq, batch, feature)
- dropout: 드롭아웃 비율 설정(Default: 0)
- bidirectional: True일 시, Bidirectional RNN 적용(Default: False)

#### Inputs: input, h_0 (tuple 형태)
- input: 입력 텐서 - (sequence_length, batch_size, input_size) # batch_first가 True면 (batch_size, sequence_length, input_size)
  - seqeunce_length: 노드를 얼마만큼 순환할지에 대한 값
- h_0: hidden stats의 초기값 텐서 - (num_layers * bidirections, batch_size, hidden_size) 형태
  - bidirectional이 True면 2, False면 1

#### Outputs: output, h_n (tuple 형태)
- output: 마지막 레이어의 출력 텐서 - (sequence_length, batch_size, bidrections * hidden_size)
  - bidirectional이 True면 2, Fasle면 1
- h_n: 마지막 hidden state 텐서 - (num_layers * bidirections, batch_size, hidden_size)
  - bidirectional이 True면 2, Fasle면 1

# Input 사이즈 및 하이퍼 파라미터 설정
28 dimention(가로) x 28 sequence(세로)

# input, sequence, 하이퍼 파라미터 설정

In [1]:
sequence_length = 28
feature_size = 28
hidden_size = 128
num_layers = 4
dropout_p = 0.2
output_size = 10
minibatch_size = 128

# LSTM 모델 구성

# torch.nn.LSTM
torch.nn.LSTM(args, *kwargs)
- input_size: Input 크기(feature 수)
- hidden_size: hidden state 크기
  - DNN의 hidden layer의 사이즈라고 생각하면 된다. 즉, 노드 수
- num_layers: 순환 레이서 수(Default: 1)
  - 레이어 수, Multi-layer로 LSTM을 구성할 수 있지만 4개 초과부터는 Gradient Vanishing 이슈가 있다.
- bias: bias 값 활성화 여부 설정(Default: True)
- batch_first: True일 시, Output 사이즈를 (batch, seq, feature)로 출력
  - Default: (seq, batch, feature)
- dropout: 드롭아웃 비율 설정(Default: 0)
- bidirectional: True일 시, Bidirectional LSTM 적용(Default: False)

#### Inputs: input, (h_0, c_0)
- input: (sequence_length, batch_size, input_size) # batch_first가 True면 (batch_size, sequence_length, input_size)
  - seqeunce_length: 노드를 얼마만큼 순환할지에 대한 값
- h_0: (num_layers * bidirections, batch_size, hidden_size)
  - bidirectional이 True면 2, False면 1
- c_0: (num_layers * bidirections, batch_size, hidden_size)
  - bidirectional이 True면 2, False면 1

#### Outputs: output, (h_n, c_n)
- output: (sequence_length, batch_size, bidrections * hidden_size)
  - bidirectional이 True면 2, Fasle면 1
- h_n: (num_layers * bidirections, batch_size, hidden_size)
  - bidirectional이 True면 2, Fasle면 1
- c_n: (num_layers * bidirections, batch_size, hidden_size)
  - bidirectional이 True면 2, Fasle면 1

# RNN/LSTM 모델 구현

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import numpy as np
from copy import deepcopy

In [3]:
sequence_length = 28
feature_size = 28
hidden_size = 128
num_layers = 4
dropout_p = 0.2
output_size = 10
minibatch_size = 128

In [5]:
class Net(nn.Module):
    def __init__(self, feature_size, hidden_size, num_layers, dropout_p, output_size, model_type):
        super().__init__()
        if model_type == 'rnn':
            self.sequenceclassfier = nn.RNN(
                input_size = feature_size,
                hidden_size = hidden_size,
                num_layers = num_layers,
                batch_first = True,
                dropout = dropout_p,
                bidirectional = True
            )
        elif model_type == 'lstm':
            self.sequenceclassfier = nn.LSTM(
                input_size = feature_size,
                hidden_size = hidden_size,
                num_layers = num_layers,
                batch_first = True,
                dropout = dropout_p,
                bidirectional = True
            )

        self.layers = nn.Sequential(
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(hidden_size * 2),
            # rnn 및 lstm의 출력값은 (sequence_length, batch_size, bidrections * hidden_size)
            # bidirectional이 True이므로 hidden_size * 2
            nn.Linear(hidden_size * 2, output_size),
            nn.LogSoftmax(dim=-1)
        )

    def forward(self, x):
        # |x| = batch_first = True이므로 (batch_size, sequence_length, input_size)
        out, _ = self.sequenceclassfier(x)
        # output, h_n이므로, h_n은 사용안함
        # |out| = batch_first = True이로로 (batch_size, sequence_length, 2 * hidden_size)
        # bidirectional이 True면 bidirections는 2 * hidden_size
        out = out[:, -1]
        # out[:, -1]은 (batch_size, sequence_length, 2 * hidden_size)에서,
        # 전체 batch_size를 선택한다는 의미의 :,
        # sequence_length에는 28개의 순서가 있고 각 순서마다 2 * hidden_size만큼 있다.
        # 이중에서 최종값은 마지막 sequence_length의 2 * hidden_size이다.
        # |out| = (batch_size, 2 * hidden_size)
        y = self.layers(out)
        # |out| = (batch_size = 128, output_size = 10)
        return y

# 참고 코드: shape와 slicing 이해

In [6]:
import torch
import torch.nn as nn

data1 = torch.full((minibatch_size, sequence_length, 2 * hidden_size), 1) # 3D tensor 생성
data2 = data1[:, -1]
print(data1.shape, data2.shape)

data3 = torch.full((minibatch_size, 1, sequence_length, feature_size), 1) # 4D tensor 생성
data4 = data3.reshape(-1, sequence_length, feature_size)
print(data3.shape, data4.shape)

torch.Size([128, 28, 256]) torch.Size([128, 256])
torch.Size([128, 1, 28, 28]) torch.Size([128, 28, 28])


In [7]:
model = Net(feature_size, hidden_size, num_layers, dropout_p, output_size, 'rnn')
model

Net(
  (sequenceclassfier): RNN(28, 128, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
  (layers): Sequential(
    (0): LeakyReLU(negative_slope=0.1)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=256, out_features=10, bias=True)
    (3): LogSoftmax(dim=-1)
  )
)

# MNIST with LSTM

In [8]:
train_rawdata = datasets.MNIST(root = 'dataset_MNIST',
                               train = True, # True면 Train 데이터
                               download = True, # 데이터가 없으면 Download
                               transform = transforms.ToTensor()) # raw 포맷을 텐서로 바꿔줌
test_rawdata = datasets.MNIST(root = 'dataset_MNIST',
                               train = False, # False면 Test 데이터
                               download = True, # 데이터가 없으면 Download
                               transform = transforms.ToTensor()) # raw 포맷을 텐서로 바꿔줌
print('number of training data : ', len(train_rawdata))
print('number of test data : ', len(test_rawdata))

number of training data :  60000
number of test data :  10000


In [9]:
VALIDATION_RATE = 0.2
train_indices, val_indices, _, _ = train_test_split(
    range(len(train_rawdata)), # Train 데이터셋의 인덱스 번호 추출(0~59999)
    train_rawdata.targets, # y 정답 라벨
    stratify = train_rawdata.targets, # y 정답 라벨 균등분포
    test_size = VALIDATION_RATE # 여기선 Validation 데이터셋 비율
)

In [10]:
train_dataset = Subset(train_rawdata, train_indices)
validation_dataset = Subset(train_rawdata, val_indices)

In [11]:
print(len(train_dataset), len(validation_dataset), len(test_rawdata))

48000 12000 10000


In [12]:
BATCH_SIZE = 128
train_batchs = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
va_batchs = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_batchs = DataLoader(test_rawdata, batch_size=BATCH_SIZE, shuffle=False)

# input, output, loss, optimizer 설정

In [13]:
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())

# Training & Validation

In [14]:
def train_model(model, early_stop, nb_epochs, progress_interval):
    train_losses, valid_losses, lowest_loss = list(), list(), np.inf

    for epoch in range(nb_epochs):
        train_loss, valid_loss = 0, 0

        # train model
        model.train() # prepare model for training
        for x_minibatch, y_minibatch in train_batchs:
            x_minibatch = x_minibatch.reshape(-1, sequence_length, feature_size)
            y_minibatch_pred = model(x_minibatch)
            loss = loss_func(y_minibatch_pred, y_minibatch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss = train_loss / len(train_batchs)
        train_losses.append(train_loss)

        # validate model
        model.eval()
        with torch.no_grad():
            for x_minibatch, y_minibatch in va_batchs:
                x_minibatch = x_minibatch.reshape(-1, sequence_length, feature_size)
                y_minibatch_pred = model(x_minibatch)
                loss = loss_func(y_minibatch_pred, y_minibatch)
                valid_loss += loss.item()
    
        valid_loss = valid_loss / len(va_batchs)
        valid_losses.append(valid_loss)
    
        if valid_losses[-1] < lowest_loss:
            lowest_loss = valid_losses[-1]
            lowest_epoch = epoch
            best_model = deepcopy(model.state_dict())
        else:
            if (early_stop > 0) and lowest_epoch + early_stop < epoch:
                print("Early Stopped", epoch, "epochs")
                break
    
        if (epoch % progress_interval) == 0:
            print(train_losses[-1], valid_losses[-1], lowest_loss, lowest_epoch, epoch)

    model.load_state_dict(best_model)
    return model, lowest_loss, train_losses, valid_losses

# 훈련 실행

In [15]:
nb_epochs = 30
progress_interval = 3
early_stop = 5

model, lowest_loss, train_losses, valid_losses = train_model(model, early_stop, nb_epochs, progress_interval)

0.560015678524971 0.3442645991577747 0.3442645991577747 0 0
0.154399822473526 0.15432094045458955 0.15432094045458955 3 3
0.12376958649853866 0.13316993526321777 0.10539618796332086 5 6
0.11104683033873637 0.10592339677300225 0.09711978968946224 7 9
0.0937984434440732 0.1223241393355296 0.09711978968946224 7 12
Early Stopped 13 epochs


# 테스트셋 기반 Evaluation

In [18]:
test_loss = 0
correct = 0
wrong_samples, wrong_preds, actual_preds = list(), list(), list()

model.eval()
with torch.no_grad():
    for x_minibatch, y_minibatch in test_batchs:
        x_minibatch = x_minibatch.reshape(-1, sequence_length, feature_size)
        y_test_pred = model(x_minibatch)
        test_loss += loss_func(y_test_pred, y_minibatch)
        pred = torch.argmax(y_test_pred, dim=1)
        correct += pred.eq(y_minibatch).sum().item()

        wrong_idx = pred.ne(y_minibatch).nonzero()[:, 0].numpy().tolist()
        for index in wrong_idx:
            wrong_samples.append(x_minibatch[index])
            wrong_preds.append(pred[index])
            actual_preds.append(y_minibatch[index])

test_loss /= len(test_batchs.dataset)
print('Average Test Loss: {:.4f}'.format(test_loss))
print('Accuracy: {}/{} ({:.2f}%)'.format(correct, len(test_batchs.dataset), 100*correct/len(test_batchs.dataset)))

Average Test Loss: 0.0007
Accuracy: 9726/10000 (97.26%)
