# Addictive attention(Bahdanau Attention)
- 1.Production encoder hidden state: encoder의 각 time step 마다의 hidden state
- 2.Calculating alignment score: decoder의 이전 hidden state와 encoder의 hidden state를 통해 alignment score를 계산
- 3.Softmaxing the alignment score: alignment score vector에 softmax 함수를 적용하여 normalized alignment score를 계산
- 4.Calculating the **context vector**: normalized alignment score와 encoder의 hidden state를 곱하여(multiply) 계산
- 5.Decoding output: context vector와 **이전** decoder output을 concatenate 하여 현재 decoder의 input을 만들고, 이전 decoder hidden state와 함께 인풋하여 new output을 만든다

# Reference
- https://blog.floydhub.com/attention-mechanism/

In [37]:
import torch
import torch.nn as nn

# Encoder

In [117]:
class EncoderLSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, n_layers=1, drop_prob=0):
        super(EncoderLSTM, self).__init__()
        
        # Parameters
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # Layers
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=n_layers, dropout=drop_prob, batch_first=True)
        
    def forward(self, inputs, hidden):
        # Embed input words
        embedded = self.embedding(inputs)                      # embedded: batch x seq_len x emb_size
        
        # Pass the embedded word vectors into LSTM and return all outputs
        output, (last_hidden, last_cell) = self.lstm(embedded) # output: batch x seq_len x hidden_size 
                                                               # last_hidden: num_layer x batch_size x hidden_size 
                                                               # last_cell: num_layer  x batch_size x hidden_size
        return output, (last_hidden, last_cell)
    
    def init_hidden(self, batch_size):
        return (torch.zeros(self.n_layers, batch_size, self.hidden_size),
                 torch.zeros(self.n_layers, batch_size, self.hidden_size))        

In [194]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, drop_prob=0):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers, dropout=drop_prob, batch_first=True)

    def forward(self, inputs, hidden):
        # Embed input words
        embedded = self.embedding(inputs)
        # Pass the embedded word vectors into LSTM and return all outputs
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def init_hidden(self, batch_size=1):
        return (torch.zeros(self.n_layers, batch_size, self.hidden_size),
                torch.zeros(self.n_layers, batch_size, self.hidden_size))

In [195]:
batch_size = 2
vocab_size = 10
emb_size = 5
hidden_size = 6

In [192]:
encoder = EncoderLSTM(vocab_size=vocab_size, emb_size=emb_size, hidden_size=hidden_size)

TypeError: __init__() got an unexpected keyword argument 'vocab_size'

In [193]:
hiddens = encoder.init_hidden(batch_size=batch_size)

NameError: name 'device' is not defined

In [203]:
input_size = 2
hidden_size = 3

In [204]:
encoder = EncoderLSTM(input_size, hidden_size)

In [205]:
hiddens = encoder.init_hidden()
hiddens

(tensor([[[0., 0., 0.]]]), tensor([[[0., 0., 0.]]]))

In [206]:
inputs = torch.tensor([[0, 1]])

In [207]:
encoder_outputs, hiddens = encoder(inputs, hiddens)

In [208]:
encoder_outputs.size()

torch.Size([1, 2, 3])

# BahdanauDecoder

In [178]:
class BahdanauDecoder(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, n_layers=1, drop_prob=0.1):
        super(BahdanauDecoder, self).__init__()
        
        # Parameters
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop_prob = drop_prob
        
        # Layers
        self.embedding = nn.Embedding(self.output_size, self.emb_size)
        
        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)  # 
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight = nn.Parameter(torch.FloatTensor(1, self.hidden_size))
        self.attn_combine = nn.Linear(self.hidden_size*2, self.hidden_size)
        self.dropout = nn.Dropout(self.drop_prob)
        self.lstm = nn.LSTM(input_size=self.emb_size + self.hidden_size, hidden_size=self.hidden_size, batch_first=True)
        self.classifier = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, inputs, hiddens, encoder_outputs):
        encoder_outputs = encoder_outputs.squeeze()
        
        # Embed input words
        embedded = self.embedding(inputs)         # embedded: batch x seq_len x embed_size
        embedded = embedded.view(1, -1)           # embedded: 1, batch x seq_len x embed_size
        embedded = self.dropout(embedded)         # embedded: 1, batch x seq_len x embed_size
        
        # Calculating alignment scores
        dec_fc = self.fc_hidden(hiddens[0])       # dec_fc: num_layers x batch x hidden_size
        enc_fc = self.fc_encoder(encoder_outputs) # enc_fc: batch x seq_len x hidden_size
        
        return dec_fc + enc_fc
        
#         return dec_fc, enc_fc

In [179]:
decoder = BahdanauDecoder(emb_size, hidden_size, output_size=vocab_size)

In [180]:
decoder(inputs, hiddens, encoder_outputs)[1].size()

RuntimeError: The size of tensor a (2) must match the size of tensor b (5) at non-singleton dimension 1