### Attention 구현

* Attention()
    * AttentionLayer()
        * AttentionWeight()
        * WeightSum()




<img src="https://github.com/LeeHyeJin91/PapersWithCode/assets/43728746/d8ba99ae-c5a4-4a22-acc9-97df3e6d0ea2" width=800 height=500 />

In [2]:
import numpy as np

In [3]:
class AttentionWeight:
    """Attention weights of one decoder state over all encoder states.

    The score of each encoder time step is the dot product between that
    step's hidden state and the decoder hidden state; the scores are then
    passed through ``SoftmaxLayer`` to obtain the weights.
    """

    def __init__(self):
        # No trainable parameters; the empty lists keep the layer interface
        # (params / grads) uniform with the other layers in this file.
        self.params, self.grads = [], []
        self.softmax = SoftmaxLayer()
        self.cache = None

    def forward(self, hs, h):
        """Compute attention weights.

        Args:
            hs: encoder hidden states, shape (N, T, H).
            h: decoder hidden state for one time step, shape (N, H).

        Returns:
            Attention weights ``a`` of shape (N, T).
        """
        N, T, H = hs.shape
        hr = h.reshape(N, 1, H).repeat(T, axis=1)  # (N, T, H)

        t = hs * hr                                # (N, T, H)
        s = np.sum(t, axis=2)                      # (N, T) dot-product scores
        a = self.softmax.forward(s)                # (N, T)

        # Keep the repeated decoder state: backward needs it at (N, T, H).
        self.cache = (hs, hr)

        return a

    def backward(self, da):
        """Back-propagate through the weight computation.

        Args:
            da: gradient w.r.t. the attention weights, shape (N, T).

        Returns:
            Tuple ``(dhs, dh)`` with shapes (N, T, H) and (N, H).
        """
        hs, hr = self.cache
        N, T, H = hs.shape

        ds = self.softmax.backward(da)             # (N, T)
        # Backward of the sum over H is a repeat along H. The gradient to
        # repeat is ``ds`` (the score gradient), not the forward scores.
        dt = ds.reshape(N, T, 1).repeat(H, axis=2)  # (N, T, H)
        dhs = dt * hr                               # (N, T, H)
        # Backward of the repeat along T is a sum over T.
        dh = np.sum(dt * hs, axis=1)                # (N, H)

        return dhs, dh

class SoftmaxLayer:
    """Row-wise softmax (normalised along axis 1) with a cached output."""

    def __init__(self):
        # Parameter-free layer; the empty lists keep the common interface.
        self.out = None
        self.params, self.grads = [], []

    def forward(self, x):
        """Return softmax of ``x`` along axis 1 and remember the result."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # the exponentials from overflowing.
        shifted = x - x.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.out = exps / exps.sum(axis=1, keepdims=True)
        return self.out

    def backward(self, dout):
        """Return the gradient w.r.t. the input of the last ``forward``."""
        weighted = self.out * dout
        row_total = weighted.sum(axis=1, keepdims=True)
        return weighted - self.out * row_total


In [4]:
class WeightSum:
    """Weighted sum of encoder hidden states (the context vector)."""

    def __init__(self):
        # Parameter-free layer; the empty lists keep the common interface.
        self.cache = None
        self.params, self.grads = [], []

    def forward(self, hs, a):
        """Combine hidden states with attention weights.

        Args:
            hs: encoder hidden states, shape (N, T, H).
            a: attention weights, shape (N, T).

        Returns:
            Context vector ``c`` of shape (N, H).
        """
        batch, steps, width = hs.shape
        # Spread each weight across the hidden dimension: (N, T) -> (N, T, H).
        weights = np.repeat(a.reshape(batch, steps, 1), width, axis=2)
        context = (hs * weights).sum(axis=1)  # (N, H)

        self.cache = (hs, weights)

        return context

    def backward(self, dc):
        """Back-propagate through the weighted sum.

        Args:
            dc: gradient w.r.t. the context vector, shape (N, H).

        Returns:
            Tuple ``(dhs, da)`` with shapes (N, T, H) and (N, T).
        """
        hs, weights = self.cache
        batch, steps, width = hs.shape

        # Backward of the sum over T is a repeat along T.
        upstream = np.repeat(dc.reshape(batch, 1, width), steps, axis=1)  # (N, T, H)
        grad_hs = upstream * weights                                     # (N, T, H)
        # Backward of the repeat along H is a sum over H.
        grad_a = (upstream * hs).sum(axis=2)                             # (N, T)

        return grad_hs, grad_a
        

In [5]:
class AttentionLayer:
    """Attention for a single decoder time step.

    Chains ``AttentionWeight`` (scores -> weights) and ``WeightSum``
    (weights -> context vector).
    """

    def __init__(self):
        # No trainable parameters; the empty lists keep the layer interface.
        self.params, self.grads = [], []
        self.attention_weight_layer = AttentionWeight()
        self.weight_sum_layer = WeightSum()
        # Weights from the most recent forward pass, shape (N, T).
        self.attention_weight = None

    def forward(self, hs, h):
        """Compute the context vector for one decoder step.

        Args:
            hs: encoder hidden states, shape (N, T, H).
            h: decoder hidden state, shape (N, H).

        Returns:
            Context vector ``c`` of shape (N, H).
        """
        a = self.attention_weight_layer.forward(hs, h)  # (N, T)
        c = self.weight_sum_layer.forward(hs, a)        # (N, H)
        self.attention_weight = a                       # (N, T)

        return c

    def backward(self, dc):
        """Back-propagate through both sub-layers.

        Args:
            dc: gradient w.r.t. the context vector, shape (N, H).

        Returns:
            Tuple ``(dhs, dh)`` with shapes (N, T, H) and (N, H).
        """
        # Run the sub-layers in reverse order of the forward pass.
        dhs0, da = self.weight_sum_layer.backward(dc)         # (N, T, H) (N, T)
        dhs1, dh = self.attention_weight_layer.backward(da)   # (N, T, H) (N, H)
        # hs feeds both sub-layers, so its gradients add up.
        dhs = dhs0 + dhs1                                     # (N, T, H)

        return dhs, dh
    
class Attention:
    """Attention over a whole decoder sequence.

    Creates one ``AttentionLayer`` per decoder time step during ``forward``
    and replays them in ``backward``.
    """

    def __init__(self):
        # No trainable parameters; the empty lists keep the layer interface.
        self.params, self.grads = [], []
        self.layers = []
        self.attention_weight = []

    def forward(self, hs_enc, hs_dec):
        """Compute a context vector for every decoder time step.

        Args:
            hs_enc: encoder hidden states, shape (N, T1, H).
            hs_dec: decoder hidden states, shape (N, T2, H).

        Returns:
            Context vectors ``cs`` of shape (N, T2, H), dtype float32.
        """
        N, T2, H = hs_dec.shape
        cs = np.empty((N, T2, H), dtype='f')

        # Start every forward pass from a clean slate. Without this the lists
        # keep growing across calls and backward() would index the layers
        # (and their caches) left over from the first call.
        self.layers = []
        self.attention_weight = []

        for t in range(T2):
            layer = AttentionLayer()
            c = layer.forward(hs_enc, hs_dec[:, t, :])            # (N, H)
            cs[:, t, :] = c
            # Collected per step; overall layout is (T2, N, T1).
            self.attention_weight.append(layer.attention_weight)
            self.layers.append(layer)

        return cs

    def backward(self, dcs):
        """Back-propagate through every per-step attention layer.

        Args:
            dcs: gradient w.r.t. the context vectors, shape (N, T2, H).

        Returns:
            Tuple ``(dhs_enc, dhs_dec)`` with shapes (N, T1, H) and
            (N, T2, H). ``dhs_enc`` stays the integer 0 when T2 == 0.
        """
        N, T2, H = dcs.shape
        # The encoder states are shared by all steps, so gradients accumulate.
        dhs_enc = 0
        dhs_dec = np.empty((N, T2, H), dtype='f')

        for t in range(T2):
            layer = self.layers[t]
            dhs, dh = layer.backward(dcs[:, t, :])  # (N, T1, H) (N, H)

            dhs_enc += dhs
            dhs_dec[:, t, :] = dh

        return dhs_enc, dhs_dec
    

### Seq2Seq 구현 

     
* **AttentionSeq2Seq()**
    * Encoder()
        * Embedding()
        * LSTM()
    * Decoder()
        * Embedding()
        * LSTM() 
        * Attention()
        * Affine()

<img src="https://github.com/LeeHyeJin91/PapersWithCode/assets/43728746/62502786-4bed-4366-8ce3-f4a36a286760" width=900 height=500 />

#### decoder 상세구조

<img src="https://github.com/LeeHyeJin91/PapersWithCode/assets/43728746/62b4f081-85b3-4935-a070-7527fd1ac1cc" width=500 height=500 />

In [6]:
import numpy as np
from LSTM import Embedding, LSTM, Affine, Softmax, CEloss

In [7]:
class Encoder:
    """Encode token-id sequences into per-step hidden states.

    Pipeline: ``Embedding`` -> ``LSTM`` (constructed with ``stateful=False``).
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the embedding and LSTM layers with random initial weights.

        Args:
            vocab_size: number of distinct tokens (V).
            wordvec_size: embedding dimension (D).
            hidden_size: LSTM hidden dimension (H).
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        # NOTE(review): np.random.rand samples uniformly from [0, 1), so all
        # initial weights are non-negative. The alias name `rn` suggests
        # np.random.randn may have been intended -- confirm.
        rn = np.random.rand

        W = (rn(V, D) / 100).astype('f')
        Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        b = np.zeros(4 * H).astype('f')

        self.embd = Embedding(W)
        self.lstm = LSTM(Wx, Wh, b, stateful=False)

        self.params = self.embd.params + self.lstm.params
        self.grads = self.embd.grads + self.lstm.grads

    def forward(self, x):
        """Run the encoder.

        Args:
            x: token ids, shape (N, T1).

        Returns:
            Hidden states ``hs``; expected shape (N, T1, H).
        """
        xs = self.embd.forward(x)   # (N, T1, D)
        # The LSTM consumes the embedded vectors, not the raw token ids.
        hs = self.lstm.forward(xs)  # (N, T1, H)

        return hs

    def backward(self, dhs):
        """Back-propagate through the LSTM and the embedding.

        Args:
            dhs: gradient w.r.t. the hidden states, shape (N, T1, H).

        Returns:
            Whatever ``Embedding.backward`` returns.
        """
        dxs = self.lstm.backward(dhs)   # presumably (N, T1, D) -- verify against LSTM
        # Presumably accumulates the embedding-matrix gradient -- verify.
        dout = self.embd.backward(dxs)

        return dout
    

In [8]:
class Decoder:
    """Decoder with attention.

    Pipeline: ``Embedding`` -> ``LSTM`` (``stateful=True``) -> ``Attention``
    -> concatenate [context, hidden] -> ``Affine``.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the decoder layers with random initial weights.

        Args:
            vocab_size: number of distinct tokens (V).
            wordvec_size: embedding dimension (D).
            hidden_size: LSTM hidden dimension (H).
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        # NOTE(review): np.random.rand samples uniformly from [0, 1), so all
        # initial weights are non-negative. The alias name `rn` suggests
        # np.random.randn may have been intended -- confirm.
        rn = np.random.rand

        W = (rn(V, D) / 100).astype('f')
        Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        b = np.zeros(4 * H).astype('f')
        # The affine layer receives the context vector concatenated with the
        # LSTM hidden state (see forward), so its input width is 2H, not H.
        Wa = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        ba = np.zeros(V).astype('f')

        self.embd = Embedding(W)
        self.lstm = LSTM(Wx, Wh, b, stateful=True)
        self.attention = Attention()
        self.affine = Affine(Wa, ba)

        layers = [self.embd, self.lstm, self.attention, self.affine]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, t, hs_enc):
        """Run the decoder.

        Args:
            t: decoder input token ids, shape (N, T2).
            hs_enc: encoder hidden states, shape (N, T1, H).

        Returns:
            Output of the affine layer; expected shape (N, T2, V).
        """
        # Seed the LSTM with the encoder's last hidden state.
        h = hs_enc[:, -1, :]
        self.lstm.set_state(h)

        xs = self.embd.forward(t)                        # (N, T2, D)
        hs_dec = self.lstm.forward(xs)                   # (N, T2, H)
        # Attention defines forward() but no __call__, so call it explicitly.
        _cs = self.attention.forward(hs_enc, hs_dec)     # (N, T2, H)

        cs = np.concatenate((_cs, hs_dec), axis=2)       # (N, T2, 2H)
        a = self.affine.forward(cs)                      # (N, T2, V)

        return a

    def backward(self, da):
        """Back-propagate through the decoder.

        Args:
            da: gradient w.r.t. the affine output, shape (N, T2, V).

        Returns:
            Gradient w.r.t. the encoder hidden states, shape (N, T1, H).
        """
        dcs = self.affine.backward(da)                    # (N, T2, 2H)
        N, T2, H2 = dcs.shape
        H = H2 // 2
        # Undo the concatenation: first half is the context vector, second
        # half is the LSTM hidden state.
        dcs0, dcs1 = dcs[:, :, :H], dcs[:, :, H:]         # (N, T2, H) (N, T2, H)

        dhs_enc, dhs_dec = self.attention.backward(dcs0)  # (N, T1, H) (N, T2, H)
        # hs_dec feeds both attention and the concatenation, so gradients add.
        dhs = dhs_dec + dcs1                              # (N, T2, H)
        dxs = self.lstm.backward(dhs)                     # (N, T2, D)
        # Presumably accumulates the embedding-matrix gradient -- verify.
        dout = self.embd.backward(dxs)

        # The initial LSTM state was the encoder's last hidden state, so its
        # gradient flows back into the last encoder time step.
        dh = self.lstm.dh                                 # (N, H)
        dhs_enc[:, -1] += dh                              # (N, T1, H)

        return dhs_enc
        

In [9]:
class AttentionSeq2seq:
    """Sequence-to-sequence model: ``Encoder`` + attention ``Decoder`` + loss."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the encoder, decoder and loss layers.

        Args:
            vocab_size: number of distinct tokens (V).
            wordvec_size: embedding dimension (D).
            hidden_size: LSTM hidden dimension (H).
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = Decoder(V, D, H)
        self.softmax = Softmax()
        # The class imported from the LSTM module is spelled ``CEloss``.
        self.loss = CEloss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, x, t):
        """Compute the training loss for one batch.

        Args:
            x: source token ids, shape (N, T1).
            t: target token ids, shape (N, T2).

        Returns:
            The value returned by ``CEloss.forward``.
        """
        # Teacher forcing: the decoder sees t[:, :-1] and is scored against
        # t[:, 1:], so both have length T2 - 1.
        decoder_x = t[:, :-1]
        decoder_t = t[:, 1:]

        hs_enc = self.encoder.forward(x)             # (N, T1, H)
        a = self.decoder.forward(decoder_x, hs_enc)  # (N, T2 - 1, V)
        # NOTE(review): Softmax/CEloss come from the LSTM module; their exact
        # input/output shapes are not visible here -- confirm.
        y = self.softmax.forward(a, decoder_t)
        loss = self.loss.forward(y)

        return loss

    def backward(self, dloss=1):
        """Back-propagate from the loss to the encoder input.

        Args:
            dloss: upstream gradient of the loss (defaults to 1).

        Returns:
            Whatever ``Encoder.backward`` returns.
        """
        dy = self.loss.backward(dloss)
        da = self.softmax.backward(dy)         # gradient w.r.t. decoder output
        dhs_enc = self.decoder.backward(da)    # (N, T1, H)
        dout = self.encoder.backward(dhs_enc)

        return dout
    