## LSTM 구조

<img src="https://github.com/LeeHyeJin91/PapersWithCode/assets/43728746/0cb19aef-8f61-41d3-a4ff-dc7cc8f67b75" width=500 height=500 />

## LSTM 계산그래프

<img src="https://github.com/LeeHyeJin91/PapersWithCode/assets/43728746/f3eb1195-8db7-443d-a616-41d887cd2436" width=700 height=500 />

<img src="https://github.com/LeeHyeJin91/PapersWithCode/assets/43728746/4a0f20c0-a791-4c6d-8944-68e920e8180c" width=700 height=500 />

### Embedding

In [1]:
import numpy as np

In [2]:
class EmbeddingLayer:
    """Single-time-step embedding lookup.

    Holds one weight matrix ``W`` (annotated as (V, D) in this file) and a
    gradient buffer of the same shape.
    """

    def __init__(self, W):
        self.idx = None
        self.params = [W]  # (V, D)
        self.grads = [np.zeros_like(W)]

    def forward(self, x):
        """Return the rows of ``W`` selected by the index array ``x``.

        ``x`` is remembered so that ``backward`` knows which rows to update.
        """
        self.idx = x
        (weight,) = self.params
        return weight[x]  # (N, D)

    def backward(self, dx):
        """Scatter-add ``dx`` into the gradient buffer; returns ``None``.

        The buffer is zeroed first, then each row of ``dx`` is added to the
        row named by the matching cached index. ``np.add.at`` is used so that
        repeated indices accumulate instead of overwriting each other. The
        update is in place, so ``self.grads[0]`` is the object that changes.
        """
        (grad_w,) = self.grads
        grad_w[...] = 0
        np.add.at(grad_w, self.idx, dx)
        return None

class Embedding:
    """Embedding lookup applied to every time step of a (N, T) index batch.

    One ``EmbeddingLayer`` is created per time step; all of them share the
    same weight matrix ``W`` (annotated as (V, D) in this file).
    """

    def __init__(self, W):
        self.params = [W]  # (V, D)
        self.grads = [np.zeros_like(W)]
        self.W = W
        self.layers = []

    def forward(self, input_x):
        """Look up embeddings for each time step.

        Args:
            input_x: integer index array of shape (N, T).

        Returns:
            float32 array of shape (N, T, D).
        """
        N, T = input_x.shape
        V, D = self.W.shape

        # Start from an empty layer list on every call. Without this the list
        # keeps growing across calls and backward() would index the layers
        # (and cached indices) of the very first forward pass.
        self.layers = []

        x = np.empty((N, T, D), dtype='f')
        for t in range(T):
            layer = EmbeddingLayer(self.W)
            x[:, t, :] = layer.forward(input_x[:, t])  # (N, D)
            self.layers.append(layer)

        return x

    def backward(self, dx):
        """Accumulate the weight gradient over all time steps.

        Args:
            dx: upstream gradient of shape (N, T, D).

        Returns:
            None; the summed gradient is written in place into
            ``self.grads[0]``.
        """
        N, T, D = dx.shape

        grad = 0
        for t in range(T):
            layer = self.layers[t]
            layer.backward(dx[:, t, :])
            grad += layer.grads[0]  # (V, D)

        # In-place write so external references to self.grads[0] stay valid.
        self.grads[0][...] = grad

        return None
    

### LSTM

In [3]:
class LSTMLayer:
    """One LSTM time step (forward and backward).

    The pre-activation matrix has 4H columns that are consumed in the order
    forget gate, candidate (tanh), input gate, output gate.
    """

    def __init__(self, Wx, Wh, b):
        self.cache = None
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(p) for p in (Wx, Wh, b)]

    def forward(self, x, h_prev, c_prev):
        """Compute the next hidden and cell state.

        Shapes per the annotations in this file: Wx (D, 4H), Wh (H, 4H),
        b (4H,). Returns ``(h_next, c_next)`` and caches the intermediates
        needed by ``backward``.
        """
        def squash(z):
            # logistic sigmoid
            return 1 / (1 + np.exp(-z))

        Wx, Wh, b = self.params
        _, H = h_prev.shape
        pre = np.matmul(x, Wx) + np.matmul(h_prev, Wh) + b  # (N, 4H)

        forget = squash(pre[:, :H])
        cand = np.tanh(pre[:, H:2 * H])
        inp = squash(pre[:, 2 * H:3 * H])
        out = squash(pre[:, 3 * H:])

        c_next = forget * c_prev + inp * cand
        h_next = out * np.tanh(c_next)

        self.cache = (x, h_prev, c_prev, forget, cand, inp, out, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        """Back-propagate through one time step.

        Writes dWx, dWh, db in place into ``self.grads`` and returns
        ``(dx, dh_prev, dc_prev)``.
        """
        Wx, Wh, _ = self.params
        x, h_prev, c_prev, forget, cand, inp, out, c_next = self.cache

        tanh_c = np.tanh(c_next)
        # Total gradient reaching the cell state: the direct path plus the
        # path through h_next = out * tanh(c_next).
        d_cell = dc_next + (dh_next * out) * (1 - tanh_c ** 2)

        dc_prev = d_cell * forget

        # Gradients w.r.t. the pre-activations of each gate.
        d_forget = d_cell * c_prev * forget * (1 - forget)
        d_cand = d_cell * inp * (1 - cand ** 2)
        d_inp = d_cell * cand * inp * (1 - inp)
        d_out = dh_next * tanh_c * out * (1 - out)
        d_pre = np.concatenate((d_forget, d_cand, d_inp, d_out), axis=1)  # (N, 4H)

        dWx = np.matmul(x.T, d_pre)       # (D, N) (N, 4H)
        dWh = np.matmul(h_prev.T, d_pre)  # (H, N) (N, 4H)
        db = np.sum(d_pre, axis=0)

        dx = np.matmul(d_pre, Wx.T)       # (N, 4H) (4H, D)
        dh_prev = np.matmul(d_pre, Wh.T)  # (N, 4H) (4H, H)

        for slot, value in zip(self.grads, (dWx, dWh, db)):
            slot[...] = value

        return dx, dh_prev, dc_prev
        
    

In [4]:
class LSTM:
    """LSTM unrolled over T time steps, one ``LSTMLayer`` per step.

    Args:
        Wx, Wh, b: shared weights (annotated as (D, 4H), (H, 4H), (4H,) in
            this file).
        stateful: when True the hidden/cell state is carried from one
            ``forward`` call to the next; when False every ``forward`` call
            starts from a zero state.
    """

    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]

        self.stateful = stateful
        self.h, self.c = None, None
        self.dh = None
        self.layers = []

    def forward(self, xs):
        """Run the LSTM over a whole sequence.

        Args:
            xs: input of shape (N, T, D).

        Returns:
            float32 array of hidden states, shape (N, T, H).
        """
        Wx, Wh, b = self.params
        N, T, D = xs.shape
        H = Wh.shape[0]

        # Rebuild the per-step layers on every call; otherwise the list grows
        # across calls and backward() would use the caches of the first call.
        self.layers = []

        # Zero the state when the layer is NOT stateful or has no state yet.
        # (The original condition lacked the `not`, which reset the state on
        # every call precisely when stateful=True.)
        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if not self.stateful or self.c is None:
            self.c = np.zeros((N, H), dtype='f')

        hs = np.empty((N, T, H), dtype='f')
        for t in range(T):
            layer = LSTMLayer(Wx, Wh, b)
            self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h

            self.layers.append(layer)

        return hs

    def backward(self, dhs):
        """Back-propagate through all time steps of the last forward call.

        Args:
            dhs: upstream gradient of shape (N, T, H).

        Returns:
            float32 array ``dxs`` of shape (N, T, D). The summed weight
            gradients are written in place into ``self.grads`` and the
            gradient flowing out of the first step is stored in ``self.dh``.
        """
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]

        dxs = np.empty((N, T, D), dtype='f')
        # No gradient flows in from beyond the last time step of this block.
        dh, dc = 0, 0
        grads = [0, 0, 0]

        for t in reversed(range(T)):
            layer = self.layers[t]
            # The hidden state at step t feeds both the output (dhs) and the
            # next step (dh), so both gradients are summed.
            dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx

            for i, grad in enumerate(layer.grads):
                grads[i] += grad

        for i, grad in enumerate(grads):
            self.grads[i][...] = grad

        self.dh = dh

        return dxs

    def set_state(self, h, c):
        """Overwrite the stored hidden and cell state."""
        self.h = h
        self.c = c

    def reset_state(self):
        """Forget the stored state; the next forward starts from zeros."""
        self.h = None
        self.c = None
    

### Affine

In [5]:
class Affine:
    """Fully connected layer applied independently to every time step.

    Parameters are ``Wa`` and ``ba`` (annotated as (H, V) and (V,) in this
    file).
    """

    def __init__(self, Wa, ba):
        self.cache = None
        self.params = [Wa, ba]  # (H, V), (V, )
        self.grads = [np.zeros_like(Wa), np.zeros_like(ba)]

    def forward(self, h):
        """Map (N, T, H) hidden states to (N, T, V) scores.

        The batch and time axes are merged for a single matrix product and
        split again afterwards; the flattened input is cached for backward.
        """
        weight, bias = self.params
        N, T, H = h.shape

        flat_h = h.reshape(N * T, -1)             # (NT, H)
        scores = np.matmul(flat_h, weight) + bias  # (NT, V)
        self.cache = flat_h

        return scores.reshape(N, T, -1)           # (N, T, V)

    def backward(self, da):
        """Back-propagate (N, T, V) gradients; returns dh of shape (N, T, H).

        Weight and bias gradients are written in place into ``self.grads``.
        """
        weight, _ = self.params
        N, T, V = da.shape
        flat_h = self.cache                  # (NT, H)
        flat_da = da.reshape(N * T, -1)      # (NT, V)

        self.grads[0][...] = np.matmul(flat_h.T, flat_da)  # (H, NT) (NT, V)
        self.grads[1][...] = np.sum(flat_da, axis=0)       # (V, )

        flat_dh = np.matmul(flat_da, weight.T)             # (NT, V) (V, H)
        return flat_dh.reshape(N, T, -1)                   # (N, T, H)
    

### Softmax

In [6]:
class Softmax:
    """Softmax over the last axis that outputs only the target probability."""

    def __init__(self):
        self.cache = None

    def forward(self, a, label):
        """Return the softmax probability of the correct label per position.

        Args:
            a: scores of shape (N, T, V).
            label: integer targets of shape (N, T).

        Returns:
            Array of shape (N, T) holding p[label] for every position.
        """
        N, T, V = a.shape
        a = a.reshape(N*T, -1)                           # (NT, V)
        label = label.reshape(N*T)                       # (NT, )

        # Softmax; the row maximum is subtracted first for numerical stability.
        a = a - a.max(axis=1, keepdims=True)
        a_exp = np.exp(a)
        a_stm = a_exp / a_exp.sum(axis=1, keepdims=True) # (NT, V)

        # Keep only the probability of the correct label.
        y = a_stm[np.arange(N*T), label]                 # (NT, )
        self.cache = (y, label, a_stm)

        return y.reshape(N, T)

    def backward(self, dy):
        """Back-propagate the gradient of the selected probabilities.

        For y = p[label] the Jacobian is
        ``dy/da_j = y * (1 - y)`` for j == label and ``-y * p_j`` otherwise.
        The upstream gradient ``dy`` is applied to every entry, so scaling
        ``dy`` scales the whole result. For ``dy = -1/y`` this reduces to the
        familiar ``p - onehot``.

        The result is additionally divided by N*T: ``CEloss`` in this file
        averages the loss over N*T in ``forward`` but does not apply that
        factor in its ``backward``, so it is applied here.

        Args:
            dy: upstream gradient of shape (N, T).

        Returns:
            Gradient w.r.t. the scores, shape (N, T, V).
        """
        N, T = dy.shape
        dy = dy.reshape(N*T)                              # (NT, )
        y, label, a_stm = self.cache
        rows = np.arange(N*T)

        # Off-target entries; builds a new array so the cached probabilities
        # are left untouched.
        da = -(dy * y)[:, np.newaxis] * a_stm             # (NT, V)
        # Target entries.
        da[rows, label] = dy * (y * (1 - y))
        da = da / (N*T)

        return da.reshape(N, T, -1)                       # (N, T, V)
    

### Cross Entropy loss

In [7]:
class CEloss:
    """Mean negative log-likelihood of the target probabilities."""

    def __init__(self):
        self.cache = None

    def forward(self, y):
        """Return the average of ``-log(y)`` over all N*T entries.

        ``y`` has shape (N, T); it is cached unchanged for ``backward``.
        """
        N, T = y.shape
        count = N * T
        flat = y.reshape(count)  # (NT, )
        self.cache = y

        total = -np.sum(np.log(flat))
        return total / count

    def backward(self, dloss=1):
        """Return ``dloss * (-1 / y)`` with shape (N, T).

        Note that the 1/(N*T) factor of the mean is not applied here.
        """
        cached = self.cache
        N, T = cached.shape

        flat = cached.reshape(N * T)  # (NT, )
        grad = dloss * (-1 / flat)

        return grad.reshape(N, T)
    

### LSTM model

In [9]:
class LSTMLM:
    """Language model: Embedding -> LSTM -> Affine -> Softmax -> CE loss.

    Args:
        vocab_size: vocabulary size V.
        wordvec_size: embedding dimension D.
        hidden_size: LSTM hidden dimension H.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        # Parameter initialisation (float32).
        W = (rn(V, D) / 100).astype('f')
        Wx = (rn(D, 4*H) / np.sqrt(D)).astype('f')
        Wh = (rn(H, 4*H) / np.sqrt(H)).astype('f')
        b = np.zeros(4*H).astype('f')
        Wa = (rn(H, V) / np.sqrt(H)).astype('f')
        ba = np.zeros(V).astype('f')

        # Layer construction.
        self.layers = [Embedding(W),
                       LSTM(Wx, Wh, b, stateful=True),
                       Affine(Wa, ba)
                      ]
        self.softmax = Softmax()
        self.loss = CEloss()
        self.lstm_layer = self.layers[1]

        # Flat lists holding the very same array objects the layers use.
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, x, label):
        """Run the full model and return the scalar loss.

        ``x`` is passed through every layer in order; ``label`` is consumed
        by the softmax layer.
        """
        for layer in self.layers:
            x = layer.forward(x)

        y = self.softmax.forward(x, label)
        loss = self.loss.forward(y)

        return loss

    def backward(self, dloss=1):
        """Back-propagate from the loss through every layer; returns None."""
        dout = self.loss.backward(dloss)
        dout = self.softmax.backward(dout)

        for layer in reversed(self.layers):
            dout = layer.backward(dout)

        return None

    def reset_state(self):
        """Clear the LSTM's carried hidden/cell state."""
        self.lstm_layer.reset_state()

    def save_params(self, file_name='lstm.pkl'):
        """Pickle the parameter list to ``file_name``."""
        # Imported locally: the file does not import pickle at module level.
        import pickle

        with open(file_name, 'wb') as f:
            pickle.dump(self.params, f)

    def load_params(self, file_name='lstm.pkl'):
        """Load parameters from ``file_name`` into the existing arrays.

        Values are copied in place rather than rebinding ``self.params``:
        the layers hold references to the original arrays, so replacing the
        list would leave the model computing with the old weights.

        Raises:
            ValueError: if the file holds a different number of parameters,
                or (from NumPy) if a shape does not match.
        """
        import pickle

        # NOTE: pickle can execute arbitrary code while loading; only load
        # parameter files from a trusted source.
        with open(file_name, 'rb') as f:
            loaded = pickle.load(f)

        if len(loaded) != len(self.params):
            raise ValueError(
                'expected %d parameter arrays, found %d in %s'
                % (len(self.params), len(loaded), file_name))

        for param, value in zip(self.params, loaded):
            param[...] = value
            