In [None]:
import numpy as np

class VanillaRNN:
    """Single-layer (Elman) recurrent network with a softmax read-out.

    The recurrence is ``h_t = tanh(W_hh @ h_{t-1} + W_xh @ x_t + b_h)`` and
    the per-step prediction is ``y_t = softmax(W_hy @ h_t + b_o)``. Training
    uses back-propagation through time on the categorical cross-entropy.

    Sequences are time-major: ``inputs`` holds one row of ``input_size``
    values per time step and ``targets`` one row of ``output_size`` values
    (e.g. one-hot labels) per time step. The hidden state is reset to zero
    at the start of every sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of hidden units.
        output_size: Number of output classes.
        optimizer: ``"GD"``, ``"AdaGrad"`` or ``"RMSprop"``.
    """

    _EPSILON = 1e-8  # keeps the adaptive optimizers' denominators non-zero
    _RMSPROP_DECAY = 0.9  # weight of the running squared-gradient average
    _OPTIMIZERS = ("GD", "AdaGrad", "RMSprop")

    def __init__(self, input_size, hidden_size, output_size, optimizer="GD"):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.optimizer = optimizer

        # initializing parameters
        self.W_xh = np.random.randn(self.hidden_size, self.input_size)
        self.W_hh = np.random.randn(self.hidden_size, self.hidden_size)
        self.W_hy = np.random.randn(self.output_size, self.hidden_size)
        self.b_h = np.zeros((self.hidden_size, 1))
        self.b_o = np.zeros((self.output_size, 1))
        # Retained public attribute; every sequence starts from a zero state.
        self.hidden_state = np.zeros((self.hidden_size, 1))

        # Per-parameter squared-gradient accumulators for AdaGrad / RMSprop,
        # created on first use and keyed by parameter attribute name.
        self._grad_cache = {}

    @staticmethod
    def _as_steps(values, width, name):
        """Return ``values`` as a float array of shape ``(steps, width)``.

        Raises:
            ValueError: If a time step does not hold exactly ``width`` values.
        """
        arr = np.asarray(values, dtype=float)
        if arr.ndim == 0 or arr.size != arr.shape[0] * width:
            raise ValueError(
                f"{name} must hold {width} values per time step, "
                f"got shape {arr.shape}"
            )
        return arr.reshape(arr.shape[0], width)

    @staticmethod
    def _softmax(scores):
        """Row-wise softmax; the row maximum is subtracted to avoid overflow."""
        shifted = scores - scores.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)

    def _run(self, inputs):
        """Run the recurrence once and keep everything BPTT needs.

        Returns:
            ``(xs, hs, ys)`` where ``xs`` is ``(T, input_size)``, ``hs`` is
            ``(T + 1, hidden_size)`` with ``hs[0]`` the zero initial state and
            ``hs[t + 1]`` the state after step ``t``, and ``ys`` is the
            ``(T, output_size)`` matrix of softmax probabilities.
        """
        xs = self._as_steps(inputs, self.input_size, "inputs")
        steps = xs.shape[0]
        hs = np.zeros((steps + 1, self.hidden_size))
        bias_h = self.b_h.ravel()
        for t in range(steps):
            hs[t + 1] = self.tanh(self.W_hh @ hs[t] + self.W_xh @ xs[t] + bias_h)
        # The read-out has no recurrence, so it is applied to all steps at once.
        scores = hs[1:] @ self.W_hy.T + self.b_o.ravel()
        return xs, hs, self._softmax(scores)

    def _bptt(self, xs, hs, ys, targets):
        """Back-propagate the cross-entropy through a cached forward pass."""
        # Gradient of softmax + cross-entropy w.r.t. the pre-softmax scores.
        dys = ys - targets
        dW_hy = dys.T @ hs[1:]
        db_o = dys.sum(axis=0).reshape(self.b_o.shape)

        dW_xh = np.zeros_like(self.W_xh)
        dW_hh = np.zeros_like(self.W_hh)
        db_h = np.zeros_like(self.b_h)
        dh_next = np.zeros(self.hidden_size)

        for t in reversed(range(xs.shape[0])):
            dh = self.W_hy.T @ dys[t] + dh_next
            # tanh'(a_t) expressed through the stored activation h_t.
            dh_raw = (1.0 - hs[t + 1] ** 2) * dh
            db_h += dh_raw.reshape(db_h.shape)
            dW_xh += np.outer(dh_raw, xs[t])
            # W_hh multiplies the *previous* state, hence hs[t], not hs[t + 1].
            dW_hh += np.outer(dh_raw, hs[t])
            dh_next = self.W_hh.T @ dh_raw

        return dW_xh, dW_hh, dW_hy, db_o, db_h

    def forward(self, inputs):
        """Compute class probabilities for every step of one sequence.

        Args:
            inputs: Array-like with ``input_size`` values per time step.

        Returns:
            Array of shape ``(len(inputs), output_size)``; each row is a
            softmax distribution, which is what ``loss`` and ``backward``
            expect.
        """
        return self._run(inputs)[2]

    def backward(self, inputs, targets):
        """Compute parameter gradients of the cross-entropy for one sequence.

        Args:
            inputs: Array-like with ``input_size`` values per time step.
            targets: Array-like with ``output_size`` values per time step;
                each row should sum to one (e.g. one-hot labels).

        Returns:
            ``(dW_xh, dW_hh, dW_hy, db_o, db_h)``, each shaped like the
            parameter it belongs to.

        Raises:
            ValueError: If inputs and targets differ in length or width.
        """
        xs, hs, ys = self._run(inputs)
        targets = self._as_steps(targets, self.output_size, "targets")
        if targets.shape[0] != xs.shape[0]:
            raise ValueError("inputs and targets must have the same number of steps")
        return self._bptt(xs, hs, ys, targets)

    def update_parameters(self, dW_xh, dW_hh, dW_hy, db_o, db_h, learning_rate=0.01):
        """Apply one in-place optimizer step to all parameters.

        ``"GD"`` is plain gradient descent. ``"AdaGrad"`` divides by the root
        of the running *sum* of squared gradients, ``"RMSprop"`` by the root
        of their exponential moving *average* (decay 0.9).

        Raises:
            ValueError: If ``self.optimizer`` is not a supported name.
        """
        if self.optimizer not in self._OPTIMIZERS:
            raise ValueError(
                f"Unknown optimizer {self.optimizer!r}; "
                f"expected one of {self._OPTIMIZERS}"
            )

        gradients = {
            "W_xh": dW_xh,
            "W_hh": dW_hh,
            "W_hy": dW_hy,
            "b_h": db_h,
            "b_o": db_o,
        }
        for name, grad in gradients.items():
            param = getattr(self, name)
            grad = np.asarray(grad, dtype=float)
            if self.optimizer == "GD":
                step = grad
            else:
                cache = self._grad_cache.setdefault(name, np.zeros_like(param))
                squared = np.square(grad)
                if self.optimizer == "AdaGrad":
                    cache += squared
                else:  # RMSprop
                    cache *= self._RMSPROP_DECAY
                    cache += (1.0 - self._RMSPROP_DECAY) * squared
                step = grad / (np.sqrt(cache) + self._EPSILON)
            param -= learning_rate * step

    @staticmethod
    def tanh(x):
        """Hyperbolic tangent; ``np.tanh`` stays finite for large ``|x|``."""
        return np.tanh(x)

    @staticmethod
    def dtanh(x):
        """Derivative of tanh evaluated at the pre-activation ``x``."""
        return 1 - np.tanh(x) ** 2

    def loss(self, predicted, target):
        """Summed categorical cross-entropy between probabilities and targets.

        Args:
            predicted: Predicted probabilities, normally ``forward``'s output.
            target: Target distribution with the same shape. A ``predicted``
                array in the transposed orientation is transposed to match.

        Returns:
            ``-sum(target * log(predicted + 1e-7))``.
        """
        predicted = np.asarray(predicted, dtype=float)
        target = np.asarray(target, dtype=float)
        if predicted.shape != target.shape and predicted.T.shape == target.shape:
            predicted = predicted.T
        # 1e-7 keeps log() finite when a predicted probability is exactly 0.
        return -np.sum(target * np.log(predicted + 1e-7))

    def fit(self, inputs, targets, learning_rate=0.01, epochs=100, batch_size=1):
        """Train on consecutive chunks of ``batch_size`` time steps.

        Each chunk is treated as one sequence starting from a zero hidden
        state; parameters are updated after every chunk. The average chunk
        loss is printed every 10 epochs.

        Returns:
            List with the average chunk loss of each epoch.

        Raises:
            ValueError: If ``batch_size < 1`` or inputs/targets differ in
                length.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(inputs) != len(targets):
            raise ValueError("inputs and targets must have the same length")

        history = []
        for epoch in range(epochs):
            total_loss = 0.0
            n_batches = 0
            for start in range(0, len(inputs), batch_size):
                stop = start + batch_size
                xs, hs, ys = self._run(inputs[start:stop])
                batch_targets = self._as_steps(
                    targets[start:stop], self.output_size, "targets"
                )
                total_loss += self.loss(ys, batch_targets)
                gradients = self._bptt(xs, hs, ys, batch_targets)
                self.update_parameters(*gradients, learning_rate)
                n_batches += 1

            # max(..., 1) avoids dividing by zero when there is no data.
            average_loss = total_loss / max(n_batches, 1)
            history.append(float(average_loss))

            if (epoch + 1) % 10 == 0:
                print(f"Epoch: {epoch+1}, Loss: {average_loss}")

        return history

    def predict(self, inputs, targets):
        """Run a forward pass, print the test loss and return the predictions."""
        y = self.forward(inputs)
        loss = self.loss(y, targets)
        print(f"Test Loss: {loss}")
        return y
