In [None]:
import numpy as np

def sigmoid(input, deriv=False):
    """Logistic sigmoid, or its derivative written in terms of the activation.

    With ``deriv`` falsy, returns ``1 / (1 + exp(-input))``.
    With ``deriv`` truthy, returns ``input * (1 - input)``; this equals the
    sigmoid derivative only when ``input`` already holds sigmoid outputs.
    """
    if not deriv:
        denominator = 1 + np.exp(-input)
        return 1 / denominator
    # Derivative branch: the argument is used as s = sigmoid(x), giving s * (1 - s).
    complement = 1 - input
    return input * complement

def tanh(input, deriv=False):
    """Hyperbolic tangent, or its derivative written in terms of the activation.

    With ``deriv`` falsy, returns ``np.tanh(input)``.
    With ``deriv`` truthy, returns ``1 - input ** 2``; this equals the tanh
    derivative only when ``input`` already holds tanh outputs.
    """
    if not deriv:
        return np.tanh(input)
    # Derivative branch: the argument is used as a = tanh(x), giving 1 - a^2.
    squared = input ** 2
    return 1 - squared

# Derivative is directly calculated in backprop (in combination with cross-entropy loss function).
def softmax(input):
    """Normalise ``input`` into values that sum to one over the whole array.

    The global maximum is subtracted before exponentiating so that large
    entries do not overflow ``np.exp``; the shift cancels in the ratio.
    """
    shifted = input - np.max(input)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum()
    return exponentials / normaliser


class GRUModel:
    """Single-layer GRU over one-hot inputs with a softmax read-out.

    Holds the update-gate (``z``), reset-gate (``r``), candidate-state
    (``h``) and output (``y``) parameters, plus one accumulator per
    parameter (``md*``) intended for AdaGrad. Only parameter initialisation
    and the forward pass are implemented; the remaining methods are
    placeholders that do nothing.
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        """Store the layer sizes and allocate parameters and accumulators.

        Args:
            vocab_size: Length of each one-hot input column.
            hidden_size: Number of hidden units.
            output_size: Number of output units fed to the softmax.
        """
        self.vocab_size = vocab_size
        self.h_size = hidden_size  # hidden layer size
        self.o_size = output_size  # output size
        self.learning_rate = 1e-1

        # Model parameters
        self.init_parameters()

        # AdaGrad memory
        self.init_adagrad()

    def init_parameters(self):
        """Draw all weights uniformly from [-0.05, 0.05) and zero all biases."""

        def small_uniform(rows, cols):
            # np.random.rand samples [0, 1); scale and shift to [-0.05, 0.05).
            return np.random.rand(rows, cols) * 0.1 - 0.05

        # The draw order below is kept fixed so that a seeded global RNG
        # reproduces the same parameters as before.

        # Update gate
        self.Wz = small_uniform(self.h_size, self.vocab_size)
        self.Uz = small_uniform(self.h_size, self.h_size)
        self.bz = np.zeros((self.h_size, 1))

        # Reset gate
        self.Wr = small_uniform(self.h_size, self.vocab_size)
        self.Ur = small_uniform(self.h_size, self.h_size)
        self.br = np.zeros((self.h_size, 1))

        # Candidate hidden state
        self.Wh = small_uniform(self.h_size, self.vocab_size)
        self.Uh = small_uniform(self.h_size, self.h_size)
        self.bh = np.zeros((self.h_size, 1))

        # Output layer
        self.Wy = small_uniform(self.o_size, self.h_size)
        self.by = np.zeros((self.o_size, 1))

    def init_adagrad(self):
        """Create one zero-filled accumulator per parameter, matching its shape."""
        self.mdWy = np.zeros_like(self.Wy)
        self.mdWh = np.zeros_like(self.Wh)
        self.mdWr = np.zeros_like(self.Wr)
        self.mdWz = np.zeros_like(self.Wz)
        self.mdUh = np.zeros_like(self.Uh)
        self.mdUr = np.zeros_like(self.Ur)
        self.mdUz = np.zeros_like(self.Uz)
        self.mdby = np.zeros_like(self.by)
        self.mdbh = np.zeros_like(self.bh)
        self.mdbr = np.zeros_like(self.br)
        self.mdbz = np.zeros_like(self.bz)

    def forward_pass(self, inputs, hprev):
        """Run the GRU over every sequence in ``inputs``.

        Each sequence starts from ``hprev``, is consumed one index at a time,
        and yields a single output computed from its final hidden state. An
        empty sequence yields the output for ``hprev`` itself.

        Args:
            inputs: Iterable of sequences. Every element of a sequence is
                used as the row index that is set to 1 in a one-hot column
                of length ``vocab_size``.
            hprev: Initial hidden state used as ``h[-1]`` for every sequence
                (assumed to be shaped ``(hidden_size, 1)`` like the biases;
                confirm against the caller).

        Returns:
            Tuple ``(x, z, r, h_hat, h, y, p)`` of dictionaries. ``x``, ``z``,
            ``r``, ``h_hat`` and ``h`` are keyed by timestep and hold the
            values of the last sequence processed (``h[-1]`` is ``hprev``).
            ``y`` (pre-softmax outputs) and ``p`` (softmax probabilities) are
            keyed by the position of the sequence within ``inputs``.
        """
        # Defined up front so the return value is well-formed for empty `inputs`.
        x, z, r, h_hat, h = {}, {}, {}, {}, {-1: hprev}
        y, p = {}, {}

        for seq_idx, sequence in enumerate(inputs):
            # Fresh per-timestep caches: a shorter sequence must not inherit
            # stale timesteps from a longer one processed earlier.
            x, z, r, h_hat, h = {}, {}, {}, {}, {-1: hprev}
            h_final = hprev  # stays hprev when the sequence is empty

            for t, token in enumerate(sequence):
                # Set up one-hot encoded input
                x[t] = np.zeros((self.vocab_size, 1))
                x[t][token] = 1

                # Calculate update and reset gates
                z[t] = sigmoid(np.dot(self.Wz, x[t]) + np.dot(self.Uz, h[t - 1]) + self.bz)
                r[t] = sigmoid(np.dot(self.Wr, x[t]) + np.dot(self.Ur, h[t - 1]) + self.br)

                # Candidate state sees the previous state scaled by the reset gate.
                h_hat[t] = tanh(
                    np.dot(self.Wh, x[t])
                    + np.dot(self.Uh, np.multiply(r[t], h[t - 1]))
                    + self.bh
                )
                # Update gate blends the previous state with the candidate.
                h[t] = np.multiply(z[t], h[t - 1]) + np.multiply((1 - z[t]), h_hat[t])
                h_final = h[t]

            # One output per sequence, stored under that sequence's own index.
            y[seq_idx] = np.dot(self.Wy, h_final) + self.by

            # Probability distribution
            p[seq_idx] = softmax(y[seq_idx])

        return x, z, r, h_hat, h, y, p

    def calculate_loss(self, targets, outputs):
        """Calculate and return the loss. Not implemented yet; does nothing."""
        pass

    def backward_pass(self, targets, dhnext):
        """Perform the backward pass and calculate gradients. Not implemented yet; does nothing."""
        pass

    def update_parameters(self):
        """Update model parameters with gradients using AdaGrad. Not implemented yet; does nothing."""
        pass

    def reset_state(self):
        """Reset the hidden state. Not implemented yet; does nothing."""
        pass

    def generate_text(self, seed_ix, n):
        """Generate text starting from a seed index. Not implemented yet; does nothing."""
        pass

    def train(self, data, char_to_ix, n_iters=1000):
        """Training loop. Not implemented yet; does nothing."""
        pass