In [15]:
# Recurrent Neural Network 
import sys
sys.path.append('..')
import numpy as np 
from activation.tanh import Tanh  
from activation.Softmax import softmax

In [16]:
def initialize_params(n_features, n_hidden, n_output):
    """Create the initial weights and biases of the RNN.

    Every weight matrix is drawn from a standard normal distribution and
    divided by the square root of its fan-in (its number of columns); the
    original code labels this "Xavier initialization". Biases start at zero.

    Args:
        n_features: size of one input vector.
        n_hidden: size of the hidden state.
        n_output: size of the output vector.

    Returns:
        Tuple (Wxh, Whh, Why, bh, by) with shapes
        (n_hidden, n_features), (n_hidden, n_hidden), (n_output, n_hidden),
        (n_hidden, 1) and (n_output, 1).
    """
    def _scaled_normal(rows, fan_in):
        # Standard-normal entries scaled by 1 / sqrt(fan_in).
        return np.random.randn(rows, fan_in) / np.sqrt(fan_in)

    # Draw order (Wxh, Whh, Why) matters for reproducibility under a fixed seed.
    input_to_hidden = _scaled_normal(n_hidden, n_features)
    hidden_to_hidden = _scaled_normal(n_hidden, n_hidden)
    hidden_to_output = _scaled_normal(n_output, n_hidden)

    hidden_bias = np.zeros((n_hidden, 1))
    output_bias = np.zeros((n_output, 1))

    return input_to_hidden, hidden_to_hidden, hidden_to_output, hidden_bias, output_bias

In [17]:
def forward(X, Wxh, Whh, Why, bh, by):
    """Forward pass of the many-to-one RNN.

    Starting from an all-zero hidden state, each row of X is turned into a
    column vector and combined with the previous hidden state; the result is
    passed through ``Tanh().forward``. Only the last hidden state is fed to
    the output layer, whose result is passed through ``softmax``.

    Args:
        X: input sequence indexed by timestep along axis 0.
        Wxh, Whh, Why: input-to-hidden, hidden-to-hidden and hidden-to-output
            weight matrices.
        bh, by: hidden and output biases.

    Returns:
        Tuple (y, hs): the output of ``softmax`` for the final step and the
        list of hidden states, one per timestep.

    Note:
        The sequence must contain at least one timestep; ``hs[-1]`` is read
        unconditionally and raises IndexError on an empty sequence.
    """
    n_hidden = Whh.shape[0]
    state = np.zeros((n_hidden, 1))  # hidden state before the first step
    hs = []

    for step in range(X.shape[0]):
        column = X[step].reshape(-1, 1)
        pre_activation = np.dot(Wxh, column) + np.dot(Whh, state) + bh
        state = Tanh().forward(pre_activation)
        hs.append(state)

    # Many-to-one: only the final hidden state reaches the output layer.
    logits = np.dot(Why, hs[-1]) + by
    return softmax(logits), hs

In [18]:
def clip_gradients(dWxh, dWhh, dWhy, dbh, dby, clip_value=5):
    """Clip every gradient element-wise to [-clip_value, clip_value].

    The arrays are modified in place (``out=`` points back at the input), and
    the very same objects are returned for convenient unpacking.

    Args:
        dWxh, dWhh, dWhy, dbh, dby: gradient arrays to clip.
        clip_value: symmetric bound applied to each element (default 5).

    Returns:
        Tuple (dWxh, dWhh, dWhy, dbh, dby) of the clipped input arrays.
    """
    gradients = (dWxh, dWhh, dWhy, dbh, dby)
    lower, upper = -clip_value, clip_value
    for grad in gradients:
        np.clip(grad, lower, upper, out=grad)
    return gradients


In [19]:
def backward(X, y_true, y_pred, hs, Wxh, Whh, Why, bh):
    """
    Backpropagation through time (BPTT) for the many-to-one RNN.

    The forward pass feeds only the last hidden state into the output layer,
    so the output error enters the recurrent chain exactly once, at the final
    timestep, and reaches earlier timesteps solely through Whh.

    Args:
        X: input sequence of shape (seq_len, input_size)
        y_true: true output (output_size, 1)
        y_pred: predicted output (output_size, 1)
        hs: list of hidden states, each of shape (hidden_size, 1)
        Wxh, Whh, Why: weight matrices
        bh: hidden bias (hidden_size, 1)

    Returns:
        Gradients: dWxh, dWhh, dWhy, dbh, dby, each clipped element-wise to
        [-5, 5] by clip_gradients.
    """
    seq_len = X.shape[0]

    # Error at the output layer.
    # NOTE(review): y_pred - y_true is the gradient w.r.t. the logits only for
    # softmax combined with cross-entropy loss - confirm the loss in use.
    dy = y_pred - y_true
    dWhy = np.dot(dy, hs[-1].T)
    dby = dy

    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dbh = np.zeros_like(bh)

    # Gradient w.r.t. the hidden state of the step being processed. It is
    # seeded ONCE with the output layer's contribution; adding Why.T @ dy again
    # at every step would treat the model as if it emitted (and was penalised
    # for) an output at each timestep.
    dh = np.dot(Why.T, dy)

    for t in reversed(range(seq_len)):
        # NOTE(review): Tanh.derivative receives the activation *output* hs[t];
        # this assumes it evaluates 1 - h**2 from the output - confirm against
        # activation.tanh.
        dh_raw = Tanh.derivative(hs[t]) * dh

        dbh += dh_raw
        dWxh += np.dot(dh_raw, X[t].reshape(1, -1))  # (hidden, input)
        if t > 0:
            # The state preceding step 0 is all zeros in forward(), so step 0
            # contributes nothing to dWhh.
            dWhh += np.dot(dh_raw, hs[t - 1].T)

        # Propagate to the previous timestep through the recurrent weights.
        dh = np.dot(Whh.T, dh_raw)

    # Clip gradients to avoid exploding gradients
    dWxh, dWhh, dWhy, dbh, dby = clip_gradients(dWxh, dWhh, dWhy, dbh, dby, clip_value=5)
    return dWxh, dWhh, dWhy, dbh, dby


In [20]:
def update_params(Wxh, Whh, Why, bh, by, dWxh, dWhh, dWhy, dbh, dby, eta, clip_value=5):
    """Apply one gradient-descent step to all parameters.

    The gradients are first clipped element-wise to [-clip_value, clip_value]
    (in place, so the caller's gradient arrays are modified), then each
    parameter is decreased by ``eta`` times its gradient using in-place
    subtraction.

    Args:
        Wxh, Whh, Why, bh, by: parameters to update.
        dWxh, dWhh, dWhy, dbh, dby: matching gradients.
        eta: learning rate.
        clip_value: symmetric clipping bound for the gradients (default 5).

    Returns:
        Tuple (Wxh, Whh, Why, bh, by) of the updated parameters.
    """
    parameters = (Wxh, Whh, Why, bh, by)
    gradients = (dWxh, dWhh, dWhy, dbh, dby)

    # Phase 1: clip gradients to avoid exploding gradients.
    lower, upper = -clip_value, clip_value
    for grad in gradients:
        np.clip(grad, lower, upper, out=grad)

    # Phase 2: descend along each (now clipped) gradient.
    updated = []
    for param, grad in zip(parameters, gradients):
        param -= eta * grad
        updated.append(param)

    return tuple(updated)


In [21]:
def sgd(X, y_true, Wxh, Whh, Why, bh, by, eta):
    """Perform one stochastic-gradient-descent step on a single sequence.

    Chains the three stages defined in this notebook: ``forward`` to get the
    prediction and hidden states, ``backward`` to get the gradients, and
    ``update_params`` to apply them with learning rate ``eta``.

    Args:
        X: input sequence for this training example.
        y_true: target output for this example.
        Wxh, Whh, Why, bh, by: current parameters.
        eta: learning rate.

    Returns:
        Tuple (Wxh, Whh, Why, bh, by) as returned by ``update_params``.
    """
    params = (Wxh, Whh, Why, bh, by)

    y_pred, hs = forward(X, *params)
    grads = backward(X, y_true, y_pred, hs, Wxh, Whh, Why, bh)

    return tuple(update_params(*params, *grads, eta))

In [22]:
def predict(X, Wxh, Whh, Why, bh, by):
    """Return the network's prediction for one sequence.

    Runs ``forward`` with the given parameters and keeps only its first
    result (the prediction), discarding the hidden states.

    Args:
        X: input sequence.
        Wxh, Whh, Why, bh, by: model parameters.

    Returns:
        The prediction produced by ``forward``.
    """
    outputs = forward(X, Wxh, Whh, Why, bh, by)
    return outputs[0]