# ***Engr.Muhammad Javed***

# 02. Forward and Backward Propagation in RNN

## Forward Propagation
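As seen in the previous notebook, forward propagation simply loops over the time steps, updating the hidden state at each one: $h_t = \tanh(W_{xh} x_t + W_{hh} h_{t-1} + b_h)$, followed by the output $y_t = W_{hy} h_t + b_y$.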

## Backpropagation Through Time (BPTT)
Training an RNN is similar to training a traditional neural network, but with a twist. Because the parameters are shared across all time steps, the gradient of the loss at each time step depends not only on the computations at that step but also on those at all previous time steps.

This is called **Backpropagation Through Time (BPTT)**. It's essentially unrolling the RNN for all time steps and then using standard backpropagation.

### Loss Function
The total loss $L$ is the sum of the losses at each time step $t$:

$$L = \sum_t L_t$$

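where $L_t$ is typically the cross-entropy loss for classification, i.e. $L_t = -\log p_t[y_t^{*}]$: the negative log-probability that the softmax output $p_t$ assigns to the correct class $y_t^{*}$ at step $t$.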

In [1]:
import numpy as np

def softmax(x):
    """Return the softmax of ``x`` computed along axis 0.

    Each column of ``x`` (or the whole vector when ``x`` is 1-D) is turned
    into a probability distribution that sums to 1.

    Args:
        x: Array of scores; axis 0 indexes the classes.

    Returns:
        Array of the same shape as ``x`` holding the probabilities.
    """
    # Shift by the maximum along the SAME axis that is normalised. Using the
    # global maximum instead lets a column that sits far below it underflow
    # to all zeros, which then yields 0/0 = NaN for that column.
    shifted = x - np.max(x, axis=0)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0)

# RNN Forward and Backward Pass Implementation
class RNN:
    """Single-layer vanilla RNN with a softmax output over integer tokens.

    For every time step ``t`` the model computes::

        h_t = tanh(Wxh @ x_t + Whh @ h_{t-1} + bh)
        y_t = Why @ h_t + by
        p_t = softmax(y_t)

    where ``x_t`` is the one-hot column vector of the input token at ``t``.
    """

    def __init__(self, vocab_size, hidden_size, seq_length, learning_rate):
        """Store the hyper-parameters and initialise the model parameters.

        Args:
            vocab_size: Number of distinct tokens (input and output size).
            hidden_size: Number of hidden units.
            seq_length: Stored on the instance; not used inside this class.
            learning_rate: Stored on the instance; not used inside this class.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.learning_rate = learning_rate

        # Model parameters: small random weights, zero biases.
        self.Wxh = np.random.randn(hidden_size, vocab_size) * 0.01   # input  -> hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
        self.Why = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden -> output
        self.bh = np.zeros((hidden_size, 1))  # hidden bias
        self.by = np.zeros((vocab_size, 1))   # output bias

    def lossFun(self, inputs, targets, hprev):
        """Run a forward and a backward (BPTT) pass over one sequence.

        Args:
            inputs: Sequence of integer token indices, one per time step.
            targets: Sequence of integer target indices; ``targets[t]`` is
                the correct class for step ``t``. Must be at least as long
                as ``inputs`` (a shorter one raises ``IndexError``).
            hprev: Initial hidden state, expected to be a
                ``(hidden_size, 1)`` column vector. It is copied, not
                modified.

        Returns:
            Tuple ``(loss, dWxh, dWhh, dWhy, dbh, dby, h_last)`` where
            ``loss`` is the summed cross-entropy over all steps, the ``d*``
            arrays are the gradients (each clipped element-wise to
            ``[-5, 5]``) and ``h_last`` is the hidden state after the final
            step. For an empty ``inputs`` the loss is ``0.0``, all gradients
            are zero and ``h_last`` is a copy of ``hprev``.
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        loss = 0.0

        # Forward pass: unroll the network and accumulate the loss.
        for t, token in enumerate(inputs):
            xs[t] = np.zeros((self.vocab_size, 1))
            xs[t][token] = 1  # one-hot encoding of the input token
            hs[t] = np.tanh(
                np.dot(self.Wxh, xs[t]) + np.dot(self.Whh, hs[t - 1]) + self.bh
            )
            ys[t] = np.dot(self.Why, hs[t]) + self.by
            ps[t] = softmax(ys[t])
            loss += -np.log(ps[t][targets[t], 0])  # cross-entropy at step t

        # Backward pass: gradient accumulators, one per parameter.
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)
        # Sized from ``bh`` (same (hidden_size, 1) shape as a hidden state)
        # rather than ``hs[0]``, which does not exist for an empty sequence.
        dhnext = np.zeros_like(self.bh)

        for t in reversed(range(len(inputs))):
            # Gradient of softmax + cross-entropy w.r.t. the logits: p - onehot.
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1
            dWhy += np.dot(dy, hs[t].T)
            dby += dy
            # Hidden state receives gradient from the output at t and from t+1.
            dh = np.dot(self.Why.T, dy) + dhnext
            dhraw = (1 - hs[t] * hs[t]) * dh  # back through tanh
            dbh += dhraw
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t - 1].T)
            dhnext = np.dot(self.Whh.T, dhraw)  # passed on to step t-1

        # Gradient clipping (in place) to limit exploding gradients.
        for dparam in (dWxh, dWhh, dWhy, dbh, dby):
            np.clip(dparam, -5, 5, out=dparam)

        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]

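This code demonstrates the core mechanics of how gradients flow back through the time steps: the backward pass iterates over `reversed(range(len(inputs)))`, and `dhnext` carries the hidden-state gradient from step $t+1$ back to step $t$.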