## Single Cell

Where:
- $x_t$: input at time $t$
- $h_t$: hidden state at time $t$
- $y_t$: output at time $t$
- $W_{ih}$: input → hidden weights
- $W_{hh}$: hidden → hidden weights
- $W_{ho}$: hidden → output weights
- $b$, $c$: bias vectors of the hidden and output layers

$$
h_t = \tanh(W_{ih} \cdot x_t + W_{hh} \cdot h_{t-1} + b)\tag{input for next state}
$$

$$
y_t= W_{ho} \cdot h_t + c\tag{output}
$$

In [None]:
import torch
import torch.nn.functional as F

# Hyperparameters
input_size = 50   # vocabulary size (length of each one-hot input vector)
hidden_size = 128  # dimensionality of the hidden state h_t
output_size = 50  # same as vocab size
seq_length = 10
batch_size = 1

# Parameters with gradient tracking.
# The weights are scaled by 0.01 FIRST and only then marked with
# requires_grad_(): writing `torch.randn(..., requires_grad=True) * 0.01`
# would make each weight the *result* of a multiplication, i.e. a non-leaf
# tensor, and autograd does not populate `.grad` on non-leaf tensors.
W_ih = (torch.randn(input_size, hidden_size) * 0.01).requires_grad_()    # input -> hidden
W_hh = (torch.randn(hidden_size, hidden_size) * 0.01).requires_grad_()   # hidden -> hidden
b_h = torch.zeros(hidden_size, requires_grad=True)                       # hidden bias

W_ho = (torch.randn(hidden_size, output_size) * 0.01).requires_grad_()   # hidden -> output
b_o = torch.zeros(output_size, requires_grad=True)                       # output bias

## Forward Pass

In [None]:
# Random input sequence of one-hot vectors, shape (seq_length, input_size):
# one random token id per time step, expanded to a one-hot float row.
token_ids = torch.randint(0, input_size, (seq_length,))
x_seq = F.one_hot(token_ids, num_classes=input_size).float()

# The hidden state starts at zero and is carried from one step to the next.
h_t = torch.zeros(hidden_size)

# Unroll the cell over time, keeping the output produced at every step.
outputs = []
for t, x_t in enumerate(x_seq):
    pre_activation = x_t @ W_ih + h_t @ W_hh + b_h
    h_t = torch.tanh(pre_activation)
    y_t = h_t @ W_ho + b_o
    outputs.append(y_t)

# Per-step outputs stacked into one tensor of shape (seq_length, output_size)
logits = torch.stack(outputs)

## Loss Calculation

In [None]:
# Random target class index for every time step (e.g. next char).
# Targets index the class dimension of `logits`, whose size is output_size,
# so they are sampled from [0, output_size) rather than [0, input_size);
# the two only coincide while both sizes are equal.
target = torch.randint(0, output_size, (seq_length,))
loss = F.cross_entropy(logits, target)

# Backpropagation: fills `.grad` on every leaf tensor that requires grad
# and contributed to `loss`.
loss.backward()

## Optimizer

In [None]:
lr = 0.01
# One step of plain gradient descent: param <- param - lr * grad.
# The update runs under torch.no_grad() so the in-place subtraction is not
# recorded by autograd; this replaces mutating `param.data`, which bypasses
# autograd's bookkeeping. Each parameter must be a leaf tensor, otherwise
# backward() leaves its `.grad` as None.
with torch.no_grad():
    for param in [W_ih, W_hh, b_h, W_ho, b_o]:
        param -= lr * param.grad
        # Gradients accumulate across backward() calls, so reset them.
        param.grad.zero_()