Pytorch Implementation of the RNN

In [92]:
import torch
import torch.nn as nn
import torch.optim as optim

# ----------------------------------------------------
# 1. Data Preparation
# ----------------------------------------------------
text = """To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles,
And by opposing end them."""

# Make lowercase and build character vocabulary
chars = sorted(list(set(text.lower())))
vocab_size = len(chars)

# char to index mappings
char_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_char = { i:ch for i,ch in enumerate(chars) }

# Encode the text as indices
data = torch.tensor([char_to_idx[ch] for ch in text.lower()], dtype=torch.long)

# ----------------------------------------------------
# 2. Hyperparameters
# ----------------------------------------------------
embed_size = 16
hidden_size = 64
lr = 0.01
epochs = 500

# ----------------------------------------------------
# 3. Model
# ----------------------------------------------------
class CharRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(CharRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.i2h = nn.Linear(embed_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, vocab_size)
        self.tanh = nn.Tanh()
        
    def forward(self, x, hidden):
        emb = self.embed(x)
        combined = torch.cat((emb, hidden), dim=1)
        hidden = self.tanh(self.i2h(combined))
        output = self.h2o(hidden)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, hidden_size)

model = CharRNN(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# ----------------------------------------------------
# 4. Training
# ----------------------------------------------------
seq_length = 30  # number of characters to unroll

for epoch in range(1, epochs+1):
    hidden = model.init_hidden()
    total_loss = 0
    
    # run through data in steps of seq_length
    for i in range(0, len(data) - seq_length):
        inputs = data[i:i+seq_length]
        targets = data[i+1:i+seq_length+1]
        
        optimizer.zero_grad()
        
        loss = 0
        for j in range(seq_length):
            output, hidden = model(inputs[j].unsqueeze(0), hidden)
            hidden = hidden.detach()
            loss += criterion(output, targets[j].unsqueeze(0))
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item() / seq_length
    
    if epoch % 50 == 0 or epoch == 1:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# ----------------------------------------------------
# 5. Sampling (text generation)
# ----------------------------------------------------
def sample(start_str="to ", length=200):
    model.eval()
    hidden = model.init_hidden()
    input_char = torch.tensor([char_to_idx[ch] for ch in start_str], dtype=torch.long)
    result = start_str
    
    # warm-up on the initial string
    for ch in input_char:
        output, hidden = model(ch.unsqueeze(0), hidden)
    
    idx = torch.multinomial(torch.softmax(output, dim=1), 1).item()
    result += idx_to_char[idx]
    
    for _ in range(length):
        output, hidden = model(torch.tensor([[idx]]), hidden)
        idx = torch.multinomial(torch.softmax(output, dim=1), 1).item()
        result += idx_to_char[idx]
    
    return result

print("\nSample generation:\n")
print(sample("to ", 300))


Epoch 1, Loss: 233.7712


KeyboardInterrupt: 

Using embedding for practice instead of one-hot.

In [84]:
class RNN(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, x):
        x, _ = self.rnn(x)
        return self.fc(x)

In [89]:
hidden_size = 64
output_size = len(l)
vocab_size = 22
sequence_length = 20

lr = 0.0005
epochs = 250

model = RNN(vocab_size, hidden_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [86]:
#loss.backward()
#torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
#optimizer.step()

In [90]:
for epoch in range(epochs):
    total_loss = 0
    count = 0
    for xb, yb in loader:
        logits = model(xb)
        logits = logits.reshape(20, 22)

        optimizer.zero_grad()
        loss = loss_fn(logits, yb.reshape(20))
        total_loss += loss
        count += 1
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
    print(f"Epoch {epoch} loss: {total_loss / count}")


Epoch 0 loss: 3.0964248180389404
Epoch 1 loss: 3.0840961933135986
Epoch 2 loss: 3.072006940841675
Epoch 3 loss: 3.060065269470215
Epoch 4 loss: 3.0481855869293213
Epoch 5 loss: 3.036295175552368
Epoch 6 loss: 3.0243287086486816
Epoch 7 loss: 3.0122263431549072
Epoch 8 loss: 2.999936580657959
Epoch 9 loss: 2.9874114990234375
Epoch 10 loss: 2.974623203277588
Epoch 11 loss: 2.9615511894226074
Epoch 12 loss: 2.948195457458496
Epoch 13 loss: 2.9345884323120117
Epoch 14 loss: 2.9207844734191895
Epoch 15 loss: 2.906874418258667
Epoch 16 loss: 2.8929929733276367
Epoch 17 loss: 2.8792922496795654
Epoch 18 loss: 2.8659517765045166
Epoch 19 loss: 2.8531391620635986
Epoch 20 loss: 2.841012954711914
Epoch 21 loss: 2.8296446800231934
Epoch 22 loss: 2.8190829753875732
Epoch 23 loss: 2.8093295097351074
Epoch 24 loss: 2.8003342151641846
Epoch 25 loss: 2.7920169830322266
Epoch 26 loss: 2.7842965126037598
Epoch 27 loss: 2.777127981185913
Epoch 28 loss: 2.7704594135284424
Epoch 29 loss: 2.7642486095428467

In [None]:
import numpy as np

wx = np.random.rand(64, 22) *0.01   # (64, 22)
x = np.random.rand(22,)
(x @ wx.T).shape

(64,)