In [1]:
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l_torch

In [2]:
batch_size, num_steps = 32, 35
train_iter, vocab = d2l_torch.load_data_time_machine(batch_size=batch_size, num_steps=num_steps)

In [3]:
# Create layers params
def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size

    def normal3(num_inputs, num_hiddens):
        return torch.randn(size=(num_inputs, num_hiddens), device=device) * 0.01, torch.randn(size=(num_hiddens, num_hiddens), device=device) * 0.01, torch.zeros(num_hiddens, device=device)

    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden Layer Params
    ## Input gate params
    W_xi, W_hi, b_i = normal3(num_inputs, num_hiddens)
    ## Forget gate params
    W_xf, W_hf, b_f = normal3(num_inputs, num_hiddens)
    ## Output gate params
    W_xo, W_ho, b_o = normal3(num_inputs, num_hiddens)
    ## Candidate memory weight params
    W_xc, W_hc, b_c = normal3(num_inputs, num_hiddens)
    # Output Layer Params
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    # Attach gradient for params
    params = [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

In [4]:
# State = (Hidden state H, Candidate memory C)
def init_state(batch_size, num_hiddens, device):
    return (torch.zeros(size=(batch_size, num_hiddens), device=device), torch.zeros(size=(batch_size, num_hiddens), device=device))

In [5]:
def lstm_forward(inputs, state, params):
    # Unpacking params
    W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q = params
    # Unpacking state
    H, C = state
    # outputs
    outputs = []
    for X in inputs:
        # Run X and H through gates
        I = torch.sigmoid((X @ W_xi) + (H @ W_hi) + b_i)
        F = torch.sigmoid((X @ W_xf) + (H @ W_hf) + b_f)
        O = torch.sigmoid((X @ W_xo) + (H @ W_ho) + b_o)
        C_candidate = torch.tanh((X @ W_xc) + (H @ W_hc) + b_c)
        # Update state
        C = F * C + I * C_candidate
        H = O * torch.tanh(C)
        # Run H through output layer to get Y
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H, C)

In [6]:
class LSTM:
    def __init__(self, forward_fn, init_state, num_hiddens, vocab_size, device) -> None:
        self.params = get_params(vocab_size, num_hiddens, device=device)
        self.forward_fn = forward_fn
        self.init_state = init_state
        self.num_hiddens = num_hiddens
        self.vocab_size = vocab_size
    
    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device=device)

In [7]:
def grad_clipping(net, theta):
    params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad**2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

In [8]:
def train_epoch(net, train_iter, loss, updater, device):
    state = None
    metric = d2l_torch.Accumulator(2)
    for X, Y in train_iter:
        if state is None:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            for s in state:
              s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        Y_hat, state = net(X, state)
        l = loss(Y_hat, y.long()).mean()
        l.backward()
        grad_clipping(net, 1)
        updater(batch_size=1)
        metric.add(l*y.numel(), y.numel())
    return math.exp(metric[0]/metric[1])

In [9]:
def train_lstm(net, train_iter, num_epochs, learning_rate):
    loss = nn.CrossEntropyLoss()
    updater = lambda batch_size: d2l_torch.sgd(net.params, learning_rate, batch_size)
    for epoch in range(num_epochs):
        perplexity = train_epoch(net, train_iter, loss, updater, d2l_torch.try_gpu())
        if(epoch % 50 == 0):
            print(f"Epoch: {epoch:d}/{num_epochs:d}| Perplexity: {perplexity:.2f}")
    print(f"Perplexity: {perplexity:.2f}")

In [10]:
def predict(net, prefix, num_preds, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))
    for y in prefix[1:]:  # Warm-up period
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [11]:
num_epochs, lr = 1000, 0.1
num_hiddens = 512

In [12]:
net = LSTM(forward_fn=lstm_forward, 
            init_state=init_state, 
            vocab_size=len(vocab), 
            num_hiddens=num_hiddens, 
            device=d2l_torch.try_gpu())

In [13]:
train_lstm(net, train_iter=train_iter, num_epochs=num_epochs, learning_rate=lr)

Epoch: 0/1000| Perplexity: 27.63
Epoch: 50/1000| Perplexity: 18.40
Epoch: 100/1000| Perplexity: 17.75
Epoch: 150/1000| Perplexity: 17.46
Epoch: 200/1000| Perplexity: 17.15
Epoch: 250/1000| Perplexity: 16.72
Epoch: 300/1000| Perplexity: 16.16
Epoch: 350/1000| Perplexity: 15.59
Epoch: 400/1000| Perplexity: 15.02
Epoch: 450/1000| Perplexity: 14.33
Epoch: 500/1000| Perplexity: 13.49
Epoch: 550/1000| Perplexity: 12.36
Epoch: 600/1000| Perplexity: 11.86
Epoch: 650/1000| Perplexity: 11.43
Epoch: 700/1000| Perplexity: 11.01
Epoch: 750/1000| Perplexity: 10.75
Epoch: 800/1000| Perplexity: 10.60
Epoch: 850/1000| Perplexity: 10.10
Epoch: 900/1000| Perplexity: 9.93
Epoch: 950/1000| Perplexity: 9.65
Perplexity: 9.45


With LSTM, loss reduction is slow, for it has lots of different params.

In [14]:
predict(net, "time traveller ", 20, d2l_torch.try_gpu())

'time traveller and and and and and '

In [16]:
net1 = LSTM(forward_fn=lstm_forward, 
            init_state=init_state, 
            vocab_size=len(vocab), 
            num_hiddens=num_hiddens, 
            device=d2l_torch.try_gpu())

In [17]:
train_lstm(net1, train_iter=train_iter, num_epochs=700, learning_rate=1)

Epoch: 0/700| Perplexity: 25.00
Epoch: 50/700| Perplexity: 13.66
Epoch: 100/700| Perplexity: 10.35
Epoch: 150/700| Perplexity: 8.92
Epoch: 200/700| Perplexity: 7.57
Epoch: 250/700| Perplexity: 6.29
Epoch: 300/700| Perplexity: 4.85
Epoch: 350/700| Perplexity: 3.25
Epoch: 400/700| Perplexity: 1.86
Epoch: 450/700| Perplexity: 1.25
Epoch: 500/700| Perplexity: 1.10
Epoch: 550/700| Perplexity: 1.05
Epoch: 600/700| Perplexity: 1.05
Epoch: 650/700| Perplexity: 1.04
Perplexity: 1.04


In [19]:
predict(net1, "time traveller ", 100, d2l_torch.try_gpu())

'time traveller for so it will be convenient to speak of himwas expounding a recondite matter to us his grey eyes sh'

Pretty good

Let's try Adam on this model to see if we can train faster

In [56]:
def init_adam_states(feature_dim):
    v_w, v_b = torch.zeros(size=(feature_dim, 1), device=d2l_torch.try_gpu()), torch.zeros(1, device=d2l_torch.try_gpu())
    s_w, s_b = torch.zeros(size=(feature_dim, 1), device=d2l_torch.try_gpu()), torch.zeros(1, device=d2l_torch.try_gpu())
    return ((v_w, s_w), (v_b, s_b))

In [60]:
class Adam:
    def __init__(self, params, hyperparams):
        self.states = init_adam_states(num_hiddens)
        self.params = params
        self.hyperparams = hyperparams

    def __call__(self, batch_size):
        beta1, beta2, eps = 0.9, 0.999, 1e-6
        for p, (v, s) in zip(self.params, self.states):
            with torch.no_grad():
                v[:] = beta1 * v + (1 - beta1) * p.grad
                s[:] = beta2 * s + (1 - beta2) * torch.square(p.grad)
                v_bias_corr = v / (1 - beta1 ** self.hyperparams['t'])
                s_bias_corr = s / (1 - beta2 ** self.hyperparams['t'])
                p[:] -= self.hyperparams['lr'] * v_bias_corr / (torch.sqrt(s_bias_corr)+ eps)
            p.grad.data.zero_()
        self.hyperparams['t'] += 1

In [61]:
def train_lstm_with_adam(net, train_iter, num_epochs, learning_rate):
    loss = nn.CrossEntropyLoss()
    updater = Adam(net.params, hyperparams = {'lr': 0.1, 't': 1})
    for epoch in range(num_epochs):
        perplexity = train_epoch(net, train_iter, loss, updater, d2l_torch.try_gpu())
        if(epoch % 50 == 0):
            print(f"Epoch: {epoch:d}/{num_epochs:d}| Perplexity: {perplexity:.2f}")
    print(f"Perplexity: {perplexity:.2f}")

In [30]:
net2 = net1 = LSTM(forward_fn=lstm_forward, 
            init_state=init_state, 
            vocab_size=len(vocab), 
            num_hiddens=num_hiddens, 
            device=d2l_torch.try_gpu())

In [62]:
train_lstm_with_adam(net2, train_iter, num_epochs, learning_rate=0.1)

RuntimeError: The size of tensor a (512) must match the size of tensor b (28) at non-singleton dimension 0