Minimal example of exponential growth of the trajectories.

In [None]:
from fast_deep_rnn.core import *
from fast_deep_rnn.common_core import *
# from fast_deep_rnn.core_v2 import *
from fast_deep_rnn.sort_training import *

from typing import List, Optional, Union, Callable
import numpy as np


In [None]:
class RNNCell(Module):
    """One recurrent step made of two parallel linear branches.

    The step input and the previous hidden state are combined by ``HStack``.
    The combined tensor is fed to two separate ``LinearLayer`` instances,
    the two projections are merged by ``Sum`` and the result goes through
    ``TanhFunction``.
    """

    def __init__(self, state_size: int, hidden_size: int):
        """Build both linear branches and the helper operations.

        Args:
            state_size: first constructor argument of both linear layers
                (presumably the width of the stacked input -- TODO confirm
                against ``LinearLayer``).
            hidden_size: second constructor argument of both linear layers.
        """
        super().__init__()
        # Both branches share the same constructor arguments; only these two
        # layers are registered as parameter holders.
        self.linear1 = LinearLayer(state_size, hidden_size)
        self.linear2 = LinearLayer(state_size, hidden_size)
        self.tanh = TanhFunction()
        self.hstack = HStack()
        self.sum = Sum()
        self.register_parameters([self.linear1, self.linear2])

    def forward(self, x: Tensor, h_t_1: Optional[Tensor] = None):
        """Compute the next hidden state from ``x`` and ``h_t_1``.

        NOTE(review): ``h_t_1`` defaults to ``None`` but is handed to
        ``HStack`` without a guard; the caller visible in this notebook
        (``RNN.forward``) always passes a tensor -- confirm ``HStack``
        accepts ``None`` before relying on the default.
        """
        stacked = self.hstack(x, h_t_1)
        # The same stacked tensor feeds both branches (linear1 first, then
        # linear2), so it has two consumers in the computation.
        merged = self.sum(self.linear1(stacked), self.linear2(stacked))
        return self.tanh(merged)

In [None]:
class RNN(Module):
    """Applies an ``RNNCell`` step by step along the leading axis of the input."""

    def __init__(self, input_size: int, hidden_size: int):
        """Create the cell and the helper operations.

        Args:
            input_size: size of one input step.
            hidden_size: size of the hidden state.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # The cell is built with the sum of both sizes as its state size.
        self.state_size = input_size + hidden_size
        self.rnn = RNNCell(self.state_size, hidden_size)
        self.row = Row()
        self.vstack = VStack()
        self.register_parameters([self.rnn])

    def forward(self, x: Tensor, h_t_1: Optional[Tensor] = None):
        """Run the cell over every step of ``x`` and collect the hidden states.

        Args:
            x: tensor whose shape unpacks into
                ``(seq_len, batch_size, input_size)``.
            h_t_1: optional initial hidden state; when ``None`` a zero tensor
                of shape ``(batch_size, hidden_size)`` is used.

        Returns:
            The accumulator after each step's hidden state, reshaped to
            ``(1, batch_size, hidden_size)``, has been appended with
            ``VStack`` (presumably giving
            ``(seq_len, batch_size, hidden_size)`` -- TODO confirm the
            ``VStack`` axis).
        """
        seq_len, batch_size, _ = x.shape
        step_shape = (1, batch_size, self.hidden_size)

        # Start from an accumulator with zero steps and grow it in the loop.
        history = Tensor(np.zeros((0, batch_size, self.hidden_size)), name="h")
        state = h_t_1
        if state is None:
            state = Tensor(np.zeros((batch_size, self.hidden_size)), name="h_t_1")

        for step in range(seq_len):
            # The cell's forward() is invoked directly, not through __call__.
            state = self.rnn.forward(self.row(x, step), state)
            history = self.vstack(history, state.reshape(step_shape))
        return history

In [None]:
class RecurrentNetwork(Module):
    """Embedding -> RNN -> linear projection back to the vocabulary size."""

    def __init__(self, vocab_size: int, emb_size: int, hidden_size: int):
        """Create the three sub-modules and register them.

        Args:
            vocab_size: passed to ``Embedding`` and used as the output width
                of the final linear layer.
            emb_size: embedding width, also the input size of the RNN.
            hidden_size: hidden-state width of the RNN and input width of
                the final linear layer.
        """
        super().__init__()
        self.emb_size = emb_size
        self.hidden_size = hidden_size

        self.embedding = Embedding(vocab_size, emb_size)
        self.rnn = RNN(emb_size, hidden_size)
        self.linear = LinearLayer(hidden_size, vocab_size)
        # Only the output layer's parameters are passed to xavier_ here.
        xavier_(self.linear.parameters)

        self.register_parameters([self.embedding, self.rnn, self.linear])

    def forward(self, x: Tensor):
        """Return the linear layer's output for every hidden state.

        The RNN output is reshaped to ``(-1, hidden_size)`` before the linear
        layer, so the result has one row per hidden state produced.
        """
        hidden_states = self.rnn(self.embedding(x))
        flat_states = hidden_states.reshape(-1, self.hidden_size)
        return self.linear(flat_states)

In [None]:
# Sizes used to build the model; vocab_size follows the length of `vocab`.
emb_size, hidden_size = 20, 32
vocab_size = len(vocab)

# Passed as the third argument to get_examples() in the cells below.
max_number = 10

model = RecurrentNetwork(
    vocab_size=vocab_size,
    emb_size=emb_size,
    hidden_size=hidden_size,
)
optimizer = SGD(model.parameters, lr=1.0)
loss_function = CrossEntropyLoss()

## Gradient check

In [None]:

# Small batch used by the gradient check below.
seq_len = 2
num_examples = 5

X_val, y_val = get_examples(seq_len, num_examples, max_number)
# Swap the two axes of both arrays (presumably so the sequence axis comes
# first, which is what RNN.forward unpacks -- TODO confirm get_examples' layout).
X_val = X_val.transpose(1, 0)
y_val = y_val.transpose(1, 0)

In [None]:
import time


# Run gradient_checker once per parameter of a freshly built model and report
# how long each check takes (presumably comparing the gradients returned by
# dJ_theta_global with numerical estimates of J_theta -- TODO confirm).
loss_function = CrossEntropyLoss()
model_ = RecurrentNetwork(vocab_size, emb_size, hidden_size)
dJ_theta_tensors = dJ_theta_global(model_, loss_function, Tensor(X_val), Tensor(y_val))

# perf_counter() is monotonic and high-resolution, so the elapsed times cannot
# be skewed by system clock adjustments the way time.time() differences can.
global_start = time.perf_counter()
for i, parameter in enumerate(model_.parameters):
    start = time.perf_counter()
    print(f'[{i}]: Start -- {parameter.__name__}')

    # idx, x and y are bound as defaults so the function keeps the values of
    # this iteration; fresh Tensor wrappers are created for every parameter.
    def J_theta(theta, idx=i, x=Tensor(X_val), y=Tensor(y_val)):
        return J_theta_global(model_, loss_function, theta, idx, x, y)

    gradient_checker(J_theta, dJ_theta_tensors[i], parameter.data)
    print(f'[{i}]: Elapsed time: {time.perf_counter() - start:.1f}s')
print(f'Total elapsed time: {time.perf_counter() - global_start:.1f}s')

[0]: Start -- E


Gradient check passed
[0]: Elapsed time: 0.4s
[1]: Start -- weights
Gradient check passed
[1]: Elapsed time: 3.8s
[2]: Start -- bias
Gradient check passed
[2]: Elapsed time: 0.1s
[3]: Start -- weights
Gradient check passed
[3]: Elapsed time: 2.1s
[4]: Start -- bias
Gradient check passed
[4]: Elapsed time: 0.0s
[5]: Start -- weights
Gradient check passed
[5]: Elapsed time: 0.4s
[6]: Start -- bias
Gradient check passed
[6]: Elapsed time: 0.0s
Total elapsed time: 6.7s


## Benchmarking the inference and training steps


In [None]:
# Single-example batch used by the timing cells below.
seq_len = 2
num_examples = 1

X_val, y_val = get_examples(seq_len, num_examples, max_number)
# Swap the two axes, then wrap each array in a Tensor.
X_val = Tensor(X_val.transpose(1, 0))
y_val = Tensor(y_val.transpose(1, 0))

print(X_val.shape)


(2, 1)


In [None]:
%%timeit
# inference step
# Times a single forward pass of `model` on the current `X_val`.
outputs = model(X_val)

327 µs ± 64.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
%%timeit
# training step
# Times one full update: forward pass, gradient reset, loss computation,
# backward pass and optimizer step.
# NOTE(review): optimizer.step() runs in every timed loop, so this benchmark
# presumably changes the parameters of `model` -- confirm that is acceptable.
outputs = model(X_val)
optimizer.zero_grad()
loss = loss_function(outputs, y_val)
loss.backward()
optimizer.step()

1.29 ms ± 253 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# Sequence lengths iterated by the timing loops below: 2 through 7 inclusive.
seq_range = range(2, 8)

In [None]:
import timeit


# For every sequence length, build a new batch and time 1000 forward passes.
for seq_len in seq_range:
    X_val, y_val = get_examples(seq_len, num_examples, max_number)
    X_val, y_val = X_val.transpose(1, 0), y_val.transpose(1, 0)
    X_val, y_val = Tensor(X_val), Tensor(y_val)
    # The statement string is resolved against globals(), so `model` and
    # `X_val` must be module-level names. timeit.timeit returns the total
    # seconds for all 1000 executions, not a per-call time.
    spent_time = timeit.timeit("model(X_val)", globals=globals(), number=1000)
    print(seq_len, spent_time, sep="\t")

2	0.8717286000028253
3	0.8740272999857552
4	1.0663329999952111
5	1.903685500001302
6	1.3410909000085667
7	1.8457453000009991


In [None]:


# For every sequence length, build a new batch and time 1000 executions of
# forward pass + gradient reset + loss + backward pass.
# NOTE(review): unlike the %%timeit training cell, the timed snippet does not
# call optimizer.step() -- confirm whether that omission is intentional.
for seq_len in seq_range:
    X_val, y_val = get_examples(seq_len, num_examples, max_number)
    X_val, y_val = X_val.transpose(1, 0), y_val.transpose(1, 0)
    X_val, y_val = Tensor(X_val), Tensor(y_val)
    # Names inside the snippet are looked up in globals(); the returned value
    # is the total time in seconds for all 1000 executions.
    spent_time = timeit.timeit("""
outputs = model(X_val)
optimizer.zero_grad()
loss = loss_function(outputs, y_val)
loss.backward()
""", globals=globals(), number=1000)
    print(seq_len, spent_time, sep="\t")

2	1.7819602999952622
3	2.65656190001755
4	4.4321885999816
5	11.626309499988565
6	15.225644200021634
7	30.876693199999863


## Training

In [None]:
# Training configuration.
num_epochs = 1
vocab_size = len(vocab)
emb_size, hidden_size = 20, 32
batch_size = 100

# A new model, loss, optimizer and scheduler are created for the training run.
dataloader = DataLoader(dataset_inputs, dataset_targets, batch_size=batch_size)
model = RecurrentNetwork(
    vocab_size=vocab_size,
    emb_size=emb_size,
    hidden_size=hidden_size,
)
loss_function = CrossEntropyLoss()
optimizer = SGD(model.parameters, lr=1.0)
# Alternative optimizer, kept for reference:
# optimizer = Adam(model.parameters, alpha=0.1, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.01)
scheduler = ConstantLR(optimizer)

In [None]:
# Display the value returned by model.size() (presumably the number of
# parameters in the model -- TODO confirm against Module.size).
model.size()

4028

In [None]:
# Per-epoch history: summed loss, accuracy and learning rate.
losses, accuracies, lrs = [], [], []

for epoch in range(num_epochs):
    loss_sum = 0
    for data in dataloader():
        optimizer.zero_grad()
        # Swap the two axes of both the inputs and the targets of the batch.
        inputs, targets = (part.transpose(1, 0) for part in data)
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
        loss_sum += loss.data

    # NOTE(review): accuracy is evaluated on the last batch of the epoch only,
    # because `inputs` and `targets` still hold that batch here.
    acc = eval_accuracy(model, inputs.data, targets.data)
    print(f'\r epoch: [{epoch+1}/{num_epochs}], loss: {loss_sum}, acc: {acc}', end='')

    losses.append(loss_sum)
    accuracies.append(acc)
    # Record the learning rate used in this epoch before advancing the scheduler.
    lrs.append(scheduler.lr)
    scheduler.step()

 epoch: [1/1], loss: 16.224544255535776, acc: 0.68125