In [29]:
from data_rnn import load_ndfa
import matplotlib.pyplot as plt
import numpy as np
from time import time

In [30]:
import torch

In [31]:
import torch.nn as nn

In [32]:
import torch.nn.functional as F

In [33]:
import torch.optim as optim

In [34]:
import torch.distributions as dist

In [35]:
n=150000
x_train, (i2w, w2i) = load_ndfa(n)

In [36]:
print(f'Dictionary:{w2i}')
print(f'Index:{i2w}')

Dictionary:{'.pad': 0, '.start': 1, '.end': 2, '.unk': 3, '!': 4, 'v': 5, 'b': 6, 'u': 7, 'k': 8, 's': 9, 'c': 10, 'm': 11, 'a': 12, 'l': 13, 'w': 14}
Index:['.pad', '.start', '.end', '.unk', '!', 'v', 'b', 'u', 'k', 's', 'c', 'm', 'a', 'l', 'w']


In [37]:
def print_sequence(x_train, i):
    print(f"Sequence #{str(i).rjust(6, ' ')}: {''.join([i2w[i] for i in x_train[i]])}")

In [38]:
for i in np.random.randint(n, size=10):
    print_sequence(x_train, i)

Sequence #132514: sabc!abc!abc!abc!abc!abc!abc!s
Sequence #  8670: ss
Sequence #  2504: ss
Sequence #110868: sabc!abc!abc!abc!s
Sequence # 98074: sklm!klm!klm!s
Sequence # 52135: sklm!s
Sequence # 62973: sabc!s
Sequence # 67071: suvw!uvw!s
Sequence # 16084: ss
Sequence #123280: sklm!klm!klm!klm!klm!s


In [39]:
x_train[74191]

[9, 12, 6, 10, 4, 12, 6, 10, 4, 9]

In [40]:
vocab_size=len(w2i)

In [41]:
embedding_size = 32
hidden_size = 16
lstm_num_layers = 1

In [66]:
class Net(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 lstm_num_layers) -> None:
        super().__init__()

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, lstm_num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        input = self.embed(x)
        lstm_output, (hn, cn) = self.lstm(input)
        output = self.linear(lstm_output)
        return output

In [67]:
net =  Net(vocab_size, embedding_size, hidden_size, lstm_num_layers)

In [68]:
net


Net(
  (embed): Embedding(15, 32)
  (lstm): LSTM(32, 16, batch_first=True)
  (linear): Linear(in_features=16, out_features=15, bias=True)
)

In [88]:
def batch_length(batch):
    return max(len(seq) for seq in batch)

In [89]:
def add_padding(seq, amt=1):
    for _ in range(amt):
        seq.append(w2i['.pad'])
    return seq

In [90]:
def add_start(seq):
    seq.insert(0, w2i['.start'])
    return seq

In [91]:
def add_end(seq):
    seq.append(w2i['.end'])
    return seq

In [204]:
def preprocess_batch(batch):
    max_len = batch_length(batch)

    upd_batch = []
    for i, _ in enumerate(batch):
        seq = batch[i].copy()
        seq = add_start(seq)
        seq = add_end(seq)
        seq = add_padding(seq, amt=max_len + 2 - len(seq))
        upd_batch.append(seq)

    upd_batch = torch.tensor(upd_batch, dtype=torch.long)
    targets = upd_batch.clone().detach()
    # Remove first column of tensor
    targets = targets[:, 1:]
    # Append a zero column
    m = nn.ZeroPad2d((0, 1, 0, 0))
    targets = m(targets)

    return upd_batch, targets

In [205]:
def batch_generator(data, batch_size=128):
    data = np.array(data)

    indx = np.random.permutation((len(data)))
    n_batches = int(len(data) / batch_size) + 1

    for i in range(n_batches):
        bound_l = batch_size*i
        bound_r = batch_size*(i+1) if i + 1 < n_batches else len(indx)

        batch_ind = indx[bound_l:bound_r]
        batch = data[batch_ind]

        yield preprocess_batch(batch)


In [206]:
# Buggy, misses the last sequence
def batch_generator_max_tokens(data, max_tokens):
    cur_batch_start = 0
    batches = []
    while cur_batch_start < len(data):
        cur_batch_end, cur_batch = get_next_batch(cur_batch_start, data, max_tokens)
        cur_batch_start = cur_batch_end + 1
        batches.append(cur_batch)
    return batches

def get_next_batch(start, data, max_tokens):
    tokens = 0
    cur = start
    while (tokens <= max_tokens) and cur < len(data):
        s = data[cur]
        tokens += len(s)
        cur += 1
    end = cur - 1
    return end - 1, data[start:end]

In [207]:
batch_t, target_t = preprocess_batch(x_train[:5])
batch_t, target_t

(tensor([[1, 9, 9, 2],
         [1, 9, 9, 2],
         [1, 9, 9, 2],
         [1, 9, 9, 2],
         [1, 9, 9, 2]]),
 tensor([[9, 9, 2, 0],
         [9, 9, 2, 0],
         [9, 9, 2, 0],
         [9, 9, 2, 0],
         [9, 9, 2, 0]]))

In [208]:
net(batch_t)

tensor([[[ 0.0250,  0.1825, -0.0726, -0.0624, -0.1122, -0.2633,  0.1887,
          -0.1222, -0.1818, -0.0588, -0.1243, -0.2717,  0.2647, -0.0008,
          -0.0827],
         [ 0.0737,  0.0620, -0.1059,  0.0104,  0.0229, -0.3546,  0.2105,
           0.0421, -0.3234, -0.0183, -0.0643, -0.2593,  0.1713, -0.0112,
          -0.1340],
         [ 0.1400,  0.0787, -0.1382,  0.0712,  0.0991, -0.3531,  0.1966,
           0.0918, -0.2964,  0.0100, -0.0681, -0.2550,  0.1680, -0.0317,
          -0.1518],
         [ 0.1828,  0.1128, -0.0863,  0.2503,  0.2203, -0.1676,  0.2259,
           0.0418, -0.1701,  0.0978,  0.1026, -0.0681,  0.0622,  0.0215,
          -0.2344]],

        [[ 0.0250,  0.1825, -0.0726, -0.0624, -0.1122, -0.2633,  0.1887,
          -0.1222, -0.1818, -0.0588, -0.1243, -0.2717,  0.2647, -0.0008,
          -0.0827],
         [ 0.0737,  0.0620, -0.1059,  0.0104,  0.0229, -0.3546,  0.2105,
           0.0421, -0.3234, -0.0183, -0.0643, -0.2593,  0.1713, -0.0112,
          -0.1340],
  

In [209]:
for i, (x_batch, y_batch) in enumerate(batch_generator(x_train[:5])):
    print(i)
    print(x_batch)
    print(y_batch)

AttributeError: 'numpy.ndarray' object has no attribute 'insert'

In [97]:
def sample(lnprobs, temperature=1.0):
    """
     Sample an element from a categorical distribution
     :param lnprobs: Outcome logits
     :param temperature: Sampling temperature. 1.0 follows the given distribution, 0.0 returns the maximum probability element.
     :return: The index of the sampled element.
    """
    if temperature == 0.0:
             return lnprobs.argmax()

    p = F.softmax(lnprobs / temperature, dim=0)
    cd = dist.Categorical(p)
    return cd.sample()


In [98]:
def predict(dataset, model, seq, temperature=1.0, max_length=20):
    """
    :param dataset: need i2w and w2i
    :param model: the model we sample from
    :param seq: the sequence of tokens we want to complete
    :param max_length: we stop if we reach an end token, or after max_length tokens
    :return: the generated sequence of tokens
    """
    model.eval()
    pred = []
    for i in range(0, max_length):
        x = torch.tensor([[dataset.w2i[i] for w in seq[i:]]])
        y = model.forward(x)
        last_token_logits = y[0][-1]
        j = sample(last_token_logits, temperature)
        pred.append(seq.dataset.i2w[j])
        if seq.dataset.i2w[j] == '.end':
            return pred
    return pred

In [105]:
# For Google Colab
device = torch.device('cuda' if torch.has_cuda else 'cpu')
net.to(device)
print(f"Using {device} device")

Using cpu device


In [106]:
def train(model, dataset, epochs=3, batch_size=128, learning_rate=0.001):

    # Loss function:
    # check whether the loss function applies softmax or whether we need to do it manually
    # loss function = cross entropy loss at every point in time, read doc to figure out
    # how to shuffle dimensions properly
    criterion = nn.CrossEntropyLoss()

    # Optimizer:
    optimizer = optim.Adam(net.parameters(), lr=0.001)

    metrics = {
        'loss_history': [],
        'loss_train': []
    }

    # Training loop
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}\n-------------------------------")
        start_time = time()
        running_loss = 0.0
        total_loss = 0.0

        model.train()
        for batch, (X, y) in enumerate(batch_generator(x_train)):
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = model(X)
            loss = criterion(pred, y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            finish_time = time()

            # print statistics
            running_loss += loss.item()
            total_loss += loss.item()
            if i % 20 == 19:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 20:.3f} time: {finish_time - start_time:.3f}')
                metrics['loss_history'].append(running_loss / 20)
                running_loss = 0.0
        metrics['loss_train'].append(total_loss / len(x_train))


        print("Predicting:")
        model.eval()
        seq = ['.start', 'a', 'b']
        predict(model, dataset, seq, max_length=20)

In [107]:
metrics = train(net, x_train, epochs=5, batch_size=128, learning_rate=0.001)

Epoch 1
-------------------------------


  data = np.array(data)
  targets = torch.tensor(upd_batch, dtype=torch.long)[:, 1:]


RuntimeError: Expected target size [128, 15], got [128, 56]