In [1]:
from data_rnn import load_ndfa
import matplotlib.pyplot as plt
import numpy as np
import random
from time import time

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
import torch.distributions as dist

In [3]:
# NDFA
x_train, (i2w, w2i) = load_ndfa(n=150_000)

# # Brackets
# x_train, (i2w, w2i) = load_brackets(n=150_000)

In [4]:
print(''.join([i2w[i] for i in  x_train[10_000]]) )

ss


In [7]:
vocab_size = len(w2i)
vocab_size

15

In [8]:
w2i

{'.pad': 0,
 '.start': 1,
 '.end': 2,
 '.unk': 3,
 '!': 4,
 'b': 5,
 'v': 6,
 's': 7,
 'c': 8,
 'u': 9,
 'a': 10,
 'm': 11,
 'w': 12,
 'k': 13,
 'l': 14}

In [9]:
def get_max_seq_length(batch):
    return max(len(seq) for seq in batch)

In [10]:
def get_sum_seq_lengths(batch):
    return sum(len(seq) for seq in batch)

In [11]:
def add_padding(seq, amt=1):
    for _ in range(amt):
        seq.append(w2i['.pad'])
    return seq

In [12]:
def add_start(seq):
    seq.insert(0, w2i['.start'])
    return seq

In [13]:
def add_end(seq):
    seq.append(w2i['.end'])
    return seq

In [14]:
get_max_seq_length(x_train)

158

In [15]:
get_sum_seq_lengths(x_train) / len(x_train)

13.96264

In [16]:
def preprocess_batch(batch):
    max_len = get_max_seq_length(batch)
    
    upd_batch = []
    for i, _ in enumerate(batch):
        seq = batch[i].copy()
        seq = add_start(seq)
        seq = add_end(seq)
        seq = add_padding(seq, amt=max_len + 2 - len(seq))
        upd_batch.append(seq)
    
    upd_batch = torch.tensor(upd_batch, dtype=torch.long)
    # targets = torch.tensor(targets, dtype=torch.long)
    
    # return upd_batch, targets
    return upd_batch

In [56]:
def get_batches(data, vocab_size, token_amt=12500):
    batches = []
    targets = []

    bound_l, bound_r = -1, 0
    idx = 0
    while bound_r < len(data):
        bound_l = bound_r

        num_of_tokens = 0
        while (num_of_tokens + len(data[idx])) < token_amt and bound_r < len(data):
            num_of_tokens += len(data[idx])
            bound_r += 1
        print(bound_l, bound_r)
        batch = data[bound_l:bound_r]
        batch = preprocess_batch(batch)
        target = F.pad(batch, (0, 1), "constant", 0)[:, 1:]
        target = F.one_hot(target, num_classes=vocab_size)

        batches.append(batch)
        targets.append(target)
    return batches, targets

In [57]:
batches, targets = get_batches(x_train, vocab_size)

0 6249
6249 12498
12498 18747
18747 24996
24996 31245
31245 37494
37494 43743
43743 49992
49992 56241
56241 62490
62490 68739
68739 74988
74988 81237
81237 87486
87486 93735
93735 99984
99984 106233
106233 112482
112482 118731
118731 124980
124980 131229
131229 137478
137478 143727
143727 149976
149976 150000


In [58]:
batches[0][:10]

tensor([[1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2],
        [1, 7, 7, 2]])

In [59]:
targets[0][:10]

tensor([[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 

### Question 5

In [60]:
class Net(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 lstm_num_layers) -> None:
        super().__init__()

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, lstm_num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        input = self.embed(x)
        lstm_output, (hn, cn) = self.lstm(input)
        output = self.linear(lstm_output)
        return output

In [82]:
vocab_size=len(w2i)
embedding_size = 32
hidden_size = 16
lstm_num_layers = 1

epochs = 3
learning_rate = 0.001
device='cpu'


In [62]:
net = Net(vocab_size, embedding_size, hidden_size, lstm_num_layers)

In [63]:
net

Net(
  (embed): Embedding(15, 32)
  (lstm): LSTM(32, 16, batch_first=True)
  (linear): Linear(in_features=16, out_features=15, bias=True)
)

In [64]:
def batch_generator(batches, targets):
    indices = list(range(len(batches)))
    random.shuffle(indices)
    for ind in indices:
        yield batches[ind], targets[ind]

In [75]:
outputs = net(batches[0])
outputs.shape

torch.Size([6249, 4, 15])

In [77]:
batches[0].size()[-1]

4

In [66]:
criterion = nn.CrossEntropyLoss(reduction='sum')

In [67]:
outputs[0].dtype

torch.float32

In [68]:
targets[0][0].dtype

torch.int64

In [80]:
targets[0].size()[-1]

15

In [71]:
loss = criterion(outputs, targets[0].type(torch.float64))
loss

tensor(34548.4894, dtype=torch.float64, grad_fn=<NegBackward0>)

In [87]:
def train(model, batches, targets, epochs=3, learning_rate=0.001, device='cpu'):
    # Loss function:
    # check whether the loss function applies softmax or whether we need to do it manually
    # loss function = cross entropy loss at every point in time, read doc to figure out
    # how to shuffle dimensions properly
    criterion = nn.CrossEntropyLoss(reduction='sum') # 'sum': the output will be summed, since we want loss on every step

    # Optimizer:
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    metrics = {
        'loss_history': [],
        'loss_train': []
    }

    # Training loop
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}\n-------------------------------")
        start_time = time()
        total_loss = 0.0

        model.train()
        for i, (X, y) in enumerate(batch_generator(batches, targets)):
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            outputs = model(X)
            loss = criterion(outputs, y.type(torch.float64))

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            finish_time = time()

            # print statistics
            total_loss += loss.item() / y.size()[0] / y.size()[-1]
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {loss.item() / y.size()[0] / y.size()[-1]:.3f} time: {finish_time - start_time:.3f}')
            metrics['loss_history'].append(loss.item())

        metrics['loss_train'].append(total_loss / len(x_train))

    return model, metrics


        # print("Predicting:")
        # model.eval()
        # seq = ['.start', 'a', 'b']
        # predict(model, dataset, seq, max_length=20)

In [88]:
model = Net(vocab_size, embedding_size, hidden_size, lstm_num_layers)
model, metrics = train(model, batches, targets, epochs, learning_rate, device)

Epoch 1
-------------------------------


AttributeError: 'list' object has no attribute 'size'