## Toy Problem 1


I can't get my Transformer to work! Let's make the problem even simpler.

Given a sequence of numbers, simply reverse the sequence.

```
input = 0 1 5 9 0 3 5 2 5
reversed = 5 2 5 3 0 9 5 1 0
```

In [1]:
import torch
import random
import numpy as np

from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel

from radam import RAdam
from utils import get_accuracy, get_output_for_example, train

In [2]:
def generate_data(num_examples: int, seq_len: int, vocab_size: int):
    inputs = np.random.randint(0, vocab_size, size=(num_examples, seq_len))
    outputs = np.ascontiguousarray(np.flip(inputs, 1)) #PyTorch can't handle negative strides

    return inputs, outputs

In [3]:
generate_data(1, 10, 10)

(array([[3, 1, 2, 7, 6, 9, 1, 2, 6, 7]]),
 array([[7, 6, 2, 1, 9, 6, 7, 2, 1, 3]]))

In [4]:
class ToyDataset(Dataset):
   
    def __init__(self, num_examples, sequence_length, vocab_size):
        self.items, self.labels = generate_data(num_examples, sequence_length, vocab_size)
        
    def __getitem__(self, idx):
        
        x = torch.Tensor(self.items[idx]).long()
        y = torch.Tensor(self.labels[idx]).long()
        return x.cuda(), y.cuda()
    
    def __len__(self):
        return len(self.items)

In [5]:
SEQ_LENGTH = 5
VOCAB_SIZE = 10

TRN_EXAMPLES = 25
VAL_EXAMPLES = 10
BATCH_SIZE = 2
LR = 1e-4

In [6]:
train_ds = ToyDataset(num_examples=TRN_EXAMPLES, sequence_length=SEQ_LENGTH, vocab_size=VOCAB_SIZE)
valid_ds = ToyDataset(num_examples=VAL_EXAMPLES, sequence_length=SEQ_LENGTH, vocab_size=VOCAB_SIZE)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)
valid_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

In [7]:
class ToyModel(torch.nn.Module):
    """
    Wrapper around a BERT model
    """
    
    def __init__(self, vocab_size):
        super().__init__()
        
        # Untrained BERT Model
        config = BertConfig(vocab_size_or_config_json_file=vocab_size)
        self.bert_model = BertModel(config)
        self.linear = torch.nn.Linear(in_features=768, out_features=vocab_size)
        
    def forward(self, x):    
        out, _ = self.bert_model(x)
        out = self.linear(out)
        return out    

In [8]:
model = ToyModel(VOCAB_SIZE)
model = model.cuda()

In [9]:
# NOTE: We use RAdam to avoid having to use warmup
# If we use regular Adam, this usually won't converge for long sequences
optimizer = RAdam(model.parameters(), lr=LR)
loss_fn = torch.nn.CrossEntropyLoss()

In [12]:
train(model, train_dl, valid_dl, loss_fn, optimizer, num_epochs=10, print_every=100)

Epoch:	 0 	Step:	 0 	Loss:	 0.47337642312049866
Epoch:	 0 			Valid Accuracy	 0.92307687
Epoch:	 1 	Step:	 0 	Loss:	 0.5742489099502563
Epoch:	 1 			Valid Accuracy	 0.99230766
Epoch:	 2 	Step:	 0 	Loss:	 0.35289257764816284
Epoch:	 2 			Valid Accuracy	 0.97692305
Epoch:	 3 	Step:	 0 	Loss:	 0.20394399762153625
Epoch:	 3 			Valid Accuracy	 0.99230766
Epoch:	 4 	Step:	 0 	Loss:	 0.14109757542610168
Epoch:	 4 			Valid Accuracy	 0.99230766
Epoch:	 5 	Step:	 0 	Loss:	 0.12724749743938446
Epoch:	 5 			Valid Accuracy	 1.0
Epoch:	 6 	Step:	 0 	Loss:	 0.0702083557844162
Epoch:	 6 			Valid Accuracy	 1.0
Epoch:	 7 	Step:	 0 	Loss:	 0.0712471455335617
Epoch:	 7 			Valid Accuracy	 1.0
Epoch:	 8 	Step:	 0 	Loss:	 0.05714692920446396
Epoch:	 8 			Valid Accuracy	 1.0
Epoch:	 9 	Step:	 0 	Loss:	 0.036658428609371185
Epoch:	 9 			Valid Accuracy	 1.0


In [13]:
x, y = train_ds[0]
y_hat = get_output_for_example(model, x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

X:	 tensor([3, 8, 6, 7, 7], device='cuda:0')
y:	 tensor([7, 7, 6, 8, 3], device='cuda:0')
y_hat:	 tensor([7, 7, 6, 8, 3], device='cuda:0')


In [14]:
x = torch.from_numpy(np.arange(SEQ_LENGTH)).long().cuda()
y = torch.flip(x, dims=(0,))
y_hat = get_output_for_example(model, x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

X:	 tensor([0, 1, 2, 3, 4], device='cuda:0')
y:	 tensor([4, 3, 2, 1, 0], device='cuda:0')
y_hat:	 tensor([1, 0, 2, 4, 3], device='cuda:0')
