## Toy Problem


I can't get my Transformer to work! Let's make the problem even simpler.

Given a sequence of numbers, simply reverse the sequence.

```
input = 0 1 5 9 0 3 5 2 5
reversed = 5 2 5 3 0 9 5 1 0
```

In [1]:
import torch
import random
import numpy as np

from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel

In [2]:
def generate_data(num_examples: int, seq_len: int, vocab_size: int):
    """Create random token sequences paired with their reversed copies.

    Args:
        num_examples: Number of sequences (rows) to generate.
        seq_len: Number of tokens in each sequence.
        vocab_size: Exclusive upper bound for token ids; ids are drawn
            uniformly from ``[0, vocab_size)``.

    Returns:
        A tuple ``(inputs, outputs)`` of integer arrays, both of shape
        ``(num_examples, seq_len)``. Each row of ``outputs`` is the matching
        row of ``inputs`` read back to front.
    """
    shape = (num_examples, seq_len)
    inputs = np.random.randint(0, vocab_size, size=shape)

    # A reversed slice is only a negative-stride view, which PyTorch cannot
    # wrap; copying materialises it as a C-contiguous array.
    outputs = inputs[:, ::-1].copy()

    return inputs, outputs

In [3]:
# Smoke test: one example of 10 tokens drawn from a 10-symbol vocabulary.
generate_data(1, 10, 10)

(array([[8, 6, 9, 3, 5, 1, 2, 3, 5, 4]]),
 array([[4, 5, 3, 2, 1, 5, 3, 9, 6, 8]]))

In [4]:
class ToyDataset(Dataset):
    """Dataset of random integer sequences paired with their reversals.

    The data is generated once, at construction time, by ``generate_data``.
    Each item is an ``(x, y)`` pair of 1-D ``torch.long`` tensors placed on
    ``self.device``.
    """

    def __init__(self, num_examples, sequence_length, vocab_size, device=None):
        """Generate the examples and choose the device items are returned on.

        Args:
            num_examples: Number of (input, target) pairs to generate.
            sequence_length: Number of tokens in each sequence.
            vocab_size: Exclusive upper bound for the generated token ids.
            device: Device for the returned tensors. When ``None`` (the
                default), CUDA is used if it is available and the CPU
                otherwise, so the dataset also works on machines without a
                GPU.
        """
        self.items, self.labels = generate_data(num_examples, sequence_length, vocab_size)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` pair as int64 tensors."""
        # Build int64 tensors directly. `torch.Tensor(...)` would first create
        # a float32 tensor, which silently rounds ids larger than 2**24.
        x = torch.tensor(self.items[idx], dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x.to(self.device), y.to(self.device)

    def __len__(self):
        """Return the number of examples in the dataset."""
        return len(self.items)


In [69]:
NUM_EXAMPLES = 1000  # number of random sequences in the training set
SEQUENCE_LENGTH = 9  # tokens per sequence
VOCAB_SIZE = 10  # token ids are drawn from [0, VOCAB_SIZE)
BATCH_SIZE = 64  # examples per DataLoader batch

In [70]:
# Build the training set of random sequences and their reversals, then batch it.
# No `shuffle` argument is passed, so the DataLoader default applies.
train_ds = ToyDataset(num_examples=NUM_EXAMPLES, sequence_length=SEQUENCE_LENGTH, vocab_size=VOCAB_SIZE)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

In [71]:
class ToyModel(torch.nn.Module):
    """
    Wrapper around an untrained BERT encoder that predicts, for each input
    token, a vector of ``vocab_size`` logits (one score per vocabulary entry).
    """

    def __init__(self, vocab_size):
        """Build a randomly initialised BERT encoder and a per-token linear head.

        Args:
            vocab_size: Number of distinct token ids; also the number of
                output classes produced for every position.
        """
        super().__init__()

        # Untrained BERT Model
        # NOTE(review): `vocab_size_or_config_json_file` is the keyword used
        # by older library releases; newer releases appear to name it
        # `vocab_size` -- confirm against the installed version.
        config = BertConfig(vocab_size_or_config_json_file=vocab_size)
        self.bert_model = BertModel(config)

        # Take the head's input width from the config instead of hard-coding
        # 768, so the head keeps matching the encoder if the config changes.
        self.linear = torch.nn.Linear(in_features=config.hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return per-token logits over the vocabulary.

        Args:
            x: Tensor of token ids (presumably shaped ``(batch, seq_len)`` --
                confirm against the caller).

        Returns:
            The linear head applied to the encoder's first output; the last
            dimension has size ``vocab_size``.
        """
        # Index instead of unpacking into two names: the first element is the
        # per-token hidden states, and indexing does not depend on how many
        # items the encoder returns.
        out = self.bert_model(x)[0]
        out = self.linear(out)
        return out

In [72]:
# Instantiate the model and move its parameters to the GPU (requires CUDA).
model = ToyModel(VOCAB_SIZE)
model = model.cuda()

In [73]:
# Adam over all model parameters, with cross-entropy between the per-token
# logits and the target token ids as the training objective.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss()

In [74]:
NUM_EPOCHS = 10  # number of full passes over train_dl

for i in range(NUM_EPOCHS):

    for train_step, (x, y) in enumerate(train_dl):

        # Clear gradients left over from the previous step.
        model.zero_grad()
        output = model(x)

        # Flatten the logits to (N, num_classes) and the targets to (N,), the
        # layout CrossEntropyLoss expects for per-token classification.
        loss = loss_fn(output.view((-1, output.size(-1))), y.view(-1))

        # Disabled debug aid:
        #print(output[0][:10])
        # Log every 100th step. With 1000 examples and a batch size of 64
        # there are only 16 steps per epoch, so just step 0 is printed.
        if train_step % 100 == 0:
            print(i, train_step, loss)

        # Disabled debug aid:
        #time.sleep(0.5)

        # Backpropagate and apply one optimizer update.
        loss.backward()
        optimizer.step()

0 0 tensor(2.4545, device='cuda:0', grad_fn=<NllLossBackward>)
1 0 tensor(2.3095, device='cuda:0', grad_fn=<NllLossBackward>)
2 0 tensor(2.2962, device='cuda:0', grad_fn=<NllLossBackward>)
3 0 tensor(2.2780, device='cuda:0', grad_fn=<NllLossBackward>)
4 0 tensor(2.3026, device='cuda:0', grad_fn=<NllLossBackward>)
5 0 tensor(2.2908, device='cuda:0', grad_fn=<NllLossBackward>)
6 0 tensor(2.2982, device='cuda:0', grad_fn=<NllLossBackward>)
7 0 tensor(2.0922, device='cuda:0', grad_fn=<NllLossBackward>)
8 0 tensor(1.7987, device='cuda:0', grad_fn=<NllLossBackward>)
9 0 tensor(0.0519, device='cuda:0', grad_fn=<NllLossBackward>)


In [43]:
def get_output_for_example(x):
    """Return the model's predicted token ids for a single sequence.

    The module-level ``model`` is evaluated in inference mode (``eval()`` and
    ``torch.no_grad()``), so any dropout layers are disabled and no autograd
    graph is built. The model's previous train/eval mode is restored before
    returning.

    Args:
        x: 1-D tensor of token ids, on the same device as ``model``.

    Returns:
        Tensor of shape ``(1, len(x))`` holding the highest-scoring token id
        at each position.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Add a leading batch dimension of size 1 for the model.
            logits = model(x.unsqueeze(0))
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    # Softmax is monotonic, so the argmax of the raw logits is the same
    # prediction without the extra computation.
    return torch.argmax(logits, 2)

In [44]:
# Compare the model's prediction with the target for the first training example.
# NOTE(review): the recorded output below shows 11 tokens while SEQUENCE_LENGTH
# is 9 in the configuration cell -- presumably left over from an earlier run;
# re-run to refresh.
x, y = train_ds[0]
y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

X:	 tensor([4, 6, 5, 0, 0, 5, 3, 5, 0, 1, 7], device='cuda:0')
y:	 tensor([7, 1, 0, 5, 3, 5, 0, 0, 5, 6, 4], device='cuda:0')
y_hat:	 tensor([0, 0, 5, 0, 0, 5, 5, 5, 0, 5, 5], device='cuda:0')


In [30]:
# Probe the model with the fixed sequence 0, 1, ..., SEQUENCE_LENGTH - 1 and
# compare its prediction with the exact reversal.
# NOTE(review): the recorded output below shows 10 tokens while SEQUENCE_LENGTH
# is 9 in the configuration cell -- presumably left over from an earlier run;
# re-run to refresh.
x = torch.from_numpy(np.arange(SEQUENCE_LENGTH)).long().cuda()
y = torch.flip(x, dims=(0,))
y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

X:	 tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], device='cuda:0')
y:	 tensor([9, 8, 7, 6, 5, 4, 3, 2, 1, 0], device='cuda:0')
y_hat:	 tensor([9, 8, 7, 6, 5, 4, 3, 2, 1, 0], device='cuda:0')
