## Toy Problem 1


I can't get my Transformer to work! Let's make the problem even simpler.

Given a sequence of numbers, simply reverse the sequence.

```
input = 0 1 5 9 0 3 5 2 5
reversed = 5 2 5 3 0 9 5 1 0
```

In [69]:
import torch
import random
import numpy as np

from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel

from radam import RAdam

In [2]:
def generate_data(num_examples: int, seq_len: int, vocab_size: int):
    """Build random token sequences paired with their reversals.

    Returns a tuple ``(inputs, outputs)`` of integer arrays, each of shape
    ``(num_examples, seq_len)`` with values drawn from ``[0, vocab_size)``.
    Every row of ``outputs`` is the matching row of ``inputs`` read backwards.
    """
    shape = (num_examples, seq_len)
    sequences = np.random.randint(low=0, high=vocab_size, size=shape)

    # A reversed slice is a view with negative strides, which PyTorch cannot
    # handle, so materialise a contiguous copy.
    reversed_sequences = np.ascontiguousarray(sequences[:, ::-1])

    return sequences, reversed_sequences

In [3]:
# Smoke test: one sequence of length 10 with token ids drawn from [0, 10).
generate_data(1, 10, 10)

(array([[2, 6, 3, 7, 7, 5, 8, 2, 4, 2]]),
 array([[2, 4, 2, 8, 5, 7, 7, 3, 6, 2]]))

In [4]:
class ToyDataset(Dataset):
    """In-memory dataset of synthetic ``(input, label)`` token sequences.

    The arrays are produced once, at construction time, by the module-level
    ``generate_data`` function that is in scope when the class is instantiated.

    Args:
        num_examples: Number of sequences to generate.
        sequence_length: Number of tokens in every sequence.
        vocab_size: Exclusive upper bound for token ids, passed to
            ``generate_data``.
        device: Device on which returned tensors are placed. Defaults to CUDA
            when it is available and to the CPU otherwise.
    """

    def __init__(self, num_examples, sequence_length, vocab_size, device=None):
        self.items, self.labels = generate_data(num_examples, sequence_length, vocab_size)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

    def __getitem__(self, idx):
        """Return ``(x, y)`` as ``int64`` tensors on ``self.device``.

        ``idx`` may be anything NumPy indexing accepts (an int, a slice, ...).
        """
        # torch.tensor(..., dtype=torch.long) copies the integers directly.
        # The legacy torch.Tensor(...) constructor builds a float32 tensor
        # first, which silently rounds ids above 2**24.
        x = torch.tensor(self.items[idx], dtype=torch.long, device=self.device)
        y = torch.tensor(self.labels[idx], dtype=torch.long, device=self.device)
        return x, y

    def __len__(self):
        """Return the number of generated examples."""
        return len(self.items)


In [13]:
# Hyper-parameters for the sequence-reversal toy problem.
NUM_EXAMPLES = 5  # fewer examples than BATCH_SIZE, so every epoch is a single batch
SEQUENCE_LENGTH = 100  # tokens per sequence
VOCAB_SIZE = 10  # token ids are drawn from [0, VOCAB_SIZE)
BATCH_SIZE = 64

In [14]:
# Generate the reversal dataset once and serve it in mini-batches.
# NOTE(review): `shuffle` is left at its default, so batches arrive in the
# same order every epoch.
train_ds = ToyDataset(num_examples=NUM_EXAMPLES, sequence_length=SEQUENCE_LENGTH, vocab_size=VOCAB_SIZE)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

In [15]:
class ToyModel(torch.nn.Module):
    """
    Wrapper around an untrained BERT encoder that predicts, for every input
    token, one score per entry of the vocabulary (``vocab_size`` classes).
    """

    def __init__(self, vocab_size):
        super().__init__()

        # Untrained BERT Model
        # NOTE(review): this keyword is the one the notebook was written
        # against; confirm it matches the installed `transformers` version.
        config = BertConfig(vocab_size_or_config_json_file=vocab_size)
        self.bert_model = BertModel(config)
        # Size the head from the config instead of hard-coding 768 so the two
        # cannot drift apart if the config changes.
        self.linear = torch.nn.Linear(in_features=config.hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to per-token logits over the vocabulary.

        NOTE(review): ``x`` is presumably ``(batch, seq_len)`` and the result
        ``(batch, seq_len, vocab_size)`` -- confirm against the BertModel docs.
        """
        # Index instead of unpacking: the first output is the per-token hidden
        # states no matter how many extra outputs the encoder returns.
        hidden_states = self.bert_model(x)[0]
        return self.linear(hidden_states)

In [16]:
# Build the reversal model and move its parameters to the GPU in one step.
model = ToyModel(VOCAB_SIZE).cuda()

In [17]:
# NOTE: We use RAdam to avoid having to use a learning-rate warmup schedule.
# If we use regular Adam, this usually won't converge for long sequences.
optimizer = RAdam(model.parameters(), lr=1e-4)
# Per-token classification loss; the training loop flattens logits and
# targets before calling it.
loss_fn = torch.nn.CrossEntropyLoss()

In [19]:
NUM_EPOCHS = 50

# Make sure dropout is active even if an earlier cell left the model in
# eval mode.
model.train()

for i in range(NUM_EPOCHS):

    for train_step, (x, y) in enumerate(train_dl):

        model.zero_grad()
        output = model(x)

        # CrossEntropyLoss wants (N, classes) logits and (N,) targets, so the
        # batch and sequence dimensions are flattened together.
        loss = loss_fn(output.view((-1, output.size(-1))), y.view(-1))

        if train_step % 100 == 0:
            # .item() logs the plain Python float rather than the tensor repr
            print(i, train_step, loss.item())

        loss.backward()
        optimizer.step()

0 0 tensor(1.8428, device='cuda:0', grad_fn=<NllLossBackward>)
1 0 tensor(1.8082, device='cuda:0', grad_fn=<NllLossBackward>)
2 0 tensor(1.7703, device='cuda:0', grad_fn=<NllLossBackward>)
3 0 tensor(1.7303, device='cuda:0', grad_fn=<NllLossBackward>)
4 0 tensor(1.7028, device='cuda:0', grad_fn=<NllLossBackward>)
5 0 tensor(1.6839, device='cuda:0', grad_fn=<NllLossBackward>)
6 0 tensor(1.6357, device='cuda:0', grad_fn=<NllLossBackward>)
7 0 tensor(1.6156, device='cuda:0', grad_fn=<NllLossBackward>)
8 0 tensor(1.5824, device='cuda:0', grad_fn=<NllLossBackward>)
9 0 tensor(1.5314, device='cuda:0', grad_fn=<NllLossBackward>)
10 0 tensor(1.4967, device='cuda:0', grad_fn=<NllLossBackward>)
11 0 tensor(1.4975, device='cuda:0', grad_fn=<NllLossBackward>)
12 0 tensor(1.4305, device='cuda:0', grad_fn=<NllLossBackward>)
13 0 tensor(1.4080, device='cuda:0', grad_fn=<NllLossBackward>)
14 0 tensor(1.3795, device='cuda:0', grad_fn=<NllLossBackward>)
15 0 tensor(1.3393, device='cuda:0', grad_fn=<NllL

In [20]:
def get_output_for_example(x):
    """Return the global ``model``'s predicted class index for each token of ``x``.

    ``x`` is a single un-batched sequence of token ids. A batch dimension of
    size one is added before the forward pass, and the prediction is the
    argmax over the last (class) dimension of the logits.
    """
    # Dropout is stochastic in training mode, so predict in eval mode and put
    # the model back the way it was afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # inference only: no autograd graph needed
            logits = model(x.unsqueeze(0))
    finally:
        model.train(was_training)

    # softmax is monotonic, so argmax over the raw logits picks the same class.
    return torch.argmax(logits, dim=2)

In [21]:
# Compare the model's prediction with the target for the first training
# example (this checks fit on training data, not generalisation).
x, y = train_ds[0]
y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())  # drop the batch dimension added for the forward pass

X:	 tensor([6, 3, 5, 3, 0, 2, 2, 0, 6, 7, 1, 8, 4, 0, 4, 4, 2, 4, 1, 3, 0, 9, 6, 8,
        1, 7, 2, 3, 2, 4, 0, 2, 3, 3, 9, 1, 5, 9, 1, 3, 8, 7, 5, 1, 5, 7, 2, 3,
        7, 6, 1, 5, 0, 7, 4, 7, 7, 1, 5, 5, 0, 6, 7, 4, 0, 9, 4, 4, 5, 7, 9, 8,
        1, 6, 1, 8, 1, 0, 4, 1, 0, 3, 1, 2, 7, 1, 0, 9, 1, 1, 5, 0, 9, 8, 5, 2,
        6, 3, 0, 0], device='cuda:0')
y:	 tensor([0, 0, 3, 6, 2, 5, 8, 9, 0, 5, 1, 1, 9, 0, 1, 7, 2, 1, 3, 0, 1, 4, 0, 1,
        8, 1, 6, 1, 8, 9, 7, 5, 4, 4, 9, 0, 4, 7, 6, 0, 5, 5, 1, 7, 7, 4, 7, 0,
        5, 1, 6, 7, 3, 2, 7, 5, 1, 5, 7, 8, 3, 1, 9, 5, 1, 9, 3, 3, 2, 0, 4, 2,
        3, 2, 7, 1, 8, 6, 9, 0, 3, 1, 4, 2, 4, 4, 0, 4, 8, 1, 7, 6, 0, 2, 2, 0,
        3, 5, 3, 6], device='cuda:0')
y_hat:	 tensor([0, 0, 3, 6, 2, 5, 5, 9, 0, 5, 1, 1, 9, 0, 1, 7, 2, 1, 3, 4, 1, 4, 0, 5,
        8, 3, 5, 1, 0, 9, 1, 5, 4, 4, 9, 9, 4, 7, 6, 0, 5, 5, 1, 7, 7, 4, 7, 0,
        2, 1, 6, 7, 3, 2, 7, 5, 7, 5, 7, 8, 3, 0, 9, 5, 1, 7, 3, 3, 2, 5, 4, 2,
        3, 2, 7, 1, 8, 6, 9,

In [None]:
# Probe the model with a hand-built, easily recognisable sequence.
# Token ids must stay below VOCAB_SIZE. A bare arange(SEQUENCE_LENGTH) would
# index past the end of the embedding table whenever
# SEQUENCE_LENGTH > VOCAB_SIZE (100 vs. 10 here), so the ramp wraps around.
x = torch.from_numpy(np.arange(SEQUENCE_LENGTH) % VOCAB_SIZE).long().cuda()
y = torch.flip(x, dims=(0,))
y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

## Toy Problem 2

Now that we have the first problem mostly working, let's try another.


Given a sequence of numbers, find and mark adjacent duplicates. For simplicity, we plant exactly one pair of adjacent duplicates in each input.

```
input =    0 3 5 9 3 3 5 2 5
output =   0 0 0 0 1 1 0 0 0
```


In [64]:
def generate_data(num_examples: int, seq_len: int, vocab_size: int):
    """Generate random sequences that each contain one planted adjacent duplicate.

    Args:
        num_examples: Number of sequences to generate.
        seq_len: Length of every sequence; must be at least 2.
        vocab_size: Token ids are drawn from ``[0, vocab_size)``.

    Returns:
        ``(inputs, outputs)``, two integer arrays of shape
        ``(num_examples, seq_len)``. ``outputs`` is 1 at the two positions of
        the planted pair and 0 elsewhere. Duplicates that occur by chance are
        not marked.

    Raises:
        ValueError: If ``seq_len`` is smaller than 2.
    """
    if seq_len < 2:
        raise ValueError(
            "seq_len must be at least 2 to hold a duplicate pair, got {}".format(seq_len)
        )

    inputs = np.random.randint(0, vocab_size, size=(num_examples, seq_len))
    outputs = np.zeros_like(inputs)
    last = seq_len - 1

    # Iterating a 2-D array yields row views, so the writes below land in
    # `inputs` and `outputs` themselves.
    for sequence, labels in zip(inputs, outputs):

        # choose random location to introduce duplicate
        location = np.random.randint(0, seq_len)

        # Always draw the coin, even when the edge forces the direction, so the
        # random stream is consumed the same way for every example.
        prefer_left = np.random.random() < 0.5

        if location == 0:
            dup_location = 1
        elif location == last:
            dup_location = last - 1
        elif prefer_left:
            dup_location = location - 1
        else:
            dup_location = location + 1

        sequence[location] = sequence[dup_location]

        # Mark location where duplicates exist
        labels[location] = 1
        labels[dup_location] = 1

    return inputs, outputs

In [67]:
# We're lazy, so we just use a large vocabulary to make accidental
# (unlabelled) duplicates unlikely instead of manually removing them.
generate_data(num_examples=1, seq_len=10, vocab_size=1000)

(array([[235, 247, 514, 522, 164, 540, 613, 985, 450, 450]]),
 array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1]]))

In [70]:
class ToyDataset(Dataset):
    """In-memory dataset of synthetic ``(input, label)`` token sequences.

    The arrays are produced once, at construction time, by the module-level
    ``generate_data`` function that is in scope when the class is instantiated.

    Args:
        num_examples: Number of sequences to generate.
        sequence_length: Number of tokens in every sequence.
        vocab_size: Exclusive upper bound for token ids, passed to
            ``generate_data``.
        device: Device on which returned tensors are placed. Defaults to CUDA
            when it is available and to the CPU otherwise.
    """

    def __init__(self, num_examples, sequence_length, vocab_size, device=None):
        self.items, self.labels = generate_data(num_examples, sequence_length, vocab_size)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

    def __getitem__(self, idx):
        """Return ``(x, y)`` as ``int64`` tensors on ``self.device``.

        ``idx`` may be anything NumPy indexing accepts (an int, a slice, ...).
        """
        # torch.tensor(..., dtype=torch.long) copies the integers directly.
        # The legacy torch.Tensor(...) constructor builds a float32 tensor
        # first, which silently rounds ids above 2**24.
        x = torch.tensor(self.items[idx], dtype=torch.long, device=self.device)
        y = torch.tensor(self.labels[idx], dtype=torch.long, device=self.device)
        return x, y

    def __len__(self):
        """Return the number of generated examples."""
        return len(self.items)


In [130]:
# Hyper-parameters for the duplicate-detection toy problem.
NUM_EXAMPLES = 10000
SEQUENCE_LENGTH = 10
VOCAB_SIZE = 1000  # large vocabulary keeps accidental duplicates rare
BATCH_SIZE = 64

# Generate the dataset once and serve it in mini-batches (default order, no shuffling).
train_ds = ToyDataset(num_examples=NUM_EXAMPLES, sequence_length=SEQUENCE_LENGTH, vocab_size=VOCAB_SIZE)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

In [131]:
# Eyeball the first ten (input, label) pairs; slicing works because
# __getitem__ forwards the index straight to the underlying arrays.
train_ds[0:10]

(tensor([[817, 831, 314, 331, 855, 339, 339, 176, 606, 130],
         [204, 233, 471, 471, 538, 475, 355, 958, 378, 967],
         [930, 963, 420, 420, 950, 947, 934, 835, 983, 151],
         [291, 991, 321, 242, 160, 625, 251, 251, 539,  50],
         [725, 887,  87, 837, 953, 472, 816, 671, 671, 929],
         [ 17,  50, 168, 168, 340, 673, 146, 862, 795, 119],
         [213, 858, 858, 999, 972, 158, 906, 913, 924, 307],
         [298, 816, 750, 750, 973, 564, 975, 400, 978, 948],
         [450, 311, 354, 285, 140, 267, 267, 877, 693, 644],
         [816, 964, 394, 184, 286, 286, 196, 932, 327, 579]], device='cuda:0'),
 tensor([[0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
         [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
         [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
         [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
  

In [132]:
class ToyModel(torch.nn.Module):
    """
    Wrapper around an untrained BERT encoder that predicts a single binary
    output (two logits) for each input token.
    """

    def __init__(self, vocab_size):
        super().__init__()

        # Untrained BERT Model
        # NOTE(review): this keyword is the one the notebook was written
        # against; confirm it matches the installed `transformers` version.
        config = BertConfig(vocab_size_or_config_json_file=vocab_size)
        self.bert_model = BertModel(config)
        # Size the head from the config instead of hard-coding 768 so the two
        # cannot drift apart if the config changes.
        self.linear = torch.nn.Linear(in_features=config.hidden_size, out_features=2)

    def forward(self, x):
        """Map token ids ``x`` to two logits per token.

        NOTE(review): ``x`` is presumably ``(batch, seq_len)`` and the result
        ``(batch, seq_len, 2)`` -- confirm against the BertModel docs.
        """
        # Index instead of unpacking: the first output is the per-token hidden
        # states no matter how many extra outputs the encoder returns.
        hidden_states = self.bert_model(x)[0]
        return self.linear(hidden_states)

In [136]:
# Build the duplicate-detection model and move its parameters to the GPU in one step.
model = ToyModel(VOCAB_SIZE).cuda()

In [137]:
# NOTE: We use RAdam to avoid having to use a learning-rate warmup schedule.
# If we use regular Adam, this usually won't converge for long sequences.
optimizer = RAdam(model.parameters(), lr=1e-5)
# Two-class per-token loss; the training loop flattens logits and targets
# before calling it.
loss_fn = torch.nn.CrossEntropyLoss()

In [144]:
NUM_EPOCHS = 10

# Make sure dropout is active even if an earlier cell left the model in
# eval mode.
model.train()

for i in range(NUM_EPOCHS):

    for train_step, (x, y) in enumerate(train_dl):

        model.zero_grad()
        output = model(x)

        # CrossEntropyLoss wants (N, classes) logits and (N,) targets, so the
        # batch and sequence dimensions are flattened together.
        loss = loss_fn(output.view((-1, output.size(-1))), y.view(-1))

        if train_step % 100 == 0:
            # .item() logs the plain Python float rather than the tensor repr
            print(i, train_step, loss.item())

        loss.backward()
        optimizer.step()

0 0 tensor(0.0294, device='cuda:0', grad_fn=<NllLossBackward>)
0 100 tensor(0.0138, device='cuda:0', grad_fn=<NllLossBackward>)
1 0 tensor(0.0172, device='cuda:0', grad_fn=<NllLossBackward>)
1 100 tensor(0.0173, device='cuda:0', grad_fn=<NllLossBackward>)
2 0 tensor(0.0360, device='cuda:0', grad_fn=<NllLossBackward>)
2 100 tensor(0.0350, device='cuda:0', grad_fn=<NllLossBackward>)
3 0 tensor(0.0121, device='cuda:0', grad_fn=<NllLossBackward>)
3 100 tensor(0.0218, device='cuda:0', grad_fn=<NllLossBackward>)
4 0 tensor(0.0063, device='cuda:0', grad_fn=<NllLossBackward>)
4 100 tensor(0.0221, device='cuda:0', grad_fn=<NllLossBackward>)
5 0 tensor(0.0106, device='cuda:0', grad_fn=<NllLossBackward>)
5 100 tensor(0.0062, device='cuda:0', grad_fn=<NllLossBackward>)
6 0 tensor(0.0105, device='cuda:0', grad_fn=<NllLossBackward>)
6 100 tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward>)
7 0 tensor(0.0056, device='cuda:0', grad_fn=<NllLossBackward>)
7 100 tensor(0.0038, device='cuda:0', gra

In [145]:
def get_output_for_example(x):
    """Return the global ``model``'s predicted class index for each token of ``x``.

    ``x`` is a single un-batched sequence of token ids. A batch dimension of
    size one is added before the forward pass, and the prediction is the
    argmax over the last (class) dimension of the logits.
    """
    # Dropout is stochastic in training mode, so predict in eval mode and put
    # the model back the way it was afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # inference only: no autograd graph needed
            logits = model(x.unsqueeze(0))
    finally:
        model.train(was_training)

    # softmax is monotonic, so argmax over the raw logits picks the same class.
    return torch.argmax(logits, dim=2)

In [146]:
# Compare the model's prediction with the target for the first training
# example (this checks fit on training data, not generalisation).
x, y = train_ds[0]
y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())  # drop the batch dimension added for the forward pass

X:	 tensor([817, 831, 314, 331, 855, 339, 339, 176, 606, 130], device='cuda:0')
y:	 tensor([0, 0, 0, 0, 0, 1, 1, 0, 0, 0], device='cuda:0')
y_hat:	 tensor([0, 0, 0, 0, 0, 1, 1, 0, 0, 0], device='cuda:0')


In [148]:
# Hand-built probe: a ramp of multiples of 100 whose second entry is
# overwritten with 0, so the only adjacent duplicate sits at positions 0 and 1.
x = torch.from_numpy(np.arange(SEQUENCE_LENGTH) * 100).long().cuda()
x[1] = 0

# Expected labels: just the first two positions are marked.
y = torch.zeros_like(x)
y[:2] = 1

y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

X:	 tensor([  0,   0, 200, 300, 400, 500, 600, 700, 800, 900], device='cuda:0')
y:	 tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
y_hat:	 tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
