## Toy Problem 3

Now that we have the first problem mostly working, let's try another.


Given a sequence of numbers, find and mark adjacent duplicates. Duplicates can be of length 2-5.

```
input  = 0 3 5 9 3 3 5 2 5
output = 0 0 0 0 1 1 0 0 0

input  = 1 2 2 2 2 2 3 1 5
output = 0 1 1 1 1 1 0 0 0

```


In [15]:
pessimisticdef generate_data(num_examples: int, seq_len: int, vocab_size: int):
    inputs = np.random.randint(0, vocab_size, size=(num_examples, seq_len))
    outputs = np.zeros_like(inputs)
    
    for i in range(len(inputs)):
        
        # choose number of duplicates to introduce
        num_duplicates = np.random.randint(3, 10)
        
        # choose random location to introduce our duplicates
        location = np.random.randint(0, len(inputs[i]) - num_duplicates)
        
        # Choose what number we'd like to repeat
        number_to_repeat = np.random.randint(0, vocab_size)
        
        inputs[i][location:location + num_duplicates] = number_to_repeat
        
        # Mark location where duplicates exist
        outputs[i][location:location + num_duplicates] = 1
        

    return inputs, outputs

In [16]:
# We're lazy so we're going to just use a large vocabulary 
# instead of manually removing accidental duplicates
generate_data(num_examples=1, seq_len=10, vocab_size=1000)

(array([[668, 750, 750, 750, 750, 750, 256, 436, 632, 263]]),
 array([[0, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))

In [17]:
class ToyDataset(Dataset):
   
    def __init__(self, num_examples, sequence_length, vocab_size):
        self.items, self.labels = generate_data(num_examples, sequence_length, vocab_size)
        
    def __getitem__(self, idx):
        
        x = torch.Tensor(self.items[idx]).long()
        y = torch.Tensor(self.labels[idx]).long()
        return x.cuda(), y.cuda()
    
    def __len__(self):
        return len(self.items)

In [18]:
NUM_EXAMPLES = 10000
SEQUENCE_LENGTH = 10
VOCAB_SIZE = 100
BATCH_SIZE = 64

train_ds = ToyDataset(num_examples=NUM_EXAMPLES, sequence_length=SEQUENCE_LENGTH, vocab_size=VOCAB_SIZE)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

In [19]:
class ToyModel(torch.nn.Module):
    """
    Wrapper around a BERT model that predicts a single binary output for each input token
    """
    
    def __init__(self, vocab_size):
        super().__init__()
        
        # Untrained BERT Model
        config = BertConfig(vocab_size_or_config_json_file=vocab_size)
        self.bert_model = BertModel(config)
        self.linear = torch.nn.Linear(in_features=768, out_features=2)
        
    def forward(self, x):    
        out, _ = self.bert_model(x)
        out = self.linear(out)
        return out    

In [20]:
model = ToyModel(VOCAB_SIZE)
model = model.cuda()

In [21]:
# NOTE: We use RAdam to avoid having to use warmup
# If we use regular Adam, this usually won't converge for long sequences
optimizer = RAdam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [22]:
NUM_EPOCHS = 35

for i in range(NUM_EPOCHS):
    
    for train_step, (x, y) in enumerate(train_dl):
        
        model.zero_grad()
        output = model(x)
        
        loss = loss_fn(output.view((-1, output.size(-1))), y.view(-1))
        
        #print(output[0][:10])
        if train_step % 100 == 0:
            print(i, train_step, loss)
               
        loss.backward()
        optimizer.step()

0 0 tensor(0.8194, device='cuda:0', grad_fn=<NllLossBackward>)
0 100 tensor(0.4995, device='cuda:0', grad_fn=<NllLossBackward>)
1 0 tensor(0.4675, device='cuda:0', grad_fn=<NllLossBackward>)
1 100 tensor(0.4274, device='cuda:0', grad_fn=<NllLossBackward>)
2 0 tensor(0.3787, device='cuda:0', grad_fn=<NllLossBackward>)
2 100 tensor(0.3015, device='cuda:0', grad_fn=<NllLossBackward>)
3 0 tensor(0.2349, device='cuda:0', grad_fn=<NllLossBackward>)
3 100 tensor(0.2139, device='cuda:0', grad_fn=<NllLossBackward>)
4 0 tensor(0.1484, device='cuda:0', grad_fn=<NllLossBackward>)
4 100 tensor(0.1420, device='cuda:0', grad_fn=<NllLossBackward>)
5 0 tensor(0.0877, device='cuda:0', grad_fn=<NllLossBackward>)
5 100 tensor(0.0831, device='cuda:0', grad_fn=<NllLossBackward>)
6 0 tensor(0.0533, device='cuda:0', grad_fn=<NllLossBackward>)
6 100 tensor(0.0690, device='cuda:0', grad_fn=<NllLossBackward>)
7 0 tensor(0.0243, device='cuda:0', grad_fn=<NllLossBackward>)
7 100 tensor(0.0400, device='cuda:0', gra

In [13]:
def get_output_for_example(x):
    
    logits = model(x.unsqueeze(0))
    probs = torch.softmax(logits, dim=2)
    out = torch.argmax(probs, 2)
    
    return out 

In [14]:
# Check if we're learned how to perform on items in the train dataset
x, y = train_ds[0]
y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

X:	 tensor([205, 456, 961, 961, 961, 961, 749, 574, 150, 492], device='cuda:0')
y:	 tensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0], device='cuda:0')
y_hat:	 tensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0], device='cuda:0')


In [190]:
# Check if we've learned how to perform on items out of sample
x = torch.from_numpy(np.arange(SEQUENCE_LENGTH)).long().cuda() * 100
x[0] = 9
x[1] = 1
x[2] = 222
x[3] = 50
x[4] = 250
x[8] = 800
x[9] = 800
y = torch.zeros_like(x)
y[0] = 1
y[1] = 1
y[2] = 1
y[3] = 0
y[4] = 0
y_hat = get_output_for_example(x)

print("X:\t", x)
print("y:\t", y)
print("y_hat:\t", y_hat.squeeze())

X:	 tensor([  9,   1, 222,  50, 250, 500, 600, 700, 800, 800], device='cuda:0')
y:	 tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
y_hat:	 tensor([0, 0, 0, 0, 1, 1, 0, 0, 0, 0], device='cuda:0')
