# Lab 02 : Sequence-to-sequence transformers -- exercise

### Task   

The goal is to learn to translate an input sequence to an output sequence, which is simply the same input sequence but shifted to the right by one word.  

Example, if the input sequence is "some analysts expect oil prices to remain relatively" then the output sequence is "analysts expect oil prices to remain relatively high".  

We will use an encoder-decoder Transformer to achieve this goal on the PTB dataset. The decoder will start with a token "start_token" assigned to the 10,001-th word in the dictionary. 

Example, if the input sequence is "456 82 948 5892 34 4928 4758 567" then the output sequence is "10001 82 948 5892 34 4928 4758 567" and  the label sequence is "82 948 5892 34 4928 4758 567 745".


In [None]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/CS5242_2025_codes/labs_lecture07/lab02_seq2seq/'
    print(path_to_file)
    # change current path to the folder containing "file_name"
    os.chdir(path_to_file)
    !pwd

In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

### GPU

It is recommended to run this code on GPU:<br> 
* Time for 1 epoch on GPU : 1.5 sec w/ Google Colab Tesla P100-PCIE-16GB <br>

In [2]:
device= torch.device("cuda")
device= torch.device("cpu")
print(device)

if torch.cuda.is_available():
    print('cuda available with GPU:',torch.cuda.get_device_name(0))

cpu


### Download Penn Tree Bank

The tensor train_data consists of 20 columns of 46,479 words.<br>
The tensor test_data consists of 20 columns of 4,121 words.

In [3]:
from utils import check_ptb_dataset_exists
data_path=check_ptb_dataset_exists()

train_data  =  torch.load(data_path+'ptb/train_data.pt')
test_data   =  torch.load(data_path+'ptb/test_data.pt')

print(  train_data.size()  )
print(  test_data.size()   )

torch.Size([46479, 20])
torch.Size([4121, 20])


In [4]:
# Extract a sub-part of PTB
doc_len = 1001
train_data = train_data[:doc_len,:]
print(  train_data.size()  )

torch.Size([1001, 20])


### Some constants associated with the data set

In [5]:
bs = 20
vocab_size = 10000 + 1 # +1 for the start token

### Make an attention net class

In [None]:

def generate_positional_encoding(seq_length, dim):
    assert dim == 2* (dim//2) # check if dim is divisible by 2
    pe = torch.zeros(seq_length, dim)
    position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / dim))
    pe[:,0::2] = torch.sin(position * div_term)
    pe[:,1::2] = torch.cos(position * div_term)
    return pe        
    

########### Encoder Transformer Block ###########
class AttentionHead_encoder(nn.Module):
    def __init__(self, d, d_head, dropout):
        super().__init__()
        self.query = nn.Linear(d, d_head, bias=False) # query embedding layer
        self.key = nn.Linear(d, d_head, bias=False) # key embedding layer
        self.value = nn.Linear(d, d_head) # value embedding layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H): # size(H)=[batch_size, seq_length, d]
        batch_size = H.size(0); batch_len = H.size(1); d = H.size(2)
        # self-attention encoder
        # COMPLETE HERE 
        Q = self.query(H) # [B, L, d_head]
        K = self.query(H) # [B, L, d_head]
        V = self.query(H) # [B, L, d_head]

        # note: no need mask
        attention_score = Q @ K.transpose(1,2) / d ** 0.5 # [B, L, d_head] @ # [B, d_head, L] -> [B, L, L]
        attention_score = torch.softmax(attention_score, dim=2) # [B, L, L]
        attention_score = self.dropout(attention_score) # dropout attention scores
        H_HA = attention_score @ V # [B, L, L] @ [B, L, d_head] -> [B, L, d_head]
        
        return H_HA # return prediction scores for next token
        
class MultipleAttentionHead_encoder(nn.Module):
    def __init__(self, d, num_heads, dropout):
        super().__init__()
        d_head = d // num_heads # dim_head = d // num_heads, usually dimension per head is 64
        assert d == d_head * num_heads # check divisibility
        self.MHA = nn.ModuleList([ AttentionHead_encoder(d, d_head, dropout) for _ in range(num_heads) ])
        self.WO = nn.Linear(d, d) # combination layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H): # size(H)=[batch_size, seq_length, d]
        batch_size = H.size(0); seq_length = H.size(1); d = H.size(2)
        H_heads = []
        # COMPLETE HERE 
        for ah in self.MHA:
            H_heads.append(ah(H)) # [B, L, d_head]
        
        H_heads = torch.cat(H_heads, dim=2) # size=[batch_size, seq_length, d]            
        H_heads = self.dropout(H_heads) # dropout attention activations
        H = self.WO(H_heads) # size=[batch_size, seq_length, d]
        return H

class TransformerBlock_encoder(nn.Module):
    def __init__(self, d, num_heads, dropout):
        super().__init__()
        self.LN_MHA = nn.LayerNorm(d)
        self.LN_MLP = nn.LayerNorm(d)
        self.MHA = MultipleAttentionHead_encoder(d, num_heads, dropout)
        self.MLP = nn.Sequential(nn.Linear(d,4*d), nn.ReLU(), nn.Dropout(dropout), nn.Linear(4*d,d))   # d -> d
    def forward(self, H): # size=[batch_size, seq_length, d]
        # Self-attention encoder 
        # COMPLETE HERE 
        h_bar = H + self.MHA(self.LN_MHA(H)) # [B, L, d] + [B, L, d]
        # MLP 
        # COMPLETE HERE 
        H = h_bar + self.MLP(self.LN_MLP(h_bar)) # [B, L, d] + [B, L, d]
        
        return H # size=[batch_size, seq_length, d]
    
class Transformer_encoder(nn.Module):
    # L = L_in
    def __init__(self, d, num_heads, num_blocks, seq_length, dropout):
        super().__init__()
        self.TR_Blocks = nn.ModuleList([ TransformerBlock_encoder(d, num_heads, dropout) for _ in range(num_blocks) ]) 
    def forward(self, batch_seq, pos_enc):  # batch_seq [L_in, B, d], pos_enc: [L_in, d]
        H = batch_seq.transpose(1,0)        # [B, L_in, d]
        batch_size = H.size(0); batch_len = H.size(1)
        # Add positional encoding  
        pos_enc = pos_enc.unsqueeze(dim=0)  # [1, L_in, d]
        # COMPLETE HERE 
        H = H + pos_enc                     # [B, L_in, d]
        
        # Apply transformer blocks 
        for TR_Block in self.TR_Blocks:
            H = TR_Block(H)                 # [B, L_in, d]
        # Output
        H = H.permute(1,0,2)  # [B, L_in, d] -> [L_in, B，D]
        return H # return prediction scores for next token
########### Encoder Transformer Block ###########
    

########### Decoder Transformer Block ###########
class SelfAttention_AttentionHead_decoder(nn.Module):
    def __init__(self, d, d_head, dropout):
        super().__init__()
        self.query = nn.Linear(d, d_head, bias=False) # query embedding layer
        self.key = nn.Linear(d, d_head, bias=False) # key embedding layer
        self.value = nn.Linear(d, d_head) # value embedding layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H): # size(H)=[batch_size, seq_length, d]
        # here batch_len is target length
        batch_size = H.size(0); batch_len = H.size(1); d = H.size(2)
        # Masked self-attention decoder
        # COMPLETE HERE 
        Q = self.query(H) # [B, L_out, d_head] 
        K = self.key(H)   # [B, L_out, d_head] 
        V = self.value(H) # [B, L_out, d_head] 

        attention_score = Q @ K.transpose(1, 2) / d ** 0.5 # [B, L_out, d_head] @ # [B, d_head, L_out] = [B, L_out, L_out] 
        mask = torch.tril(torch.ones(batch_len,batch_len)).long().to(device) # mask to use previous tokens only : { token(<=t) }, size=[batch_len,batch_len]
        attention_score = attention_score.masked_fill(mask==0, value=float('-inf')) # softmax(-inf)=0 prevents using next tokens for prediction, size=(batch_size, batch_len, batch_len)
        attention_score = torch.softmax(attention_score, dim=2) # sum weights = 1, size=[B, L_out, L_out]
        attention_score = self.dropout(attention_score) # dropout attention scores
        H_HA = attention_score @ V # softmax( QK^T / sqrt(d) ) V, [B, L_out, L_out] @ [B, L_out, d_head] => [B, L_out, d_head], size=[B, L_out, d_head]
        # B, L_out, d
        return H_HA # return prediction scores for next token
    
class SelfAttention_MultipleAttentionHead_decoder(nn.Module):
    def __init__(self, d, num_heads, dropout):
        super().__init__()
        d_head = d // num_heads # dim_head = d // num_heads, usually dimension per head is 64
        assert d == d_head * num_heads # check divisibility
        self.MHA = nn.ModuleList([ SelfAttention_AttentionHead_decoder(d, d_head, dropout) for _ in range(num_heads) ])
        self.WO = nn.Linear(d, d) # combination layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H):                               # size(H)=[B, L_out, d]
        batch_size = H.size(0); seq_length = H.size(1)
        H_heads = []
        for HA_layer in self.MHA:
            H_heads.append(HA_layer(H))                 # size=[B, L_out, d_head]
        H_heads = torch.cat(H_heads, dim=2)             # size=[B, L_out, d]          
        H_heads = self.dropout(H_heads)                 # dropout attention activations
        H = self.WO(H_heads)                            # size=[B, L_out, d]
        return H

class CrossAttention_AttentionHead_decoder(nn.Module):
    def __init__(self, d, d_head, dropout):
        super().__init__()
        self.query = nn.Linear(d, d_head, bias=False) # query embedding layer
        self.key = nn.Linear(d, d_head, bias=False) # key embedding layer
        self.value = nn.Linear(d, d_head) # value embedding layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H, Henc): # H: [B, L_out, d], Henc: [B, L_in, d]
        # here batch_len is L_out
        batch_size = H.size(0); batch_len = H.size(1); d = H.size(2)
        # Masked cross-attention
        # COMPLETE HERE 
        Q = self.query(H)     # [B, L_out, d_head]
        K = self.key(Henc)    # [B, L_in, d_head]
        V = self.value(Henc)  # [B, L_in, d_head]

        attention_score = Q @ K.transpose(1, 2) / d ** 0.5 # [B, L_out, d_head] @ # [B, d_head, L_in] = [B, L_out, L_in] 
        # I think we don't need mask here but I dont know why TA's answer has
        #mask = torch.tril(torch.ones(batch_len,batch_len)).long().to(device) # mask to use previous tokens only : { token(<=t) }, size=[batch_len,batch_len]
        # attention_score = attention_score.masked_fill(mask==0, value=float('-inf')) # softmax(-inf)=0 prevents using next tokens for prediction, size=(batch_size, batch_len, batch_len)
        attention_score = torch.softmax(attention_score, dim=2) # sum weights = 1, size=[B, L_out, L_in] 
        attention_score = self.dropout(attention_score) # dropout attention scores  [B, L_out, L_in] 
        H_HA = attention_score @ V # softmax( QK^T / sqrt(d) ) V, [B, L_out, L_in]  @ [B, L_in, d_head] => [B, L_out, d_head] , size=[B, L_out, d_head]

        return H_HA # return prediction scores for next token [B, L_out, d_head]
        
class CrossAttention_MultipleAttentionHead_decoder(nn.Module):
    def __init__(self, d, num_heads, dropout):
        super().__init__()
        d_head = d // num_heads                 # dim_head = d // num_heads, usually dimension per head is 64
        assert d == d_head * num_heads          # check divisibility
        self.MHA = nn.ModuleList([ CrossAttention_AttentionHead_decoder(d, d_head, dropout) for _ in range(num_heads) ])
        self.WO = nn.Linear(d, d)               # combination layer
        self.dropout = nn.Dropout(dropout)
    def forward(self, H, Henc):                 # H: [B, L_out, d], Henc: [B, L_in, d]
        batch_size = H.size(0); seq_length = H.size(1)
        H_heads = []
        for HA_layer in self.MHA:
            H_heads.append(HA_layer(H, Henc))   # [B, L_out, d_head]
        H_heads = torch.cat(H_heads, dim=2)     # [B, L_out, d] 
        H_heads = self.dropout(H_heads)         # dropout attention activations
        H = self.WO(H_heads)                    # [B, L_out, d_head]
        return H
    
class TransformerBlock_decoder(nn.Module):
    def __init__(self, d, num_heads, dropout):
        super().__init__()
        self.LN_MHA_H = nn.LayerNorm(d)
        self.LN_MHA_Henc = nn.LayerNorm(d)
        self.LN_MLP = nn.LayerNorm(d)
        self.SA_MHA = SelfAttention_MultipleAttentionHead_decoder(d, num_heads, dropout)
        self.CA_MHA = CrossAttention_MultipleAttentionHead_decoder(d, num_heads, dropout)
        self.MLP = nn.Sequential(nn.Linear(d,4*d), nn.ReLU(), nn.Dropout(dropout), nn.Linear(4*d,d))   
    def forward(self, H, Henc):                 # H: [B, L_out, d], Henc: [B, L_in, d]
        # Masked self-attention decoder
        # COMPLETE HERE 
        H_bar = H + self.SA_MHA(self.LN_MHA_H(H))  # H_bar & H: [B, L_out, d]
        
        # Masked cross-attention decoder
        # COMPLETE HERE 
        H_hat = H_bar + self.CA_MHA(self.LN_MHA_H(H_bar), self.LN_MHA_Henc(Henc))  # H_hat: [B, L_out, d]
        
        # MLP 
        # COMPLETE HERE 
        H = H_hat + self.MLP(self.LN_MLP(H_hat)) # H_hat: [B, L_out, d]
        return H # H_hat: [B, L_out, d]
           
class Transformer_decoder(nn.Module):
    def __init__(self, d, num_heads, num_blocks, seq_length, dropout):
        super().__init__()
        self.TR_Blocks = nn.ModuleList([ TransformerBlock_decoder(d, num_heads, dropout) for _ in range(num_blocks) ]) 
    
    def forward(self, g_seq_out, h_enc_seq, pos_enc):   # g_seq_out: [L_out, B, d], h_enc_seq: [L_in, B, d], pos_enc: [L_out, d]
        H = g_seq_out.transpose(1, 0)       # [B, L_out, d]
        Henc = h_enc_seq.transpose(1, 0)    # [B, L_in, d]
        batch_size = H.size(0); batch_len = H.size(1)
        # Add positional encoding  
        pos_enc = pos_enc.unsqueeze(dim=0)  # [1, L_out, d]
        H = H + pos_enc                     # [B, L_out, d]
        # Apply transformer blocks 
        for TR_Block in self.TR_Blocks:
            H = TR_Block(H, Henc)           # H: [B, L_out, d] H_enc: [B, L_out, d] -> H: [B, L_out, d]
        # Output
        H = H.permute(1,0,2)                # H: [B, L_out, d] -> H: [L_out, B d]
        return H # return prediction scores for next token
########### Decoder Transformer Block ###########

    
class ANN(nn.Module):
    
    def __init__(self, d, num_heads, num_blocks, seq_length, dropout):
        super(ANN, self).__init__()
        self.encoder = Transformer_encoder(d, num_heads, num_blocks, seq_length, dropout)
        self.decoder = Transformer_decoder(d, num_heads, num_blocks, seq_length, dropout)
    
    def forward(self, g_seq_in , g_seq_out, pos ):
        # g_seq_in:  [L_in, B, d]
        # g_seq_out: [L_out, B, d]
        # pos:       [L_max, d]
        h_enc_seq = self.encoder( g_seq_in , pos )              # size=[L_in, B, d], [30, 20, 128]
        h_dec_seq = self.decoder( g_seq_out, h_enc_seq , pos )  # size=[L_out, B, d], [30, 20, 128]
        return h_dec_seq 
    

class attention_net(nn.Module):

    def __init__(self, d, num_heads, num_blocks, seq_length, dropout):
        super(attention_net, self).__init__()  
        self.layer1 = nn.Embedding( vocab_size  , hidden_size  )
        self.layer2 = ANN(d, num_heads, num_blocks, seq_length, dropout)
        self.layer3 = nn.Linear(    hidden_size , vocab_size   )

    def forward(self, word_seq_in, word_seq_out, pos ):
        # word_seq_in:  [L_in, B] (input token IDs)
        # word_seq_out: [L_out, B] (output token IDs)
        # pos:          [L_max, d] (positional encoding table)
        g_seq_in     =   self.layer1( word_seq_in )             # size=[L_in, B, d], [30, 20, 128]
        g_seq_out     =   self.layer1( word_seq_out )           # size=[L_out, B, d], [30, 20, 128]
        h_seq     =   self.layer2( g_seq_in , g_seq_out, pos )  # size=[L_out, B, d], [30, 20, 128] 
        score_seq =   self.layer3( h_seq )                      # size=[L_out, B, vocab_size]
        return score_seq 


### Build the net. Choose the hidden size to be 128 and the number of heads to be 16. 
### How many parameters in total?

In [26]:
hidden_size = 128 
num_heads = 16
num_blocks = 2
seq_length = 100

net = attention_net(hidden_size, num_heads, num_blocks, seq_length, dropout=0.0)
print(net)
utils.display_num_param(net)

attention_net(
  (layer1): Embedding(10001, 128)
  (layer2): ANN(
    (encoder): Transformer_encoder(
      (TR_Blocks): ModuleList(
        (0-1): 2 x TransformerBlock_encoder(
          (LN_MHA): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (LN_MLP): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (MHA): MultipleAttentionHead_encoder(
            (MHA): ModuleList(
              (0-15): 16 x AttentionHead_encoder(
                (query): Linear(in_features=128, out_features=8, bias=False)
                (key): Linear(in_features=128, out_features=8, bias=False)
                (value): Linear(in_features=128, out_features=8, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (WO): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (MLP): Sequential(
            (0): Linear(in_features=128, out_features=512, bias=T

### Send the weights of the networks to the GPU

In [27]:
net = net.to(device)

### Choose the loss to be the cross-entropy and the optimizer to be Adam, as well as the hyperparameters: 
* initial learning rate = 0.001
* sequence length = 100

In [28]:
criterion = nn.CrossEntropyLoss()

my_lr = 0.001
seq_length = 100
optimizer = torch.optim.Adam(net.parameters(), lr=my_lr)

pos = generate_positional_encoding(seq_length, hidden_size).to(device) # size=(seq_length, hidden_dim)

### Do 5 passes through the training set
### Observe the train perplexity

In [29]:
start=time.time()
for epoch in range(50):
    
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
    doc_len = train_data.size(0)
    for count in range( 0 , doc_len-seq_length ,  seq_length): 
        
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch
        minibatch_seq_in = train_data[count:count+seq_length]
        start_token = torch.tensor([vocab_size-1]).repeat(1,bs)
        minibatch_seq_out = torch.cat((start_token, train_data[count+1:count+seq_length]))
        minibatch_label = train_data[count+1:count+seq_length+1]
        
        # send them to the gpu
        minibatch_seq_in = minibatch_seq_in.to(device)
        minibatch_seq_out = minibatch_seq_out.to(device)
        minibatch_label = minibatch_label.to(device)
        
        # forward the minibatch through the net        
        scores = net( minibatch_seq_in, minibatch_seq_out, pos ) # size=(seq_length, bs, vocab_size)

        # reshape the scores and labels to huge batch of size bs*seq_length
        scores = scores.view(  bs*seq_length , vocab_size) # size=(seq_length.bs, vocab_size)
        minibatch_label = minibatch_label.view(  bs*seq_length ) # size=(seq_length.bs, vocab_size)
       
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(scores, minibatch_label)
        
        # backward pass to compute dL/dR, dL/dV and dL/dW
        loss.backward()

        # do one step of stochastic gradient descent: R=R-lr(dL/dR), V=V-lr(dL/dV), ...
        optimizer.step()
        
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    if not epoch%1:
        print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))


epoch= 0 	 time= 4.271215200424194 	 lr= 0.001 	 exp(loss)= 3723.2157235638233
epoch= 1 	 time= 8.349858045578003 	 lr= 0.001 	 exp(loss)= 673.5133838998547
epoch= 2 	 time= 12.450402021408081 	 lr= 0.001 	 exp(loss)= 453.38824427978386
epoch= 3 	 time= 16.79450297355652 	 lr= 0.001 	 exp(loss)= 371.8541335294012
epoch= 4 	 time= 20.852574586868286 	 lr= 0.001 	 exp(loss)= 307.7660698980238
epoch= 5 	 time= 24.89178442955017 	 lr= 0.001 	 exp(loss)= 254.49373490395504
epoch= 6 	 time= 28.935091733932495 	 lr= 0.001 	 exp(loss)= 209.27383666106348
epoch= 7 	 time= 32.996251821517944 	 lr= 0.001 	 exp(loss)= 170.00844172662713
epoch= 8 	 time= 37.16367149353027 	 lr= 0.001 	 exp(loss)= 136.46126507394268
epoch= 9 	 time= 41.20633149147034 	 lr= 0.001 	 exp(loss)= 109.33712269769273
epoch= 10 	 time= 45.52356219291687 	 lr= 0.001 	 exp(loss)= 87.84456223400461
epoch= 11 	 time= 49.56982064247131 	 lr= 0.001 	 exp(loss)= 70.92855936630086
epoch= 12 	 time= 53.53654909133911 	 lr= 0.001 	 e

### Check if the network was successful 

In [24]:
minibatch_seq_in = train_data[count:count+seq_length, 1].unsqueeze(1).to(device)
print('Input sequence:', minibatch_seq_in[:,0])
start_token = torch.tensor([vocab_size-1]).repeat(1,bs).to(device)
minibatch_seq_out = torch.cat((start_token, train_data[count+1:count+seq_length].to(device)))
minibatch_seq_out = minibatch_seq_out[:,1].unsqueeze(1)
minibatch_label = train_data[count+1:count+seq_length+1,1].unsqueeze(1).to(device)
print('\nExpected output sequence:', minibatch_label[:,0])
pos = generate_positional_encoding(seq_length, hidden_size) # size=(seq_length, hidden_dim) 
pos = pos.to(device)
scores = net( minibatch_seq_in, minibatch_seq_out, pos ) # size=(seq_length, bs, vocab_size)
seq = scores.squeeze().argmax(dim=1)
print('\nPredicted output sequence:', seq)
            

Input sequence: tensor([  64, 1519,   24,   32, 5166, 3719,   48, 1705, 3940,   32,   26, 1485,
          42, 2774, 2196,   32, 5166,   64,   26,  270, 1522, 1978,  108,   32,
         833, 2876, 2810, 1342,   32,   26, 2072, 5167,   42, 5164,   26,   98,
         108,   35, 5168, 3755,  636,   32, 5159,   48,  729,   64,   65, 2813,
        2217,  591,  424,   32, 2476,   83,   93, 1040,  432, 3303,   24,   32,
         929,  887,   32,  101, 5166,  546,  315,  895,  108, 1035, 3719,  243,
          93,  504,  874,  566, 5169,   64, 5170, 5171,  169,   26, 5166,  156,
        5172,   24,   32, 5166,   98,   93,  152, 1881,  135,  623, 2358,   32,
          26, 1978,  566,  895])

Expected output sequence: tensor([1519,   24,   32, 5166, 3719,   48, 1705, 3940,   32,   26, 1485,   42,
        2774, 2196,   32, 5166,   64,   26,  270, 1522, 1978,  108,   32,  833,
        2876, 2810, 1342,   32,   26, 2072, 5167,   42, 5164,   26,   98,  108,
          35, 5168, 3755,  636,   32, 5159, 