In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

## Implementation of scaled dot product attention

$$ \text{Attention}(Q, K, V) = \text{softmax}(\frac{Q K^T}{\sqrt{d_k}}) V $$
With $Q$ the query, $K$ the key and $V$ the value as follows:

$$ Q = W_q \times X $$
$$ K = W_k \times X $$
$$ V = W_v \times X $$

Suppose that $X \in \mathbb{R}^{N \times d_x}$, $W_q \in \mathbb{R}^{d_q \times d_x}$, $W_k \in \mathbb{R}^{d_k \times d_x}$, $W_v \in \mathbb{R}^{d_v \times d_x}$. Then the scaled dot product attention belong to $\mathbb{R}^{N \times d_v}$

In [2]:
def scaled_dot_product(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor]=None):
    d_k = q.size(-1)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -math.inf)
    p_attn = F.softmax(scores, dim=-1)
    return torch.matmul(p_attn, v), p_attn

Example for three sequences and embeddings of size 3:

$$ X = \begin{bmatrix}
    x_1 & x_2 & x_3 \\
    x_4 & x_5 & x_6 \\
    x_7 & x_8 & x_9 \\
\end{bmatrix} $$

We have the following matrices:

$$ W_q = \begin{bmatrix}
    q_1 & q_2 & q_3 \\
    q_4 & q_5 & q_6 \\
    q_7 & q_8 & q_9 \\
\end{bmatrix} $$

$$ W_k = \begin{bmatrix}
    k_1 & k_2 & k_3 \\
    k_4 & k_5 & k_6 \\
    k_7 & k_8 & k_9 \\
\end{bmatrix} $$

$$ W_v = \begin{bmatrix}
    v_1 & v_2 & v_3 \\
    v_4 & v_5 & v_6 \\
    v_7 & v_8 & v_9 \\
\end{bmatrix} $$

In [3]:
X = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)
Q = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)
K = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)
V = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)
v, attn = scaled_dot_product(Q, K, V)

In [4]:
print(attn)

tensor([[9.4047e-10, 3.0667e-05, 9.9997e-01],
        [2.7127e-23, 5.2083e-12, 1.0000e+00],
        [7.8241e-37, 8.8454e-19, 1.0000e+00]])


In [5]:
## With mask

mask = torch.tensor([[1, 0, 0],
                     [1, 1, 0],
                     [1, 1, 1]],
                     dtype=torch.float32
                     )
v, attn = scaled_dot_product(Q, K, V, mask)
print(attn)

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00],
        [5.2083e-12, 1.0000e+00, 0.0000e+00],
        [7.8241e-37, 8.8454e-19, 1.0000e+00]])


In [6]:
class Attention(nn.Module):
    def __init__(self, d_in):
        super(Attention, self).__init__()
        self.W_k = nn.Linear(d_in, d_in)
        self.W_q = nn.Linear(d_in, d_in)
        self.W_v = nn.Linear(d_in, d_in)
        
    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                mask: Optional[torch.Tensor]=None):
        Q = self.W_q(query)
        K = self.W_k(key)
        V = self.W_v(value)
        return scaled_dot_product(Q, K, V, mask)


class MultiHeadAttention1(nn.Module):
    def __init__(self, n_heads: int, d_in: int):
        super(MultiHeadAttention1, self).__init__()
        self.n_heads = n_heads
        self.attention = nn.ModuleList([Attention(d_in) for _ in range(n_heads)])
        self.linear = nn.Linear(n_heads * d_in, d_in)
        
    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                mask: Optional[torch.Tensor]=None) -> torch.Tensor:

        output = torch.cat([attn(query, key, value, mask)[0] for attn in self.attention], dim=-1)
        output = self.linear(output)
        return  output
        

In [43]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads: int, d_model: int):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                mask: Optional[torch.Tensor] = None):
        B, T, D = query.size()

        # Linear projections
        print(f"query shape: {query.shape} key shape: {key.shape} value shape: {value.shape}")
        Q = self.W_q(query).view(B, T, self.n_heads, self.d_head).transpose(1, 2)  # [B, H, T, d_head]
        K = self.W_k(key).view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        V = self.W_v(value).view(B, T, self.n_heads, self.d_head).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_head)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn_weights, V)  # [B, H, T, d_head]
        context = context.transpose(1, 2).contiguous().view(B, T, D)  # [B, T, D]
        return self.out_proj(context)


In [44]:
attn = Attention(3)
query = torch.randn(2, 5, 3)
key = torch.randn(2, 5, 3)
value = torch.randn(2, 5, 3)

output = attn(query, key, value)
print(output[0].shape)

torch.Size([2, 5, 3])


In [45]:
batch_size = 2
seq_len = 5
d_model = 16
n_heads = 4

query = torch.randn(batch_size, seq_len, d_model)
key = torch.randn(batch_size, seq_len, d_model)
value = torch.randn(batch_size, seq_len, d_model)
mha = MultiHeadAttention(n_heads=n_heads, d_model=d_model)
output = mha(query, key, value)
output.shape

query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])


torch.Size([2, 5, 16])

In [46]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x: torch.Tensor):
        x = x + self.pe[:, :x.size(1)]  # type: ignore
        return x

In [47]:
d_model = 16
seq_len = 10
batch_size = 2

# Random input tensor [batch_size, seq_len, d_model]
x = torch.randn(batch_size, seq_len, d_model)

# Instantiate and run PositionalEncoding
pos_encoder = PositionalEncoding(d_model)
x_encoded = pos_encoder(x)

In [48]:
class FFN(nn.Module):
    def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
        super(FFN, self).__init__()
        self.linear = nn.Sequential(
            nn.Linear(d_model, d_ffn),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(d_ffn, d_model),
            nn.Dropout(dropout)
        )
        
    def forward(self, x: torch.Tensor):
        return self.linear(x)

In [49]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(n_heads, d_model)
        self.ffn = FFN(d_model, d_ffn, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
        attn_output = self.mha(x, x, x, mask)
        attn_output = self.dropout(attn_output)
        x = self.norm1(x + attn_output)
        ffn_output = self.ffn(x)
        ffn_output = self.dropout(ffn_output)
        x = self.norm2(x + ffn_output)
        return x

In [50]:
batch_size = 2
seq_len = 5
d_model = 16
n_heads = 4
d_ffn = 64
encoder = EncoderLayer(d_model=d_model, n_heads=n_heads, d_ffn=d_ffn)
x = torch.randn(batch_size, seq_len, d_model)
output = encoder(x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)

query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
Input shape: torch.Size([2, 5, 16])
Output shape: torch.Size([2, 5, 16])


In [51]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ffn, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(n_heads, d_model)
        self.mha2 = MultiHeadAttention(n_heads, d_model)
        self.ffn = FFN(d_model, d_ffn, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, encoder_output: torch.Tensor,
                src_mask: Optional[torch.Tensor] = None,
                tgt_mask: Optional[torch.Tensor] = None):
        _x = x
        x = self.mha1(x, x, x, tgt_mask)
        x = self.dropout(x)
        x = self.norm1(x + _x)
        _x = x
        x = self.mha2(x, encoder_output, encoder_output, src_mask)
        x = self.dropout(x)
        x = self.norm2(x + _x)
        _x = x
        x = self.ffn(x)
        x = self.dropout(x)
        x = self.norm3(x + _x)
        return x

In [52]:
batch_size = 2
seq_len = 5
d_model = 16
n_heads = 4
d_ffn = 64

x = torch.randn(batch_size, seq_len, d_model)
encoder = EncoderLayer(d_model, n_heads, d_ffn)

output = encoder(x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)

decoder = DecoderLayer(d_model, n_heads, d_ffn)

output = decoder(x, x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)


query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
Input shape: torch.Size([2, 5, 16])
Output shape: torch.Size([2, 5, 16])
query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
Input shape: torch.Size([2, 5, 16])
Output shape: torch.Size([2, 5, 16])


In [64]:
class Encoder(nn.Module):
    def __init__(self, vocab_size: int, d_model: int, n_heads: int,
                 d_ffn: int, num_layers: int, dropout: float=0.1):
        super(Encoder, self).__init__()
        self.token_embd = nn.Embedding(vocab_size, d_model)
        self.embd = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ffn, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        x = self.token_embd(x)
        x = self.embd(x)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

In [65]:
class Decoder(nn.Module):
    def __init__(self, vocab_size: int, d_model: int, n_heads: int,
                 d_ffn: int, num_layers: int, dropout: float=0.1):
        super(Decoder, self).__init__()
        self.token_embd = nn.Embedding(vocab_size, d_model)
        self.embd = PositionalEncoding(d_model)
        self.dropout = nn.Dropout(dropout)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ffn, dropout)
                                     for _ in range(num_layers)])

    def forward(self, x, encoder_output, src_mask=None, tgt_mask=None):
        x = self.token_embd(x)
        x = self.embd(x)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return x

In [80]:
class Transformer(nn.Module):
    def __init__(self, enc_vocab_size: int,dec_vocab_size: int, d_model: int,
                 n_heads: int, d_ffn: int, num_layers: int, dropout: float=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(vocab_size=enc_vocab_size, d_model=d_model, n_heads=n_heads,
                               d_ffn=d_ffn, num_layers=num_layers, dropout=dropout)
        self.decoder = Decoder(vocab_size=dec_vocab_size, d_model=d_model, n_heads=n_heads,
                               d_ffn=d_ffn, num_layers=num_layers, dropout=dropout)
        self.out_proj = nn.Linear(d_model, dec_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        encoder_output = self.encoder(src, src_mask)
        decoder_output = self.decoder(tgt, encoder_output, src_mask, tgt_mask)
        return self.out_proj(decoder_output)

In [81]:
batch_size = 2
seq_len = 5
d_model = 16
n_heads = 4
d_ffn = 64
num_layers = 2
vocab_size = 100

x = torch.randint(0, vocab_size, (batch_size, seq_len))  # token indices for encoder input
y = torch.randint(0, vocab_size, (batch_size, seq_len))  # token indices for decoder input

transformer = Transformer(enc_vocab_size=vocab_size,dec_vocab_size=vocab_size,
                          d_model=d_model, n_heads=n_heads, d_ffn=d_ffn,
                          num_layers=num_layers)

output = transformer(x, y)
print("Input shape:", x.shape)
print("Output shape:", output.shape)

query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
query shape: torch.Size([2, 5, 16]) key shape: torch.Size([2, 5, 16]) value shape: torch.Size([2, 5, 16])
Input shape: torch.Size([2, 5])
Output shape: torch.Size([2, 5, 100])


- [Ref1](https://github.com/hyunwoongko/transformer/tree/master)
- [Ref2](https://goyalpramod.github.io/blogs/Transformers_laid_out/)

In [82]:
from datasets import load_dataset
PAD_ID = 0
SOS_ID = 1
EOS_ID = 2
dataset = load_dataset("bentrevett/multi30k")

In [83]:
def generate_square_subsequent_mask(sz):
    """Generate a lower triangular matrix for causal masking."""
    return torch.tril(torch.ones((sz, sz), dtype=torch.bool))


def create_masks(src, tgt_input, pad_id=PAD_ID):
    src_mask = (src != pad_id).unsqueeze(1).unsqueeze(2)  # [B, 1, 1, S]
    tgt_mask = (tgt_input != pad_id).unsqueeze(1).unsqueeze(2)  # [B, 1, 1, T]
    size = tgt_input.size(1)
    causal_mask = generate_square_subsequent_mask(size).to(tgt_input.device)  # [T, T]
    combined_mask = tgt_mask & causal_mask.unsqueeze(0).unsqueeze(0)  # [B, 1, T, T]
    return src_mask, combined_mask

In [84]:
class Multi30kDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset, tokenizer, max_len=50):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        src_text = self.dataset[idx]["en"]
        tgt_text = self.dataset[idx]["de"]
        src_tokens = self.tokenizer.encode(src_text)[:self.max_len]
        tgt_tokens = self.tokenizer.encode(tgt_text)[:self.max_len]
        return {
            "src": torch.tensor(src_tokens, dtype=torch.long),
            "tgt": torch.tensor(tgt_tokens, dtype=torch.long),
        }


def collate_fn(batch):
    def pad_sequence(seqs, pad_id=PAD_ID):
        max_len = max(seq.size(0) for seq in seqs)
        return torch.stack([
            torch.cat([seq, torch.full((max_len - len(seq),), pad_id, dtype=torch.long)])
            for seq in seqs
        ])

    src_seqs = [item["src"] for item in batch]
    tgt_seqs = [item["tgt"] for item in batch]

    return pad_sequence(src_seqs), pad_sequence(tgt_seqs)

In [85]:
from loguru import logger
from collections import defaultdict
from torch.utils.data import DataLoader
from sacrebleu.metrics import BLEU, CHRF

def averager(beta: float = 1):
    """
    Returns a single function that can be called to repeatidly obtain
    a running average from a dictionary of metrics.
    The callback will return the new averaged dict of metrics.

    `beta` is the decay parameter. If `beta == 1`, a regular running
    average is performed. If `beta < 1`, an exponential moving average
    is performed instead.
    """
    count = defaultdict(float)
    total = defaultdict(float)

    def _update(metrics: dict, weight: float = 1) -> dict:
        nonlocal total, count
        for key, value in metrics.items():
            total[key] = total[key] * beta + weight * float(value)
            count[key] = count[key] * beta + weight
        return {key: tot / count[key] for key, tot in total.items()}
    return _update


def do_epoch(epoch: int, model: nn.Module, loader: DataLoader, criterion: nn.Module, tokenizer, optimizer=None):
    """Run a single epoch, either in training or evaluation mode, if `optimizer` is None."""

    device = next(model.parameters()).device
    model.train() if optimizer is not None else model.eval()
    average = averager()
    bleu = BLEU()
    chrf = CHRF()

    for src, tgt in loader:
        print("src", src.shape, "tgt", tgt.shape)
        src, tgt = src.to(device), tgt.to(device)
        src_mask, tgt_mask = create_masks(src, tgt)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        with torch.set_grad_enabled(optimizer is not None):
            prediction = model(src, tgt_input,
                            #    src_mask=src_mask, tgt_mask=tgt_mask,
                               )
            print("prediction", prediction.shape)
            print("tgt_output", tgt_output.shape)
            loss = criterion(prediction.reshape(-1, prediction.size(-1)), tgt_output.reshape(-1))
            pred_tokens = prediction.argmax(dim=-1)
            references = [[tokenizer.decode(ref.tolist(), skip_special_tokens=True).split()] for ref in tgt_output]
            hypotheses = [tokenizer.decode(hyp.tolist(), skip_special_tokens=True).split() for hyp in pred_tokens]
            score = bleu.corpus_score(hypotheses=hypotheses, references=references)
            chrf_score = chrf.corpus_score(hypotheses=hypotheses, references=references)

        metrics = {
            'loss': loss,
            'bleu': score.score,
            'chrf': chrf_score.score

        }
        metrics = average(metrics)

        if optimizer is not None:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    label = 'test' if optimizer is None else 'train'
    logger.info(f'Epoch {epoch:03d} {label: <5} summary '
                f'loss: {metrics["loss"]:.3f}, '
                f'acc.: {metrics["accuracy"]:6.2%}')
    return metrics

In [86]:
import tiktoken

def main():
    device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
    tokenizer = tiktoken.get_encoding("gpt2")

    # Load datasets
    train_ds = load_dataset("bentrevett/multi30k", split="train").take(10)
    val_ds = load_dataset("bentrevett/multi30k", split="validation").take(5)

    # Build datasets + loaders
    train_dataset = Multi30kDataset(train_ds, tokenizer, max_len=50)
    
    val_dataset = Multi30kDataset(val_ds, tokenizer, max_len=50)

    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

    # Model
    vocab_size = tokenizer.n_vocab
    model = Transformer(enc_vocab_size=vocab_size, dec_vocab_size=vocab_size,
                        d_ffn=2048, d_model=512, n_heads=8, num_layers=6).to(device)

    # Loss + optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # Train loop
    epochs = 5
    for epoch in range(epochs):
        train_loss = do_epoch(model=model,criterion=criterion, loader=train_loader, optimizer=optimizer,tokenizer=tokenizer, epoch=epoch)
        val_loss = do_epoch(model=model,criterion=criterion, loader=val_loader,tokenizer=tokenizer, epoch=epoch)
        print(f"Epoch {epoch+1} / {epochs}: Train loss {train_loss:.4f} | Val loss {val_loss:.4f}")

In [87]:
main()

src torch.Size([2, 15]) tgt torch.Size([2, 34])
query shape: torch.Size([2, 15, 512]) key shape: torch.Size([2, 15, 512]) value shape: torch.Size([2, 15, 512])
query shape: torch.Size([2, 15, 512]) key shape: torch.Size([2, 15, 512]) value shape: torch.Size([2, 15, 512])
query shape: torch.Size([2, 15, 512]) key shape: torch.Size([2, 15, 512]) value shape: torch.Size([2, 15, 512])
query shape: torch.Size([2, 15, 512]) key shape: torch.Size([2, 15, 512]) value shape: torch.Size([2, 15, 512])
query shape: torch.Size([2, 15, 512]) key shape: torch.Size([2, 15, 512]) value shape: torch.Size([2, 15, 512])
query shape: torch.Size([2, 15, 512]) key shape: torch.Size([2, 15, 512]) value shape: torch.Size([2, 15, 512])
query shape: torch.Size([2, 33, 512]) key shape: torch.Size([2, 33, 512]) value shape: torch.Size([2, 33, 512])
query shape: torch.Size([2, 33, 512]) key shape: torch.Size([2, 15, 512]) value shape: torch.Size([2, 15, 512])


RuntimeError: shape '[2, 33, 8, 64]' is invalid for input of size 15360

In [None]:
train_ds = load_dataset("bentrevett/multi30k", split="train").take(10)
tokenizer = tiktoken.get_encoding("gpt2")
train_dataset = Multi30kDataset(train_ds, tokenizer, max_len=50)

In [26]:
next(iter(train_dataset))

{'src': tensor([ 7571,  1862,    11,  2635, 10835,   389,  2354,  1474,   867, 37413,
            13]),
 'tgt': tensor([   57, 42990, 10891,   469,   356,    72, 39683,    68,   337, 11033,
            77,  1008,   264,   521,   545,  4848,  2013,   287,  4587,   399,
         11033,   258,   410,  8207,   263,   347,  9116, 15952,    13])}