### 2023.10.19 - Introduction to Transformers | Homework 1
In this exercise, you will implement your own character-based Tokenizer as well as an Embedding layer from scratch.

To make your tokenizer more robust, add a special `<UNK>` token to your vocabulary.
If a token (when encoding) or a token id (when decoding) is not found in the vocabulary, the id of `<UNK>` or the `<UNK>` token itself, respectively, should be returned instead.

In [None]:
import torch
from torch import Tensor
from typing import List, Iterator

In [142]:
class Tokenizer:
    """Character-based tokenizer skeleton; every method is an exercise stub."""

    def __init__(self, vocab: List[str]):
        """Set up the tokenizer from ``vocab`` (stub: currently does nothing)."""
        # TODO: store the vocabulary and make sure '<UNK>' is part of it.

    def parse(self, input: str) -> List[str]:
        """Convert a string to a list of characters (stub: returns None)."""
        # TODO: implement.

    def encode(self, tokens: List[str]) -> List[int]:
        """Encode a list of tokens into their indices (stub: returns None)."""
        # TODO: implement; unknown tokens should map to the id of '<UNK>'.

    def decode(self, indices: List[int]) -> str:
        """Decode a list of indices back into a string (stub: returns None)."""
        # TODO: implement; unknown ids should map to '<UNK>'.

In [160]:
class Embedding:
    """Embedding-layer skeleton; the constructor and ``forward`` are exercise stubs."""

    def __init__(self, n_embd: int, d_embd: int):
        """Set up the layer (stub: currently does nothing)."""
        # TODO: create the lookup table from n_embd and d_embd.

    def forward(self, input: Tensor) -> Tensor:
        """Perform a lookup for the given indices in the embedding table.

        Stub: nothing is looked up yet, so this returns None.
        """
        # TODO: implement.

    def __call__(self, input: Tensor) -> Tensor:
        """Make the layer callable by delegating to :meth:`forward`."""
        return self.forward(input)

In [146]:
# Define your vocabulary here: list("...") splits a string into single characters.
vocab: List[str] = list("")
# ...

### Solution Example Simple

In [117]:
import torch
from torch import Tensor
from typing import List, Iterator

In [150]:
class Tokenizer:
    """Character-level tokenizer with an ``'<UNK>'`` fallback.

    Attributes are plain lookup tables built once in the constructor:
    ``vocab`` (the token list), ``token2idx`` (token -> index) and
    ``idx2token`` (index -> token).
    """

    def __init__(self, vocab: List[str]):
        """Build the lookup tables for ``vocab``.

        NOTE: ``vocab`` is kept by reference and, if it lacks ``'<UNK>'``,
        that token is appended to the caller's list in place. Code that sizes
        things with ``len(vocab)`` after constructing the tokenizer therefore
        sees the extra entry.
        """
        has_unk = '<UNK>' in vocab
        if not has_unk:
            vocab.append('<UNK>')

        self.vocab = vocab
        # For duplicated tokens the last position wins in token2idx.
        self.token2idx = dict(zip(vocab, range(len(vocab))))
        self.idx2token = dict(enumerate(vocab))

    def parse(self, input: str) -> List[str]:
        """Split ``input`` into a list of its individual characters."""
        return [char for char in input]

    def encode(self, tokens: List[str]) -> List[int]:
        """Map each token to its index; tokens not in the vocabulary get the ``'<UNK>'`` index."""
        lookup = self.token2idx.get
        unk_idx = lookup('<UNK>', None)
        return [lookup(token, unk_idx) for token in tokens]

    def decode(self, indices: List[int]) -> str:
        """Map each index to its token and concatenate; unknown indices become ``'<UNK>'``."""
        pieces = (self.idx2token.get(idx, '<UNK>') for idx in indices)
        return ''.join(pieces)

In [159]:
# Build a tokenizer over the lowercase Latin alphabet.
vocab = list("abcdefghijklmnopqrstuvwxyz")
tokenizer = Tokenizer(vocab)

# Parsing: split the word into single-character tokens.
tokens = tokenizer.parse('caterpillar')
print(f"tokenizer.parse: {tokens}")

# Encoding: map every token to its vocabulary index.
token_ids = tokenizer.encode(tokens)
print(f"tokenizer.encode: {token_ids}")

# Decoding: map the indices back to text.
decoded = tokenizer.decode(token_ids)
print(f"tokenizer.decode: {decoded}")

# <UNK> handling: '$' is not in the vocabulary and 100 is not a valid index.
unknown_round_trip = tokenizer.decode(tokenizer.encode(['$']))
print(f"tokenizer.encode/decode unknown: {unknown_round_trip}")
out_of_bounds = tokenizer.decode([100])
print(f"tokenizer.decode out of bounds: {out_of_bounds}")

tokenizer.parse: ['c', 'a', 't', 'e', 'r', 'p', 'i', 'l', 'l', 'a', 'r']
tokenizer.encode: [2, 0, 19, 4, 17, 15, 8, 11, 11, 0, 17]
tokenizer.decode: caterpillar
tokenizer.encode/decode unknown: <UNK>
tokenizer.decode out of bounds: <UNK>


In [168]:
class Embedding:
    """Embedding table that maps integer indices to row vectors.

    The table is a plain ``(n_embd, d_embd)`` tensor sampled from a standard
    normal distribution. It is created without ``requires_grad``, so this
    version is not trainable.
    """

    def __init__(self, n_embd: int, d_embd: int):
        """Create the lookup table with ``n_embd`` rows of ``d_embd`` values each."""
        self.lookup = torch.randn(n_embd, d_embd)

    def forward(self, input: Tensor) -> Tensor:
        """Perform a lookup for the given indices in the embedding table.

        Args:
            input: Integer index tensor of any shape; may be empty.

        Returns:
            Tensor of shape ``(*input.shape, d_embd)`` holding the selected rows.
            An empty ``input`` yields an empty result.

        Raises:
            ValueError: If any index is negative or not smaller than the
                number of rows in the table.
        """
        # max()/min() raise on a tensor with no elements, so only range-check
        # when there is at least one index to check.
        if input.numel() > 0:
            n_rows = self.lookup.size(0)
            # Negative indices are rejected explicitly; tensor indexing would
            # otherwise wrap them around to the end of the table.
            if input.min() < 0 or input.max() >= n_rows:
                raise ValueError("Input tensor contains invalid indices for lookup table.")
        return self.lookup[input, :]

    def __call__(self, input: Tensor) -> Tensor:
        """Make the layer callable by delegating to :meth:`forward`."""
        return self.forward(input)

In [180]:
# Size the table from the tokenizer's own vocabulary so there is a row for
# every id the tokenizer can emit, including the one for '<UNK>'.
embedding_layer = Embedding(len(tokenizer.vocab), 3)
# Named `indices` rather than `input` so the built-in input() is not shadowed
# for the rest of the session.
indices = torch.tensor(tokenizer.encode(tokenizer.parse('caterpillar')))
result = embedding_layer(indices)

print(f"input ({indices.size()}):\n{indices}\n")
print(f"embedding_layer result ({result.size()}):\n{result}")

input (torch.Size([11])):
tensor([ 2,  0, 19,  4, 17, 15,  8, 11, 11,  0, 17])

embedding_layer result (torch.Size([11, 3])):
tensor([[-0.4900, -1.4057, -0.5188],
        [ 2.4123,  0.7004,  1.8279],
        [ 0.2239,  0.3873, -1.4005],
        [ 0.1093, -1.2144,  3.0684],
        [-0.4954, -0.2158,  1.0090],
        [-0.1661,  0.2986,  0.3961],
        [ 0.3524,  0.7499,  0.7890],
        [-0.4177,  0.8679,  1.6980],
        [-0.4177,  0.8679,  1.6980],
        [ 2.4123,  0.7004,  1.8279],
        [-0.4954, -0.2158,  1.0090]])


#### Solution Example Advanced

In [None]:
class Embedding:
    """Trainable embedding table that maps integer indices to row vectors.

    The table is a ``(n_embd, d_embd)`` tensor sampled from a standard normal
    distribution and created with ``requires_grad=True``, so gradients flow
    back into the rows that were looked up.
    """

    def __init__(self, n_embd: int, d_embd: int):
        """Create the lookup table with ``n_embd`` rows of ``d_embd`` values each."""
        self.lookup = torch.randn(n_embd, d_embd, requires_grad=True)

    def forward(self, input: Tensor) -> Tensor:
        """Perform a lookup for the given indices in the embedding table.

        Args:
            input: Integer index tensor of any shape; may be empty.

        Returns:
            Tensor of shape ``(*input.shape, d_embd)`` holding the selected rows.
            An empty ``input`` yields an empty result.

        Raises:
            ValueError: If any index is negative or not smaller than the
                number of rows in the table.
        """
        # max()/min() raise on a tensor with no elements, so only range-check
        # when there is at least one index to check.
        if input.numel() > 0:
            n_rows = self.lookup.size(0)
            # Negative indices are rejected explicitly; tensor indexing would
            # otherwise wrap them around to the end of the table.
            if input.min() < 0 or input.max() >= n_rows:
                raise ValueError("Input tensor contains invalid indices for lookup table.")
        return self.lookup[input, :]

    def __call__(self, input: Tensor) -> Tensor:
        """Make the layer callable by delegating to :meth:`forward`."""
        return self.forward(input)

    def parameters(self) -> Iterator[Tensor]:
        """Return an iterator over the parameters (only the lookup table)."""
        yield self.lookup

In [124]:
# Re-create the layer from the Embedding class defined directly above: the
# instance built earlier comes from the previous class definition, which has
# no parameters() method.
embedding_layer = Embedding(len(tokenizer.vocab), 3)
embedding_layer.parameters()

<generator object Embedding.parameters at 0x169adef80>

In [135]:
def sgd(parameters, lr=0.01):
    """Apply one in-place gradient-descent step and reset the gradients.

    Each parameter is updated as ``param -= lr * param.grad`` and its
    gradient is then zeroed in place.

    Args:
        parameters: Iterable of tensors to update.
        lr: Step size multiplied with each gradient.

    Parameters whose ``.grad`` is ``None`` (they took part in no backward
    pass) are left untouched instead of raising a ``TypeError``.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in parameters:
            if param.grad is None:
                continue
            param -= lr * param.grad
            param.grad.zero_()

In [136]:
# `outputs` has to come from a forward pass through a trainable layer, so
# build the layer from the latest Embedding class and run the lookup here.
embedding_layer = Embedding(len(tokenizer.vocab), 3)
batch = torch.tensor(tokenizer.encode(tokenizer.parse('caterpillar')))
outputs = embedding_layer(batch)

# Toy objective: the sum of all looked-up values.
loss = outputs.sum()
loss.backward()
sgd(embedding_layer.parameters())