### 2023.10.19 - Introduction to Transformers | Homework 1
In this exercise, you will implement your own character-based Tokenizer as well as an Embedding layer from scratch.

_Task 1: Character-based Tokenizer:_
- Implement a basic character-based tokenizer. Be sure to include a special `<UNK>` ("unknown") token to handle characters outside your vocabulary.
  - The tokenizer should be capable of:
    - Parsing a string into a list of characters.
    - Encoding a list of characters into their corresponding indices in the vocabulary.
    - Decoding a list of indices back into a string.
    - When encoding, return the token ID for <UNK> for any character not in the vocabulary. Similarly, when decoding, return the <UNK> token for any unknown token ID.
- Initialize your vocabulary with a list of unique characters. Letters, digits, and common punctuation are a good starting point. At a minimum, your initial vocabulary should include the lowercase English letters (a-z), the digits (0-9), and common punctuation marks (e.g., `.`, `!`, `?`).

_Task 2 - Embedding Layer:_
- Implement an embedding layer from scratch. This layer should be able to:
    - Initialize an embedding table with random values.
    - Look up and return embeddings for a given list of indices.
    - Handle potential out-of-bounds errors when looking up embeddings.


In [23]:
import torch
from torch import Tensor
from typing import List, Iterator

In [3]:
class Tokenizer:
    """Character-based tokenizer skeleton for Task 1 (to be filled in).

    Every method body is still ``pass``, so each call currently returns
    ``None``. Per the task description, the finished class should map
    characters to vocabulary indices and back, using a special ``<UNK>``
    token for anything outside the vocabulary.
    """

    def __init__(self, vocab: List[str]):
        """Set up the tokenizer from ``vocab``, a list of unique characters."""
        # TODO: add the <UNK> token if it's not already in the vocabulary
        pass

    def parse(self, input: str) -> List[str]:
        """Convert a string to a list of characters.

        TODO: not implemented yet; currently returns ``None``.
        """
        pass

    def encode(self, tokens: List[str]) -> List[int]:
        """Encode a list of tokens into their corresponding indices.

        Per the task description, a token that is not in the vocabulary
        should be encoded as the ID of the ``<UNK>`` token.

        TODO: not implemented yet; currently returns ``None``.
        """
        pass

    def decode(self, indices: List[int]) -> str:
        """Decode a list of indices back into a string.

        Per the task description, an unknown token ID should be decoded as
        the ``<UNK>`` token.

        TODO: not implemented yet; currently returns ``None``.
        """
        pass

In [4]:
class Embedding:
    """Embedding-layer skeleton for Task 2 (to be filled in).

    Per the task description, the finished layer should hold a randomly
    initialized embedding table, return the rows for a given set of
    indices, and handle out-of-bounds indices.
    """

    def __init__(self, n_embd: int, d_embd: int):
        """Create the embedding table.

        The test cell passes ``len(vocab)`` as ``n_embd`` and ``3`` as
        ``d_embd``, and expects an 11-index input to yield an ``[11, 3]``
        result.

        TODO: not implemented yet.
        """
        pass

    def forward(self, input: Tensor) -> Tensor:
        """Perform a lookup for the given indices in the embedding table.

        TODO: not implemented yet; currently returns ``None``.
        """
        pass
        
    def __call__(self, input: Tensor) -> Tensor:
        """Make the layer callable by delegating to ``forward``."""
        return self.forward(input)

In [43]:
def assert_raises(fn, *args, **kwargs):
    """Check that calling ``fn(*args, **kwargs)`` raises an exception.

    Any ``Exception`` subclass counts as the expected failure; it is
    reported on stdout and swallowed. If the call completes normally, an
    ``AssertionError`` is raised instead.
    """
    try:
        fn(*args, **kwargs)
    except Exception as caught:
        print(f"Expected error occurred: {type(caught).__name__} - {caught}")
    else:
        # The call succeeded, which is the failure case for this helper.
        raise AssertionError("Expected error did not occur")

In [38]:
vocab = list('') # define your vocabulary here (one character per entry)
tokenizer = Tokenizer(vocab)

print("============ Tokenizer")
# Test parsing: split a sample word into its characters
tokens = tokenizer.parse('caterpillar')
print(f"tokenizer.parse: {tokens}")

# Test encoding: map each character to its vocabulary index
token_ids = tokenizer.encode(tokens)
print(f"tokenizer.encode: {token_ids}")

# Test decoding: map the indices back to a string
print(f"tokenizer.decode: {tokenizer.decode(token_ids)}")

# Test <UNK>: '$' is expected to be outside the vocabulary, and 100 is
# expected to be outside the range of valid token IDs
print(f"tokenizer.encode/decode unknown: {tokenizer.decode(tokenizer.encode(['$']))}")
print(f"tokenizer.decode out of bounds: {tokenizer.decode([100])}\n")

# Test the embedding layer
print("============ Embedding Layer")

# NOTE: len(vocab) only counts <UNK> if the Tokenizer appended it to this
# same list in place; otherwise the table will be one row too small.
n_embd = len(vocab)
d_embd = 3
embedding_layer = Embedding(n_embd, d_embd)

input_tensor = torch.tensor(tokenizer.encode(tokenizer.parse('caterpillar')))
result = embedding_layer(input_tensor)

print(f"input ({input_tensor.size()}):\n{input_tensor}\n")
print(f"embedding_layer result ({result.size()}):\n{result}")

# Ensure the layer raises an exception on an invalid index
# (n_embd is the first index past the end of the table)
assert_raises(embedding_layer, torch.tensor([n_embd]))

tokenizer.parse: ['c', 'a', 't', 'e', 'r', 'p', 'i', 'l', 'l', 'a', 'r']
tokenizer.encode: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tokenizer.decode: <UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK><UNK>
tokenizer.encode/decode unknown: <UNK>
tokenizer.decode out of bounds: <UNK>

input (torch.Size([11])):
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

embedding_layer result (torch.Size([11, 3])):
tensor([[ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531],
        [ 1.1413, -1.2545, -1.0531]])
Expected error occurred: ValueError - Input tensor contains invalid indices for lookup table.


### Simple Example Solution

In [24]:
import torch
from torch import Tensor
from typing import List, Iterator

In [39]:
class Tokenizer:
    """Character-level tokenizer with an ``<UNK>`` fallback.

    Attributes are ``vocab`` (the token list), ``token2idx`` (token to
    index) and ``idx2token`` (index to token).
    """

    def __init__(self, vocab: List[str]):
        """Build the lookup tables from ``vocab``.

        ``vocab`` is stored as-is, without copying. If it lacks the
        ``'<UNK>'`` token, that token is appended to the caller's list in
        place, so ``len(vocab)`` afterwards includes it.
        """
        if '<UNK>' not in vocab:
            vocab.append('<UNK>')
        self.vocab = vocab

        # Index -> token first, then invert it; for a duplicated token the
        # highest index wins in token2idx.
        self.idx2token = dict(enumerate(vocab))
        self.token2idx = {
            symbol: position for position, symbol in self.idx2token.items()
        }

    def parse(self, input: str) -> List[str]:
        """Split ``input`` into a list of its individual characters."""
        return [*input]

    def encode(self, tokens: List[str]) -> List[int]:
        """Map each token to its index; unknown tokens get the ``<UNK>`` index."""
        unk_id = self.token2idx.get('<UNK>', None)
        lookup = self.token2idx.get
        return [lookup(symbol, unk_id) for symbol in tokens]

    def decode(self, indices: List[int]) -> str:
        """Join the tokens for ``indices``; unknown indices become ``'<UNK>'``."""
        pieces = (self.idx2token.get(position, '<UNK>') for position in indices)
        return ''.join(pieces)

In [40]:
class Embedding:
    """Minimal embedding layer backed by a plain tensor lookup table.

    The table is stored in ``self.lookup`` with one row per index.
    """

    def __init__(self, n_embd: int, d_embd: int):
        """Create an ``(n_embd, d_embd)`` table filled by ``torch.randn``.

        Args:
            n_embd: Number of rows, i.e. the number of valid indices.
            d_embd: Length of each embedding vector.
        """
        self.lookup = torch.randn(n_embd, d_embd)

    def forward(self, input: Tensor) -> Tensor:
        """Perform a lookup for the given indices in the embedding table.

        Args:
            input: Tensor of row indices into the table.

        Returns:
            The selected rows, shaped ``input.shape + (d_embd,)``. An empty
            ``input`` yields an empty result of that shape.

        Raises:
            ValueError: If any index is negative or not smaller than the
                number of rows in the table.
        """
        if input.numel() == 0:
            # max()/min() raise RuntimeError on an empty tensor, and
            # torch.tensor([]) defaults to a float dtype that cannot be used
            # for indexing, so build the empty result directly.
            return self.lookup.new_empty((*input.shape, self.lookup.size(1)))
        if input.max() >= self.lookup.size(0) or input.min() < 0:
            raise ValueError("Input tensor contains invalid indices for lookup table.")
        return self.lookup[input]

    def __call__(self, input: Tensor) -> Tensor:
        """Make the layer callable by delegating to ``forward``."""
        return self.forward(input)

In [42]:
# Vocabulary: upper- and lowercase ASCII letters plus a few punctuation marks.
vocab = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,!.;?")
tokenizer = Tokenizer(vocab)

print("============ Tokenizer")

# Round trip: string -> characters -> indices -> string.
tokens = tokenizer.parse('caterpillar!')
print(f"tokenizer.parse: {tokens}")

token_ids = tokenizer.encode(tokens)
print(f"tokenizer.encode: {token_ids}")

decoded = tokenizer.decode(token_ids)
print(f"tokenizer.decode: {decoded}")

# <UNK> handling: '$' is not in the vocabulary, and 100 is not a valid index.
unknown_ids = tokenizer.encode(['$'])
print(f"tokenizer.encode/decode unknown: {tokenizer.decode(unknown_ids)}")
out_of_bounds = tokenizer.decode([100])
print(f"tokenizer.decode out of bounds: {out_of_bounds}\n")

print("============ Embedding Layer")

# Tokenizer appended '<UNK>' to `vocab` in place, so len(vocab) counts it too.
d_embd = 3
n_embd = len(vocab)
embedding_layer = Embedding(n_embd, d_embd)

char_ids = tokenizer.encode(tokenizer.parse('caterpillar'))
input_tensor = torch.tensor(char_ids)
result = embedding_layer(input_tensor)

print(f"input ({input_tensor.size()}):\n{input_tensor}\n")
print(f"embedding_layer result ({result.size()}):\n{result}")

# n_embd is one past the last valid row, so this lookup must raise.
assert_raises(embedding_layer, torch.tensor([n_embd]))

tokenizer.parse: ['c', 'a', 't', 'e', 'r', 'p', 'i', 'l', 'l', 'a', 'r', '!']
tokenizer.encode: [28, 26, 45, 30, 43, 41, 34, 37, 37, 26, 43, 53]
tokenizer.decode: caterpillar!
tokenizer.encode/decode unknown: <UNK>
tokenizer.decode out of bounds: <UNK>

input (torch.Size([11])):
tensor([28, 26, 45, 30, 43, 41, 34, 37, 37, 26, 43])

embedding_layer result (torch.Size([11, 3])):
tensor([[-5.1405e-01,  2.7522e+00, -4.2396e-01],
        [-8.2343e-02, -7.0187e-02,  7.0366e-01],
        [ 7.0164e-01,  1.2855e+00, -1.2115e+00],
        [-2.6335e-01,  3.0554e-01,  1.9410e-01],
        [-7.7179e-01,  6.3235e-01, -3.4200e-01],
        [ 9.2339e-01, -2.9262e-01,  4.7156e-01],
        [-3.8477e-02,  6.6269e-01, -1.0486e+00],
        [ 1.9154e-01, -2.1600e-03, -5.6658e-01],
        [ 1.9154e-01, -2.1600e-03, -5.6658e-01],
        [-8.2343e-02, -7.0187e-02,  7.0366e-01],
        [-7.7179e-01,  6.3235e-01, -3.4200e-01]])
Expected error occurred: ValueError - Input tensor contains invalid indices for 