In [1]:
import numpy as np

import transformers
import torch 
import torch.nn as nn

from tqdm.notebook import tqdm
from typing import Any, Callable, List, Optional
from torch import Tensor
from transformers import AutoTokenizer

from datasets import load_dataset

## Import Dataset

Simple English Wikipedia (the `20220301.simple` configuration of the Hugging Face `wikipedia` dataset).

In [53]:
# Load the Simple English Wikipedia dump. It only contains a training split
# since it is a language-modeling corpus.
simple_wiki_dataset = load_dataset('wikipedia', "20220301.simple")
train_split = simple_wiki_dataset['train']

# len() of the split is its row count (one row per title); this avoids
# materializing the whole 'title' column just to count it.
print(f"Dataset contains {len(train_split)} titles")


# take text from first 1000 titles
# (slice the rows first, then select the column, so only those 1000 articles
# are read instead of the full 'text' column)
training_text = train_split[:1000]['text']

# get unique characters from text, kept in order of first appearance
unique_chars = []
seen_chars = set()  # O(1) membership test; scanning the list was linear per character
for text in tqdm(training_text):
    for char in text:
        if char not in seen_chars:
            seen_chars.add(char)
            unique_chars.append(char)

Dataset contains 205328 titles


  0%|          | 0/1000 [00:00<?, ?it/s]

In [55]:
# number of distinct characters collected in unique_chars (character vocabulary size)
len(unique_chars)

472

In [52]:
# Inspect a single sampled article.
# Index `training_text` explicitly: once the character-collection loop has run,
# `text` is only that loop's last article string, so `text[917]` would yield a
# single character (or raise IndexError) on a fresh Restart & Run All.
# [(i, len(x)) for i, x in enumerate(training_text)]
training_text[917]

'A thousand (1000, one thousand or 1,000) is the natural number after 999 and before 1001. One thousand thousands is known as a million.\n\nIn Roman numerals, 1000 is written as M.\n\nExamples of a thousand \n The number of grams in a kilogram\n The number of millimeters in a meter\n The number of years in a millennium\n\n19E03 1000'

## Preprocess dataset
Tokenization

In [None]:
# character level encoding and decoding
class CharacterTokenizer:
    """Character-level tokenizer over a fixed vocabulary.

    Each entry of ``vocab`` is assigned the integer id equal to its position
    in the list. Two lookup tables are kept: ``char2idx`` (character -> id)
    and ``idx2char`` (id -> character).
    """

    def __init__(self, vocab: List[str]):
        """Build both lookup tables from ``vocab``.

        Args:
            vocab: characters the tokenizer knows; position in the list is the id.
        """
        self.vocab = vocab

        # character -> id; if a character repeats in vocab, its last position wins
        self.char2idx = {}
        for position, symbol in enumerate(self.vocab):
            self.char2idx[symbol] = position

        # id -> character
        self.idx2char = dict(enumerate(self.vocab))

    def encode(self, text: str) -> List[int]:
        """Map every character of ``text`` to its id.

        Raises:
            KeyError: if ``text`` contains a character missing from the vocabulary.
        """
        ids = []
        for symbol in text:
            ids.append(self.char2idx[symbol])
        return ids

    def decode(self, tokens: List[int]) -> str:
        """Map a sequence of ids back to the string they spell.

        Raises:
            KeyError: if ``tokens`` contains an id missing from the vocabulary.
        """
        pieces = [self.idx2char[token_id] for token_id in tokens]
        return ''.join(pieces)

# create a character tokenizer


In [37]:
# # BERT tokenizer
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# # tokenize dataset
# tokenized_dataset = simple_wiki_dataset.map(
#     lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=512),
#     batched=True,
#     remove_columns=["text"],
# )

Map: 100%|██████████| 205328/205328 [01:04<00:00, 3203.58 examples/s]


## Transformer

### Self attention

In [None]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Projects the input to queries, keys and values with three bias-free linear
    layers and returns ``softmax(Q K^T / sqrt(d_k)) V``.
    """

    def __init__(self, input_dim=512, embed_dim=64):
        """
        Args:
            input_dim: size of the last dimension of the input features.
            embed_dim: size of the query/key/value projections (``d_k``).
        """
        super().__init__()

        self.d_k = embed_dim

        self.Wq = nn.Linear(input_dim, self.d_k, bias=False)
        self.Wk = nn.Linear(input_dim, self.d_k, bias=False)
        self.Wv = nn.Linear(input_dim, self.d_k, bias=False)

        # Normalize over the last axis (the key positions) so that every
        # query's attention weights sum to 1.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor of shape (B, T, input_dim); an unbatched (T, input_dim)
                tensor also works because only the last two axes are used.
        Returns:
            attention: Tensor of shape (B, T, embed_dim)
        """
        q = self.Wq(x)  # (B, T, d_k)
        k = self.Wk(x)  # (B, T, d_k)
        v = self.Wv(x)  # (B, T, d_k)

        # scaled dot product attention
        # Swap only the last two axes of k; `k.T` would reverse every axis,
        # including the batch axis.
        scores = q @ k.transpose(-2, -1) / self.d_k**0.5  # (B, T, T)
        weights = self.softmax(scores)

        # Weighted sum of the values is a matrix product, not an elementwise one.
        attention = weights @ v  # (B, T, d_k)

        return attention

### Multihead Attention

### Feedforward network

### Encoder

### Decoder

### Positional embeddings

## Test