# Building Transformer

### Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers.tokenization_utils_base import BatchEncoding

import pandas as pd
from typing import Dict, List

## Tokenizer

In [2]:
# Hugging Face checkpoint whose tokenizer (and vocabulary size) the notebook reuses.
model_name = "aubmindlab/bert-large-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): in the visible cells `model` is only used to display its config —
# confirm the full pretrained weights are actually needed.
model = AutoModel.from_pretrained(model_name)

In [3]:
# Display the configuration of the loaded checkpoint.
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64000
}

### Load Dataset

In [4]:
# Load the "arbml/ArSAS" dataset; the cells below read its "train" split.
dataset = load_dataset("arbml/ArSAS")

In [5]:
# Read the raw tweet strings straight from the "Tweet_text" column of the train split.
# Column access avoids materialising the whole split as a pandas DataFrame just to
# extract one field; list(...) guarantees a plain Python list for the tokenizer.
text_corpus: List[str] = list(dataset["train"]["Tweet_text"])

In [6]:
# Number of tweets in the corpus.
len(text_corpus)

19897

In [7]:
# Tokenization
# Encode every tweet with the pretrained tokenizer; NanoDataset later reads the
# "input_ids" entry of this result.
# NOTE(review): truncation=True is passed without max_length, so config.max_seq_len
# (defined in a later cell) is not applied here — confirm the intended length cap.
text_tokens = tokenizer(text_corpus, truncation=True)

### Config

In [8]:
class config:
    """Hyper-parameters and special-token ids shared by the data pipeline and the model.

    Used as a plain namespace: attributes are read directly from the class
    (e.g. ``config.batch_size``), it is never instantiated in the visible cells.
    """

    # --- data loading ---
    batch_size = 10

    # --- model size ---
    hidden_size = 768
    n_heads = 12
    n_layers = 4
    max_seq_len = 128
    vocab_size = tokenizer.vocab_size  # taken from the pretrained tokenizer
    base = 10_000  # presumably a positional-encoding base; unused in the visible cells — TODO confirm
    bias = True
    dropout = 0.01  # was assigned twice with the same value; the duplicate line is removed

    # --- special token ids ---
    pad_id = 0  # same value as "pad_token_id" in the loaded checkpoint's config
    # NOTE(review): hard-coded ids — confirm they equal tokenizer.cls_token_id / tokenizer.sep_token_id.
    cls_token = 33
    sep_token = 34

    # --- loss ---
    # Label value used to pad targets in collate_fn; positions with this value must not
    # contribute to the loss.
    ignored_index = -100
    # ignore_index is passed explicitly so the loss and the label padding value cannot
    # drift apart (-100 is also PyTorch's default, so current behaviour is unchanged).
    criterion = nn.CrossEntropyLoss(ignore_index=ignored_index)

## Dataset & DataLoader

In [9]:
class NanoDataset(Dataset):
    """Next-token-prediction dataset over a pre-tokenized corpus.

    Each item is one tokenized text split into an input sequence and the same
    sequence shifted left by one position (the labels).
    """

    def __init__(self, corpus: BatchEncoding):
        """Keep the ``"input_ids"`` entry of ``corpus`` (one id sequence per text)."""
        self.corpus_tokens = corpus["input_ids"]

    def __len__(self):
        """Return the number of tokenized texts."""
        return len(self.corpus_tokens)

    def __getitem__(self, idx):
        """Return the ``idx``-th text as an (input, shifted-label) pair.

        Returns:
            dict with
              - ``"input_ids"``: 1-D LongTensor, all tokens except the last one
              - ``"labels"``:    1-D LongTensor, all tokens except the first one
        """
        ids = torch.tensor(self.corpus_tokens[idx], dtype=torch.long)

        if ids.numel() < 2:
            # Too short to form an (input, label) pair: fall back to [CLS, SEP].
            # The fallback must be a tensor as well — a plain Python list here would
            # make pad_sequence in the collate function fail for this item.
            ids = torch.tensor([config.cls_token, config.sep_token], dtype=torch.long)

        return {"input_ids": ids[:-1], "labels": ids[1:]}

In [10]:
def collate_fn(batch: List[Dict[str, torch.Tensor]]):
    """Pad a list of dataset items into one batch.

    Args:
        batch: items as returned by ``NanoDataset.__getitem__``, e.g.

            >>> ds = NanoDataset(tokenized_corpus)
            >>> batch = [ds[i] for i in range(4)]

    Returns:
        dict with three ``(B, T)`` tensors, ``T`` being the longest input in the batch:
          - ``"input_ids"``: inputs right-padded with ``config.pad_id``
          - ``"labels"``: labels right-padded with ``config.ignored_index``
          - ``"attention_mask"``: 1 on real tokens, 0 on padding (long)
    """
    input_ids = [b["input_ids"] for b in batch]
    labels = [b["labels"] for b in batch]

    # Record the true lengths before padding so the mask does not depend on token values.
    seq_lens = [len(seq) for seq in input_ids]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=config.pad_id)
    labels = pad_sequence(labels, batch_first=True, padding_value=config.ignored_index)

    # Build the mask from lengths instead of `input_ids != pad_id`: a genuine token whose
    # id happens to equal the pad id would otherwise be masked out.
    lengths = torch.tensor(seq_lens, device=input_ids.device)
    positions = torch.arange(input_ids.size(1), device=input_ids.device)
    attention_mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }

In [11]:
# Wrap the tokenized corpus in the next-token-prediction dataset.
ds = NanoDataset(text_tokens)

In [12]:
# Training loader: shuffled mini-batches of config.batch_size items, padded by collate_fn.
# NOTE(review): no random seed is set in the visible cells, so the shuffled batch order
# differs between runs — confirm whether reproducibility is required.
train_dataloader = DataLoader(
    dataset=ds,
    batch_size=config.batch_size,
    collate_fn=collate_fn,
    shuffle=True,
    )

In [13]:
# Sanity check: print the tensor shapes of the first collated batch, then stop.
for batch in train_dataloader:
    print(*(batch[key].shape for key in ("input_ids", "labels", "attention_mask")))
    break

torch.Size([10, 76]) torch.Size([10, 76]) torch.Size([10, 76])


## Modelling

### Attention Module

In [None]:
class CSAttention(nn.Module):
    """Causal self-attention module (skeleton — not implemented yet).

    Every ``...`` below is a placeholder that still has to be replaced. As written,
    ``forward`` cannot run: ``q, k, v = ...`` tries to unpack ``Ellipsis``.
    """

    def __init__(self, config):
        super().__init__()
        # Target computation: Softmax((Q @ K^T) / sqrt(d) + M) V
        # (M is presumably the additive causal mask — TODO confirm)
        self.qkv = ...  # placeholder for the layer producing Q, K and V
        self.proj = ...  # placeholder for the output projection

    def forward(self, x):
        # Input x: B T D
        q, k, v = ...  # placeholder: derive Q, K, V from x
        out = ...  # placeholder: attention output
        # out: B T D
        return out 

### MLP Module

In [None]:
class MLP(nn.Module):
    """Feed-forward (MLP) sub-layer (skeleton — layers not implemented yet).

    As written, ``__init__`` creates no sub-modules and ``forward`` returns its
    input unchanged.
    """

    def __init__(self, config):
        super().__init__()
        # Planned layers, in order:
        # Linear Layer D x 4D
        # + GeLU
        # + Linear Layer 4D x D
        # + Dropout

    def forward(self, x):
        # Input x: B T D
        
        # Output x: B T D (currently the unmodified input)
        return x

### Block

In [None]:
class Block(nn.Module):
    """One transformer block (skeleton — not implemented yet).

    As written, ``__init__`` creates no sub-modules and ``forward`` returns its
    input unchanged.
    """

    def __init__(self, config):
        super().__init__()
        # Planned components:
        # LayerNorm
        # + Attention
        # + LayerNorm
        # + MLP
        # + Residual
    
    
    def forward(self, x):
        # Input x: B T D
        
        # Output x: B T D (currently the unmodified input)
        return x

### GPT

In [None]:
class GPT(nn.Module):

    def __init__(self, config):
        """Skeleton constructor: only initialises ``nn.Module``; no sub-modules are created yet.

        The comments below outline the planned layout of the model.
        """
        super().__init__()
        # transformer -> dict
        #   - wte: word-embedding (V, D)
        #   - wpe: pos-embedding (T, D)
        #   - drp: Dropout
        #   - h: -> list
        #       -  Blocks x n_layers
        #   - ln_f: LayerNorm
        # lm_head: D x V (tied)
        #

    def forward(self, x, attention_mask=None):
        """Forward pass (skeleton — currently returns ``x`` unchanged).

        Args:
            x: model input. ``generate`` passes token ids of shape B x T here and
                treats the result as logits of shape B x T x V.
            attention_mask: optional mask accompanying ``x``. It is accepted because
                ``generate`` calls ``self.forward(input_ids, attention_mask)``; with the
                previous one-argument signature that call raised ``TypeError``.
                It is not used yet by this placeholder.

        Returns:
            ``x`` itself, until the real computation is implemented.
        """
        # Input x: B T (token ids, per `generate`'s docstring)

        # Planned output: B T V logits — not implemented yet
        return x

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        max_new_tokens: int,
        temperature: float,
        top_k: int,
        eos: int,
    ):
        """
        input_ids: B x T
        """
        for _ in range(max_new_tokens):
            # Generate Logits
            logits = self.forward(input_ids, attention_mask) 
            # B T V
            last_token_logits = ... # B 1 V
            # Pick the top_k tokens with highest prob
            if top_k:
                ...
            # Sampling
            if temperature > 0:
                ...
            
            props = ...
            last_token = ...

            # Next Step preparation
            input_ids = ...
            attention_mask = ...

            if eos is not None: