In [45]:
import torch
import numpy as np
import torch.nn as nn
import os
import math
import torch.nn.functional as F 

In [98]:
class SciFiConfig:
    """Hyper-parameters for the models defined in this notebook.

    The values are plain class attributes, so the class object itself can be
    handed to a model wherever a config is expected.
    """

    # Number of entries in the token vocabulary (n_vocab of the tiktoken
    # encoding cl100k_base).
    vocab_size: int = 100277
    # Width of a token embedding vector (the GPT-2 value).
    n_embd: int = 768
    
class MLP(nn.Module):
    """Feed-forward language model applied to each token id.

    A token id is embedded, sent through one GELU hidden layer that is four
    times as wide as the embedding, and projected to vocabulary logits.
    """

    def __init__(self, config) -> None:
        """Create the layers from ``config.vocab_size`` and ``config.n_embd``."""
        super().__init__()
        self.config = config

        hidden_dim = 4 * config.n_embd
        # Keep this registration order: both the constructors and the
        # ``apply`` call below draw random numbers in the order listed here.
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.fc = nn.Linear(config.n_embd, hidden_dim)
        self.gelu = nn.GELU()
        self.proj = nn.Linear(hidden_dim, config.vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Re-initialise the weight of a single sub-module (hook for ``apply``).

        Embedding weights are drawn uniformly from [-1, 1] and linear weights
        use Xavier-uniform; biases and all other module types are untouched.
        """
        if isinstance(module, nn.Embedding):
            nn.init.uniform_(module.weight, a=-1.0, b=1.0)
            return
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)

    def forward(self, idx, targets=None):
        """Compute logits for ``idx`` and, when ``targets`` is given, the loss.

        Returns:
            ``(logits, loss)``. ``logits`` has the shape of ``idx`` with a
            trailing vocabulary dimension. ``loss`` is ``None`` when
            ``targets`` is ``None``; otherwise it is the cross-entropy over
            all positions, skipping targets equal to -1.
        """
        embedded = self.wte(idx)  # (..., n_embd)
        hidden = self.gelu(self.fc(embedded))
        logits = self.proj(hidden)  # (..., vocab_size)

        if targets is None:
            return logits, None

        vocab_dim = logits.size(-1)
        loss = F.cross_entropy(
            logits.view(-1, vocab_dim),
            targets.view(-1),
            ignore_index=-1,
        )
        return logits, loss

# ---------------------------------------------------------------------------------
import tiktoken
import numpy as np

def load_tokens(filename):
    """Load integer token ids from a text file into a ``torch.long`` tensor.

    The file is parsed with ``np.loadtxt`` as 32-bit integers and then copied
    into a tensor of dtype ``torch.long``.

    Args:
        filename: path (or file object) accepted by ``np.loadtxt``.

    Returns:
        A tensor with at least one dimension holding the token ids.
    """
    # ndmin=1: without it a file containing a single token loads as a 0-d
    # array, and the resulting 0-d tensor cannot be sliced by the data loader.
    token_array = np.loadtxt(filename, dtype=np.int32, ndmin=1)
    return torch.tensor(token_array, dtype=torch.long)

class DataLoaderSciFi:
    """Sequential batch loader over the token file ``data/tokens.txt``.

    Each call to ``next_batch`` returns the next ``B * T`` tokens as inputs
    together with the same window shifted by one token as targets, then moves
    on. When fewer than ``B * T + 1`` tokens remain, the loader wraps around
    to the start of the token stream.
    """

    def __init__(self, B, T, split=None):
        """Store the batch geometry and load the tokens.

        Args:
            B: number of sequences per batch.
            T: number of tokens per sequence.
            split: accepted but currently unused.
        """
        self.B = B
        self.T = T

        # get filename of dataset
        self.data_dir = 'data'
        self.tokens_filename = 'tokens.txt'
        self.tokens_path = os.path.join(self.data_dir, self.tokens_filename)

        self.reset()

    def reset(self):
        """(Re)load the tokens from ``self.tokens_path`` and rewind to position 0."""
        self.tokens = load_tokens(self.tokens_path)
        self.cur_pos = 0

    def next_batch(self):
        """Return the next ``(x, y)`` pair, both of shape ``(B, T)``.

        ``y`` is ``x`` shifted one token to the right in the stream. The token
        stream must hold at least ``B * T + 1`` tokens; otherwise the reshape
        raises.
        """
        B, T = self.B, self.T
        span = B * T
        start = self.cur_pos
        # One extra token so the last input position still has a target.
        buf = self.tokens[start : start + span + 1]
        x = buf[:span].view(B, T)
        y = buf[1:].view(B, T)

        # Advance the cursor; previously it never moved, so every call
        # returned the very same batch.
        self.cur_pos = start + span
        # Wrap around when the next full window would run past the end.
        if self.cur_pos + span + 1 > len(self.tokens):
            self.cur_pos = 0
        return x, y

In [36]:
# import tiktoken

# enc = tiktoken.get_encoding('cl100k_base')
# text = "Hello scientific fiction!"
# tokens = torch.tensor(enc.encode(text))
# targets = torch.cat((tokens[1:], torch.tensor([-1])), dim=-1)

# print(f"Encoded text: {tokens}")
# print(f"Targets: {targets}")
# print(f"The vocab_size of cl100k_base is {enc.n_vocab}.")

Encoded text: tensor([ 9906, 12624, 17422,     0])
Targets: tensor([12624, 17422,     0,    -1])
The vocab_size of cl100k_base is 100277.


In [99]:
# Smoke test: run one batch through a freshly constructed model.
B, T = 4, 16
loader = DataLoaderSciFi(B=B, T=T)
x, y = loader.next_batch()  # inputs and targets as returned by the loader

model = MLP(SciFiConfig)  # the config class itself serves as the config object
logits, loss = model(idx=x, targets=y)

# Report the logits shape and the loss for this single batch.
print(f'The shape of pred: {logits.shape}.')
print(f'loss={loss:.4f}')

The shape of pred: torch.Size([4, 16, 100277]).
loss=11.5188
