In [45]:
import torch
import numpy as np
import torch.nn as nn
import os
import math
import torch.nn.functional as F 

In [71]:
class SciFiConfig:
    vocab_size: int = 100277  # cl100k-base
    n_embd: int = 768  # GPT-2
    
class MLP(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config
    
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU()
        self.proj = nn.Linear(4 * config.n_embd, config.vocab_size)
        
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.uniform_(module.weight, -1.0, 1.0)
    
    def forward(self, idx, targets=None):
        #B, T = idx.shape
        tok_emb = self.wte(idx) # (B, T, n_embd)
        logits = self.proj(self.gelu(self.fc(tok_emb)))
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1), ignore_index=-1)
        return logits, loss

In [36]:
# import tiktoken

# enc = tiktoken.get_encoding('cl100k_base')
# text = "Hello scientific fiction!"
# tokens = torch.tensor(enc.encode(text))
# targets = torch.cat((tokens[1:], torch.tensor([-1])), dim=-1)

# print(f"Encoded text: {tokens}")
# print(f"Targets: {targets}")
# print(f"The vocab_size of cl100k_base is {enc.n_vocab}.")

Encoded text: tensor([ 9906, 12624, 17422,     0])
Targets: tensor([12624, 17422,     0,    -1])
The vocab_size of cl100k_base is 100277.


In [65]:
data_dir = 'data'
tokens_filename = 'tokens.txt'
tokens_path = os.path.join(data_dir, tokens_filename)

with open(tokens_path, 'r') as f:
    tokens = f.read()
    tokens = list(map(int, tokens.split()))
print(f'There are len(tokens) tokens in scifi dataset.')
# print out a few tokens
print(tokens[:100])

There are len(tokens) tokens in scifi dataset.
[791, 4212, 13257, 11, 555, 473, 13, 480, 13, 37958, 510, 9378, 23, 933, 40, 198, 791, 4212, 43359, 7218, 320, 2000, 779, 433, 690, 387, 17125, 311, 6604, 315, 1461, 340, 16514, 1367, 13900, 264, 312, 1321, 635, 5030, 311, 603, 13, 5414, 20366, 6548, 559, 606, 323, 198, 15930, 771, 839, 11, 323, 813, 6118, 28639, 3663, 574, 74820, 323, 11625, 13, 578, 198, 11029, 27724, 76389, 11, 323, 279, 8579, 12164, 685, 315, 279, 3709, 62452, 1189, 198, 14146, 304, 279, 326, 7751, 315, 15310, 10791, 279, 44783, 430, 70939, 323, 198, 36522, 304, 1057, 29247, 13]


In [67]:
tokens = torch.tensor(tokens)
print(tokens[:100])

tensor([  791,  4212, 13257,    11,   555,   473,    13,   480,    13, 37958,
          510,  9378,    23,   933,    40,   198,   791,  4212, 43359,  7218,
          320,  2000,   779,   433,   690,   387, 17125,   311,  6604,   315,
         1461,   340, 16514,  1367, 13900,   264,   312,  1321,   635,  5030,
          311,   603,    13,  5414, 20366,  6548,   559,   606,   323,   198,
        15930,   771,   839,    11,   323,   813,  6118, 28639,  3663,   574,
        74820,   323, 11625,    13,   578,   198, 11029, 27724, 76389,    11,
          323,   279,  8579, 12164,   685,   315,   279,  3709, 62452,  1189,
          198, 14146,   304,   279,   326,  7751,   315, 15310, 10791,   279,
        44783,   430, 70939,   323,   198, 36522,   304,  1057, 29247,    13])


  tokens = torch.tensor(tokens)


In [72]:
model = MLP(SciFiConfig)
B = 4
T = 25
idx = tokens[:100].view(B, T)
targets = tokens[1:101].view(B, T)
logits, loss = model(idx, targets)
print(f'The shape of pred: {logits.shape}.')
print(f'loss={loss:.4f}')

The shape of pred: torch.Size([4, 25, 100277]).
loss=11.5135
