In [1]:
import torch
import numpy as np
import time
import torch.nn as nn
import os
import math
import torch.nn.functional as F 

In [2]:
class SciFiConfig:
    """Hyperparameters read by the model constructor.

    Both values are plain class attributes, so they can be read from the
    class object directly without creating an instance.
    """

    # Size of the token vocabulary (cl100k-base).
    vocab_size: int = 100277
    # Width of each token embedding vector (GPT-2).
    n_embd: int = 768
    
class MLP(nn.Module):
    """Token embedding followed by a two-layer feed-forward network.

    The network maps each token id to a vector of `config.vocab_size` logits.
    Every layer acts on the last dimension only, so each position is
    processed independently of the other positions in the sequence.
    """

    def __init__(self, config) -> None:
        """Build the layers from `config.vocab_size` and `config.n_embd`."""
        super().__init__()
        self.config = config

        hidden_dim = 4 * config.n_embd
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.fc = nn.Linear(config.n_embd, hidden_dim)
        self.gelu = nn.GELU()
        self.proj = nn.Linear(hidden_dim, config.vocab_size)

        # Re-initialise every submodule with the scheme in _init_weights.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one submodule; called once per module by `apply`.

        Embedding weights are drawn uniformly from [-1, 1]; Linear weights use
        Xavier-uniform. Biases and other module types are left untouched.
        """
        if isinstance(module, nn.Embedding):
            torch.nn.init.uniform_(module.weight, -1.0, 1.0)
        elif isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)

    def forward(self, idx, targets=None):
        """Compute logits for `idx` and, when `targets` is given, the loss.

        Args:
            idx: integer tensor of token ids -- assumed (B, T); TODO confirm.
            targets: optional integer tensor with as many elements as `idx`;
                positions equal to -1 are excluded from the loss.

        Returns:
            (logits, loss): logits has the shape of `idx` plus a trailing
            vocabulary dimension; loss is the cross-entropy, or None when
            `targets` is None.
        """
        hidden = self.fc(self.wte(idx))
        logits = self.proj(self.gelu(hidden))
        if targets is None:
            return logits, None

        # Collapse all leading dimensions so each position is one sample.
        flat_logits = logits.view(-1, logits.shape[-1])
        flat_targets = targets.view(-1)
        loss = F.cross_entropy(flat_logits, flat_targets, ignore_index=-1)
        return logits, loss

    def optimizer(self, learning_rate):
        """Return an AdamW optimizer over all parameters of this module."""
        return torch.optim.AdamW(
            self.parameters(),
            lr=learning_rate,
            betas=(0.9, 0.95),
            eps=1e-8,
        )

# ---------------------------------------------------------------------------------
import tiktoken
import numpy as np

def load_tokens(filename):
    """Read integer token ids from a text file into a 1-D `torch.long` tensor.

    Args:
        filename: path of a text file readable by `numpy.loadtxt`
            (whitespace-separated integers).

    Returns:
        A 1-D tensor of dtype `torch.long` holding the tokens in file order.
    """
    # ndmin=1 stops a single-token file from collapsing to a 0-d array, and
    # reshape(-1) flattens a rectangular multi-column file into one stream,
    # so the result can always be sliced along dimension 0.
    token_ids = np.loadtxt(filename, dtype=np.int32, ndmin=1).reshape(-1)
    return torch.tensor(token_ids, dtype=torch.long)

class DataLoaderSciFi:
    """Sequential batch reader over the token stream in `data/tokens.txt`.

    Each call to `next_batch` returns `B * T` consecutive tokens as inputs and
    the same window shifted by one token as targets. When the remaining tokens
    cannot fill another batch, the reader rewinds to the start of the stream.
    """

    def __init__(self, B, T, split=None):
        """Store the batch geometry and load the tokens.

        Args:
            B: number of sequences per batch.
            T: number of tokens per sequence.
            split: accepted for interface compatibility; currently unused.
        """
        self.B = B
        self.T = T

        # get filename of dataset
        self.data_dir = 'data'
        self.tokens_filename = 'tokens.txt'
        self.tokens_path = os.path.join(self.data_dir, self.tokens_filename)

        self.reset()

    def reset(self):
        """Reload the tokens from disk and rewind to the start of the stream.

        Raises:
            ValueError: if the file holds fewer than `B * T + 1` tokens, i.e.
                not even one batch of inputs plus shifted targets.
        """
        self.tokens = load_tokens(self.tokens_path)
        needed = self.B * self.T + 1
        if self.tokens.numel() < needed:
            raise ValueError(
                f"{self.tokens_path} holds {self.tokens.numel()} tokens, "
                f"but one batch needs at least {needed}"
            )
        self.cur_pos = 0

    def next_batch(self):
        """Return the next `(x, y)` pair, each of shape `(B, T)`.

        `y` is `x` shifted one token to the right within the stream.
        """
        B, T = self.B, self.T
        span = B * T
        # One extra token so the targets can be shifted by one position.
        buf = self.tokens[self.cur_pos : self.cur_pos + span + 1]
        x = buf[:span].view(B, T)
        y = buf[1:].view(B, T)
        self.cur_pos += span
        # Rewind when the next window would run past the end of the data;
        # otherwise the following call would fail in .view() on a short buffer.
        if self.cur_pos + span + 1 > self.tokens.numel():
            self.cur_pos = 0
        return x, y

In [3]:
# import tiktoken

# enc = tiktoken.get_encoding('cl100k_base')
# text = "Hello scientific fiction!"
# tokens = torch.tensor(enc.encode(text))
# targets = torch.cat((tokens[1:], torch.tensor([-1])), dim=-1)

# print(f"Encoded text: {tokens}")
# print(f"Targets: {targets}")
# print(f"The vocab_size of cl100k_base is {enc.n_vocab}.")

In [5]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"using device: {device}")

# Device family without any index suffix (e.g. "cuda:0" -> "cuda").
device_type = "cuda" if device.startswith("cuda") else "cpu"

# Batch geometry: B sequences of T tokens per optimisation step.
B, T = 64, 64

# Learning-rate schedule settings.
max_steps = 6847 # 28046749 tokens / (B * T)
warmup_steps = 715
max_lr = 3e-4
min_lr = max_lr * 0.1

def get_lr(step):
    """Return the learning rate to use at optimisation step `step`.

    The schedule has three phases, driven by the module-level `warmup_steps`,
    `max_steps`, `max_lr` and `min_lr`:
      * step < warmup_steps: linear ramp that reaches `max_lr` on the last
        warmup step;
      * warmup_steps <= step <= max_steps: cosine decay from `max_lr` down to
        `min_lr`;
      * step > max_steps: constant `min_lr`.
    """
    in_warmup = step < warmup_steps
    if in_warmup:
        return max_lr * (step + 1) / warmup_steps

    past_schedule = step > max_steps
    if past_schedule:
        return min_lr

    # Fraction of the decay phase already completed, in [0, 1].
    decay_span = max_steps - warmup_steps
    progress = (step - warmup_steps) / decay_span
    assert 0 <= progress <= 1
    # Cosine weight goes from 1 at the start of decay to 0 at the end.
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + cosine_weight * (max_lr - min_lr)

# Seed before the model is built: weight initialisation is the only random
# step in this cell, so seeding afterwards would not make runs reproducible.
torch.manual_seed(2024)

loader = DataLoaderSciFi(B=B, T=T)
model = MLP(SciFiConfig)
model.to(device)
model = torch.compile(model)
# The learning rate is overwritten from get_lr() before every optimizer step,
# so the value passed here is only a placeholder consistent with the schedule.
optimizer = model.optimizer(learning_rate=max_lr)

# Number of optimisation steps run by this cell (shorter than max_steps).
num_train_steps = 1000

for step in range(num_train_steps):
    t0 = time.time()
    x, y = loader.next_batch()
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()
    with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
        logits, loss = model(x, y)
    loss.backward()

    # Apply the scheduled learning rate to every parameter group.
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()

    # CUDA kernels run asynchronously; wait for them so that dt measures the
    # actual step time rather than just the time to enqueue the work.
    if device_type == "cuda":
        torch.cuda.synchronize()
    t1 = time.time()
    dt = t1 - t0

    if step % 100 == 0:
        print(f'step {step} duration: {dt*1000:.2f}ms loss={loss.item():.4f}')

using device: cuda
step 0 duration: 542.65ms loss=11.5170
step 100 duration: 104.33ms loss=10.8140
step 200 duration: 104.66ms loss=8.4314
step 300 duration: 105.02ms loss=6.1556
step 400 duration: 104.98ms loss=8.1009
step 500 duration: 104.59ms loss=6.6083
step 600 duration: 104.75ms loss=5.9500
step 700 duration: 104.50ms loss=6.4254
step 800 duration: 104.55ms loss=5.7874
step 900 duration: 104.67ms loss=5.8936


#### duration for one step:
B = 64 \
T = 64 \
max_steps = 6847 # 28046749 tokens / (B * T) \
max_lr = 3e-4 \
min_lr = max_lr * 0.1 \
warmup_steps = 715 \

initially: 190 ms \
with autocast to torch.bfloat16: 110 ms \
with torch.compile: 104 ms \

The lowest loss that can be achieved is ~5.8. 

In [6]:
# generate a sample after 1000 iterations
import tiktoken

max_length = 128   # total tokens per sample, prompt included
num_samples = 4    # number of continuations sampled in parallel
top_k = 50         # sample only among the k most probable next tokens
enc = tiktoken.get_encoding('cl100k_base')

tokens = enc.encode("In 2024, a boy was sitting in front of a park.")
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_samples, 1) # (num_samples, n_tokens)
gen_idx = tokens.to(device)

# Dedicated generator so sampling does not depend on the global RNG state.
sample_rng = torch.Generator(device=device)
sample_rng.manual_seed(2028)

# Inference only: no_grad avoids building an autograd graph on every forward
# pass, which would otherwise hold activations in memory for no purpose.
with torch.no_grad():
    while gen_idx.size(1) < max_length:
        logits, _ = model(gen_idx) # (num_samples, cur_len, vocab_size)
        logits = logits[:, -1, :]  # keep the last position: (num_samples, vocab_size)
        probs = F.softmax(logits, dim=-1)
        top_probs, top_indices = torch.topk(probs, top_k, dim=-1)
        # multinomial accepts unnormalised weights, so top_probs need not sum to 1.
        ix = torch.multinomial(top_probs, 1, generator=sample_rng)
        # Map the sampled position within the top-k back to a vocabulary id.
        out_ix = torch.gather(top_indices, -1, ix)
        gen_idx = torch.cat((gen_idx, out_ix), dim=1)

for i in range(num_samples):
    tokens = gen_idx[i, :].tolist()
    decoded = enc.decode(tokens)
    print(f"sample {i}: {decoded}")



sample 0: In 2024, a boy was sitting in front of a park. I should
stared, yes for one, I don't you can't know and looked about me, or not
the outer his
tremendous. Then
are a heavy, while that to them, I could not know exactly than I asked.
I'm going to the rest and I
sh, but also at all who had already, and we shall be that our
myself on his head for you to us, we have never know we do with his foot
the sky and to find that I'll to see," said,
sample 1: In 2024, a boy was sitting in front of a park. You'd. One as you know my own eyes
for the same time or I came down. They were the long after a lot of that at my surprise's eyes were
of the same time. I was gone
in which a moment--not a large
till at the ship, however, I've been, we were very
reached. There is only man said, and they had not to the whole
man, I had been so long ago, sir? Could. You know?"
"Come that was
man and
the great
sample 2: In 2024, a boy was sitting in front of a park. The world?'
, however, it on the black. I d