In [1]:
import torch
from tokenizer import create_tokenizer
from model import GPTTransformer
from utils import Config
import torch.nn.functional as F

# Load Model

In [3]:
# Build the tokenizer from the vocabulary file and the .bpe file on disk.
tokenizer = create_tokenizer(
    "./data/dostoyevsky.vocab",
    "./data/dostoyevsky.bpe",
)

# Hyperparameter bundle passed to the model when restoring the checkpoint below.
config = Config(
    # training-related settings
    epoch=1,
    learning_rate=1e-3,
    batch_size=64,
    weight_decay=1e-5,
    # model-related settings
    seq_len=128,
    d_embed=192,
    n_layers=6,
    n_heads=6,
    dropout=0.1,
    # vocabulary size is taken from the tokenizer's encoder table
    vocab_size=len(tokenizer.encoder),
)

# Restore the model from the saved checkpoint file, using the config above.
model = GPTTransformer.load_from_checkpoint(
    "./dostoyevsky/3q0vztkc/checkpoints/epoch=4-step=15220.ckpt",
    config=config,
)

number of parameters: 5.52M


In [4]:
# Ask the model's own sampling helper for a continuation of the prompt.
model.generate_samples("Hello")

Generating dostoyevsky samples...


'Hellosblaecind ssthe ppis\n--------------------------------------------------------------------------------\n'

In [5]:
@torch.no_grad()
def generate(model, idx, config, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
    """
    Autoregressively extend a batch of token-index sequences.

    Starting from the conditioning sequence ``idx``, the model is run
    ``max_new_tokens`` times. At every step one next token per row is chosen
    from the logits at the final position and appended to the running
    sequence, which is then fed back into the model.

    Args:
        model: callable mapping a (b, t) index tensor to logits that are
            indexed as ``[:, -1, :]`` (i.e. one logit vector per position).
            ``model.eval()`` is called on it and it is left in eval mode.
        idx: LongTensor of shape (b, t) holding the conditioning indices.
        config: object exposing ``seq_len``, the longest context passed to
            the model in a single forward call.
        max_new_tokens: number of tokens appended to every row.
        temperature: divisor applied to the final-step logits before the
            softmax. Must be non-zero.
        do_sample: if True, draw the next token with ``torch.multinomial``;
            otherwise take the single most likely token.
        top_k: if not None, every logit smaller than the k-th largest one is
            set to -inf before the softmax. Values larger than the number of
            logits are clamped to it.

    Returns:
        LongTensor of shape (b, t + max_new_tokens): the input sequences with
        the generated tokens appended along dim 1.
    """

    model.eval()

    for _ in range(max_new_tokens):
        # Crop the context along the *time* axis (dim 1). Dim 0 is the batch,
        # so testing it would leave a batch of one sequence uncropped forever.
        idx_cond = idx if idx.size(1) <= config.seq_len else idx[:, -config.seq_len:]
        # forward the model to get the logits for every position in the context
        logits = model(idx_cond)
        # keep only the final position and scale by the desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally restrict the candidates to the top k logits
        if top_k is not None:
            # torch.topk raises if k exceeds the dimension size, so clamp it
            k = min(top_k, logits.size(-1))
            kth_values = torch.topk(logits, k).values
            logits[logits < kth_values[:, [-1]]] = -float("Inf")
        # convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # either sample from the distribution or take the most likely element
        if do_sample:
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.topk(probs, k=1, dim=-1).indices
        # append the chosen index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

# Greedy decoding (do_sample is left at its default of False): encode the
# prompt as a batch of one sequence and append 64 generated tokens.
output = generate(
    model,
    torch.tensor([tokenizer.encode("Hello there ")]),
    config,
    max_new_tokens=64,
)
print(output)
# Drop the singleton batch dimension, then turn the ids back into text.
text = tokenizer.decode(torch.squeeze(output).tolist())
print(text)

tensor([[ 72, 101, 108, 108, 111,  32, 116, 104, 101, 114, 101,  32, 116, 104,
         101,  32, 116, 104, 101,  32, 116, 104, 101,  32, 116, 104, 101,  32,
         116, 104, 101,  32, 116, 104, 101,  32, 116, 104, 101,  32, 116, 104,
         101,  32, 116, 104, 101,  32, 116, 104, 101,  32, 116, 104, 101,  32,
         116, 104, 101,  32, 116, 104, 101,  32, 116, 104, 101,  10, 116, 104,
         101,  32, 116, 104, 101,  32]])
Hello there the the the the the the the the the the the the the the
the the 
