In [1]:
from model import GPT, GPTConfig
import torch
import tiktoken
import time

device = "cuda"
model_name = "gpt2"

# Tokenizer
tokenizer = tiktoken.get_encoding(model_name)

def encode(string):
    return tokenizer.encode(string, allowed_special={"<|endoftext|>"})

def decode(string):
    return tokenizer.decode(string)

def encode_tensor(string, device):
    return torch.tensor(encode(string), dtype=torch.long, device=device)

prompt = "Hello, my name is Martin"
prompt_tensor = torch.tensor(encode(prompt), dtype=torch.long, device=device).unsqueeze(0)


In [5]:
# nanogpt model (master branch)
#model = GPT.from_pretrained(model_name)

In [6]:
# baseline model
model = GPT(GPTConfig(vocab_size=50257))
model.load_state_dict(torch.load('models/gpt2.pth'))
model.to(device, dtype=torch.bfloat16)
model.eval()

number of parameters: 123.65M


KeyboardInterrupt: 

In [None]:
#run generation

start_time = time.time()
completion = model.generate(prompt_tensor, 10, greedy=True)
end_time = time.time()

completion_tokens = completion[0].cpu().tolist()
num_tokens = len(completion_tokens)

time_taken = end_time - start_time
tokens_per_second = (num_tokens-len(prompt_tensor.squeeze(0))) / time_taken

print(decode(completion_tokens))
print(f"Number of tokens: {num_tokens}")
print(f"Time taken: {time_taken:.2f} seconds")
print(f"Tokens per second: {int(tokens_per_second)}")
print(completion)
#float32
#Hello, my name is Martin. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times.
#bfloat16
#Hello, my name is Martin. I am a member of the United States Army, and I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am


Hello, my name is Martin. I am a member of the United States Army
Number of tokens: 16
Time taken: 0.06 seconds
Tokens per second: 178
tensor([[15496,    11,   616,  1438,   318,  5780,    13,   314,   716,   257,
          2888,   286,   262,  1578,  1829,  5407]], device='cuda:0')


In [None]:
#baseline is 200 tokens per second (210 int8), 180 with sampling (160 int8)

#bfloat16
logits=model(prompt_tensor)[0]
print(logits.shape)
print(logits[:, -1, :].shape)
torch.max(logits[:, -1, :], dim=-1, keepdim=True)

torch.Size([1, 1, 50257])
torch.Size([1, 50257])


torch.return_types.max(
values=tensor([[-56.7500]], device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<MaxBackward0>),
indices=tensor([[13]], device='cuda:0'))

In [14]:
from generate import load_model

prompt_tensor = encode_tensor(prompt, device)

precision = torch.bfloat16
device="cuda"
model_path = "models/gpt2.pth"
model = load_model(model_path, device, precision)


number of parameters: 123.65M


In [3]:
from generate import generate
generate(model, torch.cat([prompt_tensor, torch.tensor([13], device=device)]), 10, draft_model=None)


(tensor([15496,    11,   616,  1438,   318,  5780,    13,   314,   716,   257,
          2888,   286,   262,  1578,  1829,  5407,    11], device='cuda:0'),
 {'accept_counts': [0, 0, 0, 0, 0, 0, 0, 0, 0]})

In [3]:
from generate import run_generation
#torch._dynamo.config.guard_nn_modules=True
#https://github.com/huggingface/transformers/issues/25420
run_generation(torch.cat([prompt_tensor, torch.tensor([13], device=device)]), 
               model, 
               100, 
               compile=True, 
               compile_prefill=False)



Number of tokens: 100
Time taken: 14.33 seconds
Tokens per second: 6


tensor([15496,    11,   616,  1438,   318,  5780,    13,   314,  1101,   716,
          198,     9,     9,     9,    10,    59,    11,    14,    11,    11,
           11,    11,    11,    11,    11,    11,   357,    25,    13,    11,
           26,    27,    27,    28,    28,    28,    13,    13,    21,    25,
           25,    28,    13,    41,    41,    13,    48,    41,    48,    41,
           41,    48,    41,    41,    48,    41,    41,    41,    41,    52,
           31,    31,    40,    55,    62,    62,    62,    28,    36,    28,
           38,    39,    25,    13,    74,    62,    44,    62,    46,    62,
           80,    81,    85,    62,    52,    86,    86,    55,    86,    89,
           58,    55,    62,    62,    53,    53,    58,    58,    30,    30,
           28,    28,    25,    25,    25,    13,    27], device='cuda:0')

In [43]:
from generate import run_generation

txt="Wikis are enabled by wiki software, otherwise known as wiki engines. A wiki engine, being a form of a content management system, differs from other web-based systems such as blog software or static site generators, in that the content is created without any defined owner or leader, and wikis have little inherent structure, allowing structure to emerge according to the needs of the users."
prompt_tensor = encode_tensor(txt, device)

x=run_generation(prompt_tensor, 
               model, 
               100, 
               compile=True, 
               compile_prefill=True)
print(decode(x.cpu().tolist()))

Number of tokens: 100
Time taken: 0.07 seconds
Tokens per second: 1338
Wikis are enabled by wiki software, otherwise known as wiki engines. A wiki engine, being a form of a content management system, differs from other web-based systems such as blog software or static site generators, in that the content is created without any defined owner or leader, and wikis have little inherent structure, allowing structure to emerge according to the needs of the users. This is evidenced by the numerous examples of websites that use Wikipedia as their website, and many other examples of articles, articles, or videos that are written by people who do not write for Wikipedia.

The importance of a wiki in creating a viable community

Wikipedia's existence is very much a part of what makes it a real asset to a community. Because of this, the Wikimedia Foundation has long been known for its commitment to building the best website for the history of the web, with a


In [None]:
x = run_generation(prompt_tensor, model, compile=True)
print(decode(x.cpu().tolist()))



Number of tokens: 256
Time taken: 14.74 seconds
Tokens per second: 17
Hello, my name is Martin.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
from model import GPT
import tiktoken
import torch
import time

device = "mps"
model_name = "gpt2"

In [None]:
# Model
model = GPT.from_pretrained(model_name)
model.eval()
model.to(device)
model = torch.compile(model)

# Tokenizer
tokenizer = tiktoken.get_encoding(model_name)

def encode(string):
    return tokenizer.encode(string, allowed_special={"<|endoftext|>"})

def decode(string):
    return tokenizer.decode(string)

loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
number of parameters: 123.65M




In [None]:
prompt = '''Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus eget purus metus. Nullam efficitur, sem ac facilisis tempor, urna quam pharetra tellus, ut mollis tellus quam vitae libero. Sed faucibus posuere varius. Morbi sit amet congue ex. Fusce vitae tellus sem. Donec dignissim hendrerit laoreet. Vestibulum mattis fringilla bibendum. Aenean pharetra felis libero, ac hendrerit libero porttitor sed. Suspendisse fermentum, ante sit amet faucibus tincidunt, libero quam mollis ex, non accumsan massa eros ac metus. Praesent leo risus, finibus at pretium et, venenatis ac ex. Sed posuere quam vitae turpis volutpat, vel volutpat augue dignissim. Donec hendrerit pretium mattis. Morbi tempus tellus at dolor ornare lobortis. Aenean tempor ligula cursus magna molestie venenatis. Ut feugiat semper lorem sed fringilla. Etiam pharetra, nisl a pellentesque iaculis, felis mi ultricies mi, quis feugiat elit erat at magna.

Summary:
'''

prompt_tensor = torch.tensor(encode(prompt), dtype=torch.long, device=device).unsqueeze(0)

#repeat sampling 6 times and save timings for 5 runs (in case it compiles on the first run)
times = []
texts = []
for _ in range(6):
    start = time.time()
    completion = model.generate_greedy(prompt_tensor, max_new_tokens=50)
    end = time.time()
    times.append(end-start)
    texts.append(decode(completion[0].cpu().tolist()))

assert all(text == texts[0] for text in texts[1:])

time_mean = torch.tensor(times[1:]).mean()
time_sd = torch.tensor(times[1:]).std()

print(f"Mean time (+/- SD): {time_mean:.2f} (+/- {time_sd:.2f})")

Mean time (+/- SD): 1.51 (+/- 0.00)


In [None]:
def find_candidate_pred_tokens(input_ids: torch.Tensor, max_ngram_size: int = 3, num_pred_tokens: int = 10) -> torch.Tensor:
    """
    Finds candidate prediction tokens based on the input_ids.

    Args:
        input_ids (torch.Tensor): The input tensor of shape (batch_size, seq_len) containing token IDs.
        max_ngram_size (int, optional): The maximum size of the n-gram to search for. Defaults to 3.
        num_pred_tokens (int, optional): The number of prediction tokens to return. Defaults to 10.

    Returns:
        torch.Tensor: The tensor containing the candidate prediction tokens.
    """
    input_length = input_ids.size(1)

    for ngram_size in range(max_ngram_size, 0, -1):
        # Extract the last n tokens as our search ngram
        ngram = input_ids[0, -ngram_size:].tolist()

        # Create sliding windows of size ngram_size
        windows = input_ids.unfold(dimension=1, size=ngram_size, step=1)

        # Convert ngram to a tensor for comparison
        ngram_tensor = torch.tensor(ngram, device=input_ids.device).unsqueeze(0)

        # Find where the windows match the ngram
        matches = (windows == ngram_tensor).all(dim=2)

        # Get the indices of matches
        match_indices = matches.nonzero(as_tuple=True)[1]

        # Iterate through match indices to find a valid continuation
        for idx in match_indices:
            start_idx = idx + ngram_size
            end_idx = start_idx + num_pred_tokens
            # Ensure we don't go beyond the length of input_ids and avoid self-match
            if end_idx <= input_length and start_idx < input_length - ngram_size:
                return input_ids[0, start_idx:end_idx]

    # If no match is found, return an empty tensor
    return torch.tensor([], dtype=torch.long, device=input_ids.device)



tensor([], dtype=torch.int64)

In [None]:
find_candidate_pred_tokens(prompt_tensor, max_ngram_size=3, num_pred_tokens=10)

tensor([], device='mps:0', dtype=torch.int64)

In [None]:
def test_find_candidate_pred_tokens():

    # Test Case 1: Matching ngram is found
    with_match = torch.tensor([1, 2, 3, 1, 2, 3, 1, 2, 3]).unsqueeze(0)
    result_with_match = find_candidate_pred_tokens(with_match, max_ngram_size=3, num_pred_tokens=3)
    assert torch.equal(result_with_match, torch.tensor([1, 2, 3]))

    # Test Case 2: Matching ngram is not found
    without_match = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
    result_without_match = find_candidate_pred_tokens(without_match, max_ngram_size=3, num_pred_tokens=3)
    # For an empty result, ensure the result is an empty tensor of the expected shape or type
    assert torch.equal(result_without_match, torch.tensor([]))

test_find_candidate_pred_tokens()