In [1]:
from model import GPT, GPTConfig
import torch
import tiktoken
import time

# Device string used when creating tensors and moving models in this notebook.
device = "cuda"
# Name handed to tiktoken.get_encoding below to pick the tokenizer encoding.
model_name = "gpt2"

# Tokenizer: module-level tiktoken encoding shared by encode() and decode().
tokenizer = tiktoken.get_encoding(model_name)

def encode(string):
    """Tokenize ``string`` with the module-level ``tokenizer``.

    The literal ``<|endoftext|>`` marker is passed to the tokenizer as an
    allowed special token.

    Returns whatever ``tokenizer.encode`` returns for the given text.
    """
    permitted_specials = {"<|endoftext|>"}
    token_ids = tokenizer.encode(string, allowed_special=permitted_specials)
    return token_ids

def decode(string):
    """Convert token ids back to text with the module-level ``tokenizer``.

    Despite its name, ``string`` is the token sequence forwarded unchanged to
    ``tokenizer.decode``; the callers in this notebook pass lists produced by
    ``Tensor.tolist()``.
    """
    text = tokenizer.decode(string)
    return text

def encode_tensor(string, device):
    """Encode ``string`` and wrap the token ids in a ``torch.long`` tensor.

    Args:
        string: Text to tokenize (forwarded to ``encode``).
        device: Device on which the resulting tensor is created.

    Returns:
        A tensor built directly from the token ids, i.e. without an added
        batch dimension.
    """
    token_ids = encode(string)
    return torch.tensor(token_ids, device=device, dtype=torch.long)

In [2]:
# nanogpt model (master branch)
#model = GPT.from_pretrained(model_name)

In [4]:
# baseline model (master branch)
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'GPT' object has no attribute 'generate'", i.e. the GPT class
# imported from model.py at the time of that run had no generate() method.
# Confirm which revision of model.py this cell is meant to run against.
model = GPT(GPTConfig(vocab_size=50257))
# Load the checkpoint onto the CPU first so it does not depend on the device it
# was saved from; the model is moved to `device` right after.
model.load_state_dict(torch.load('models/gpt2.pth', map_location="cpu"))
model.to(device, dtype=torch.bfloat16)
model.eval()

prompt = "Hello, my name is Martin"
# generate() is given a batch of one sequence, hence the added leading dimension.
prompt_tensor = encode_tensor(prompt, device).unsqueeze(0)

#run generation
# CUDA work is queued asynchronously, so synchronize before reading the clock on
# both sides; otherwise the interval may not cover the actual GPU computation.
if device.startswith("cuda"):
    torch.cuda.synchronize()
start_time = time.perf_counter()
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    completion = model.generate(prompt_tensor, 10, greedy=True)
if device.startswith("cuda"):
    torch.cuda.synchronize()
end_time = time.perf_counter()

completion_tokens = completion[0].cpu().tolist()
num_tokens = len(completion_tokens)

time_taken = end_time - start_time
# Throughput counts only the newly generated tokens (total minus prompt length).
tokens_per_second = (num_tokens - prompt_tensor.size(1)) / time_taken

print(decode(completion_tokens))
print(f"Number of tokens: {num_tokens}")
print(f"Time taken: {time_taken:.2f} seconds")
print(f"Tokens per second: {int(tokens_per_second)}")
print(completion)

#float32
#Hello, my name is Martin. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times. I'm a writer and a writer's assistant at the New York Times.
#bfloat16
#Hello, my name is Martin. I am a member of the United States Army, and I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am a member of the United States Army Reserve. I am

#baseline is 200 tokens per second (210 int8), 180 with sampling (160 int8)


AttributeError: 'GPT' object has no attribute 'generate'

In [2]:
# Load the GPT-2 checkpoint through the project's generate.load_model helper.
from generate import load_model

# Precision and device handed to load_model; `device` repeats the value set in
# the first cell so this cell can be run on its own after the imports.
precision = torch.bfloat16
device="cuda"
model_path = "models/gpt2.pth"
model = load_model(model_path, device, precision, strict=True)

# Alternative int8 checkpoint, currently disabled (note strict=False there):
#model_path_8int = "models/gpt2-int8.pth"
#model = load_model(model_path_8int, device, precision, strict=False)

# Prompt shared by the generation cells below. encode_tensor builds the tensor
# straight from the token ids, so prompt_tensor has no batch dimension here.
prompt = "Wikis are enabled by wiki software, otherwise known as wiki engines. A wiki engine, being a form of a content management system, differs from other web-based systems such as blog software or static site generators, in that the content is created without any defined owner or leader, and wikis have little inherent structure, allowing structure to emerge according to the needs of the users."
prompt_tensor = encode_tensor(prompt, device)


number of parameters: 123.65M


In [3]:
from generate import run_generation
# Reference link kept from the original author:
#https://github.com/huggingface/transformers/issues/25420

# Eager run (compile=False, compile_prefill=False) with prompt lookup enabled.
# The third positional argument is presumably the number of tokens to generate
# (the recorded output reports "Number of tokens: 250") - confirm against
# generate.run_generation.
# The call is the cell's last expression, so its return value is echoed as the
# cell output. The recorded run of this configuration took 864.56 seconds.
run_generation(prompt_tensor,
               model, 
               250, 
               prompt_lookup=True,
               compile=False, 
               compile_prefill=False,
               temperature=0.7,
               top_k=100)

Number of tokens: 250
Time taken: 864.56 seconds
Tokens per second: 0


(tensor([33010,   271,   389,  9343,   416, 22719,  3788,    11,  4306,  1900,
           355, 22719, 11874,    13,   317, 22719,  3113,    11,   852,   257,
          1296,   286,   257,  2695,  4542,  1080,    11, 24242,   422,   584,
          3992,    12,  3106,  3341,   884,   355,  4130,  3788,   393,  9037,
          2524, 27298,    11,   287,   326,   262,  2695,   318,  2727,  1231,
           597,  5447,  4870,   393,  3554,    11,   290, 47145,   271,   423,
          1310, 11519,  4645,    11,  5086,  4645,   284, 14740,  1864,   284,
           262,  2476,   286,   262,  2985,    13,   198,   198,  6425,  1165,
           326,   612,   389,   867,  1180,  5107,   286, 47145,   271,   290,
           484,   477, 13238,  4622,   287,  2846,   286,  2846,   286,   703,
           511,  1321,   318,  5257,    11,   355,   880,   355,   287,   262,
          2695,   345,   460,  4370,    13,  1881,  4465,  7989,   318,   284,
         32781,   530,   355,   257, 22719,  5464,  

In [13]:
# Compiled run: same prompt and token budget as the eager cell, but with
# compile=True / compile_prefill=True and top_k=50. run_generation's result is
# unpacked into the generated ids and a metrics object.
output, metrics = run_generation(
    prompt_tensor, model, 250,
    prompt_lookup=True, compile=True, compile_prefill=True,
    temperature=0.7, top_k=50,
)

# Length of the prompt along its first dimension; used to drop the prompt from
# the decoded text.
prompt_len = prompt_tensor.size(0)

# Print the decoded continuation followed by the metrics, one per line.
print(decode(output.cpu().tolist()[prompt_len:]), metrics, sep="\n")

Number of tokens: 250
Time taken: 0.66 seconds
Tokens per second: 378


All content is shared, but not necessarily indexed, and Wikis are expected to provide information as to the content's status and status as it is published, but not necessarily when it is published.

Wikis are self-contained and can be accessed, edited, or viewed from any computer.

Wikis are the most popular in the world, but are also fairly unknown in the United States, Europe, and Asia.

Wikis are based on the Wikia language, which is the official document of all Wikis, and are used to generate, organize, and maintain the Wikia wiki and wiki files, and to support wiki maintenance and the maintenance of the wiki.

The Wikipedia wiki is hosted on the wiki server site. The wiki is not hosted on the internet, and the server is not accessed from anywhere.

Wikis are not limited to websites, but include the following:

The wiki server

the Wikipedia web site

the Wikipedia Wikian

The wiki is hosted on the wiki server 

In [None]:
def find_candidate_pred_tokens(input_ids: torch.Tensor, max_ngram_size: int = 3, num_pred_tokens: int = 10) -> torch.Tensor:
    """
    Finds candidate prediction tokens by looking up the trailing n-gram earlier in the sequence.

    The last ``ngram_size`` tokens are searched for in the rest of the sequence,
    trying the largest n-gram first and falling back to shorter ones. For the
    first earlier occurrence that is followed by a full ``num_pred_tokens``
    continuation, that continuation is returned.

    Args:
        input_ids (torch.Tensor): Token IDs of shape (batch_size, seq_len) or an
            un-batched 1-D tensor of shape (seq_len,). Only the first sequence
            of a batch is searched and sliced.
        max_ngram_size (int, optional): The maximum size of the n-gram to search for. Defaults to 3.
            Sizes larger than the sequence length are skipped.
        num_pred_tokens (int, optional): The number of prediction tokens to return. Defaults to 10.

    Returns:
        torch.Tensor: A 1-D tensor with exactly ``num_pred_tokens`` candidate
        tokens (a view into ``input_ids``), or an empty ``torch.long`` tensor on
        ``input_ids.device`` when no usable match exists.
    """
    # Work on a single 1-D sequence: accept un-batched input, and for batched
    # input restrict both the search and the returned slice to row 0 so that
    # matches found in other rows can never select tokens from row 0.
    sequence = input_ids if input_ids.dim() == 1 else input_ids[0]
    input_length = sequence.size(0)

    # An n-gram cannot be longer than the sequence itself (unfold would raise).
    largest_ngram = min(max_ngram_size, input_length)

    for ngram_size in range(largest_ngram, 0, -1):
        # The last n tokens are the search n-gram
        ngram = sequence[-ngram_size:]

        # Sliding windows of size ngram_size: shape (num_windows, ngram_size)
        windows = sequence.unfold(dimension=0, size=ngram_size, step=1)

        # A window matches when every position equals the n-gram
        matches = (windows == ngram).all(dim=1)

        # Start positions of the matches, pulled to Python ints once so the
        # loop below does not index with tensors on every iteration
        match_positions = matches.nonzero(as_tuple=True)[0].tolist()

        # Walk the matches in order and return the first valid continuation
        for idx in match_positions:
            start_idx = idx + ngram_size
            end_idx = start_idx + num_pred_tokens
            # Require a full-length continuation inside the sequence and skip
            # matches whose continuation would start inside the trailing
            # n-gram (this also rules out the n-gram matching itself)
            if end_idx <= input_length and start_idx < input_length - ngram_size:
                return sequence[start_idx:end_idx]

    # If no match is found, return an empty tensor
    return torch.tensor([], dtype=torch.long, device=input_ids.device)



tensor([], dtype=torch.int64)

In [None]:
# find_candidate_pred_tokens documents its input as (batch_size, seq_len).
# reshape(1, -1) yields a single-row batch whether prompt_tensor was built with
# a batch dimension (unsqueeze(0)) or without one (encode_tensor).
find_candidate_pred_tokens(prompt_tensor.reshape(1, -1), max_ngram_size=3, num_pred_tokens=10)

tensor([], device='mps:0', dtype=torch.int64)

In [None]:
def test_find_candidate_pred_tokens():
    """Sanity checks for find_candidate_pred_tokens: one hit and one miss."""

    # Test Case 1: Matching ngram is found
    with_match = torch.tensor([1, 2, 3, 1, 2, 3, 1, 2, 3]).unsqueeze(0)
    result_with_match = find_candidate_pred_tokens(with_match, max_ngram_size=3, num_pred_tokens=3)
    assert torch.equal(result_with_match, torch.tensor([1, 2, 3]))

    # Test Case 2: Matching ngram is not found
    without_match = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
    result_without_match = find_candidate_pred_tokens(without_match, max_ngram_size=3, num_pred_tokens=3)
    # The miss result is an empty int64 tensor. Check emptiness and dtype
    # explicitly instead of torch.equal against torch.tensor([]), which is
    # float32 and therefore makes the comparison depend on how torch.equal
    # treats mismatched dtypes.
    assert result_without_match.numel() == 0
    assert result_without_match.dtype == torch.long

test_find_candidate_pred_tokens()