In [None]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load microsoft/phi-2 on the GPU, generate a completion for a code prompt,
# and report how many *new* tokens per second were produced.

# Every tensor created from here on (model weights, tokenized inputs) is
# placed on the CUDA device by default.
torch.set_default_device("cuda")

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

inputs = tokenizer('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt", return_attention_mask=False)

# Number of prompt tokens; needed below because the sequence returned by
# generate() starts with the prompt and only then the continuation.
prompt_length = inputs["input_ids"].shape[1]

# CUDA kernels run asynchronously, so wait for pending work before reading
# the clock on either side of the call. perf_counter() is monotonic and
# higher resolution than time.time(), which makes it the right clock for
# measuring an interval.
torch.cuda.synchronize()
start_time = time.perf_counter()
outputs = model.generate(**inputs, max_length=150)
torch.cuda.synchronize()
end_time = time.perf_counter()

# Count only the tokens the model actually generated. Using outputs.shape[1]
# directly would also count the prompt tokens and overstate the speed.
num_tokens_generated = outputs.shape[1] - prompt_length
generation_time = end_time - start_time
token_generation_speed = num_tokens_generated / generation_time
text = tokenizer.batch_decode(outputs)[0]

print(text)
print(f"Token generation speed: {token_generation_speed:.2f} tokens per second.")