# **Generate** `top_p`

In [None]:
def generate_top_p(model, tokenizer, prompt, n_rep=3, max_seq_len=128, T=0.9, top_p=0.9, device='cuda', seed=42):
    """Generate `n_rep` continuations of `prompt` using temperature + nucleus (top-p) sampling.

    At each step the next-token distribution is sorted by probability and
    truncated to the smallest prefix whose cumulative mass reaches `top_p`
    (the token that crosses the threshold is included; the most likely token
    is always kept). One token per sequence is then sampled from that
    truncated distribution and appended.

    Args:
        model: Callable taking a `[B, T]` tensor of token ids and returning
            logits that are indexed here as `[B, T, vocab_size]`. It is
            switched to eval mode and left in that mode.
        tokenizer: Object providing `encode(prompt).ids` and
            `decode_batch(list_of_id_lists)` (presumably a Hugging Face
            `tokenizers.Tokenizer` -- confirm against the caller).
        prompt: Text used as the common prefix of every generated sequence.
        n_rep: Number of sequences generated in parallel from the same prompt.
        max_seq_len: Total sequence length (prompt tokens included) at which
            generation stops. If the prompt is already this long, nothing is
            generated.
        T: Softmax temperature; must be positive.
        top_p: Cumulative probability threshold of the nucleus. Values >= 1
            keep the whole vocabulary; values <= 0 reduce to greedy decoding.
        device: Device on which the token tensor and the sampling RNG live.
        seed: Seed for the sampling RNG.

    Returns:
        The result of `tokenizer.decode_batch` on the full sequences
        (prompt + generated tokens), one entry per sequence.

    Raises:
        ValueError: If `T` is not positive.
    """
    if T <= 0:
        raise ValueError(f"T must be positive, got {T}")

    # Tokenize once and replicate the prompt so all sequences are generated in one batch.
    # int64 is used so the dtype stays the same when sampled ids (int64) are appended.
    prompt_ids = tokenizer.encode(prompt).ids
    inputs = torch.tensor(prompt_ids, dtype=torch.long, device=device)  # Shape: [T]
    inputs = inputs.unsqueeze(0).repeat(n_rep, 1)  # Shape: [B, T] where B = n_rep

    # Set the model to evaluation mode
    model.eval()

    # Dedicated generator so sampling does not depend on the global RNG state
    sample_rng = torch.Generator(device=device)
    sample_rng.manual_seed(seed)

    # Disable gradient calculation for faster inference
    with torch.no_grad():
        # Continue generating tokens until reaching the maximum sequence length
        while inputs.shape[-1] < max_seq_len:
            # Forward pass: get logits from the model
            logits = model(inputs)  # Shape: [B, T, vocab_size]

            # Temperature-scaled next-token distribution; computed in float32 so the
            # cumulative sum below is not degraded by a low-precision model dtype.
            probs = torch.softmax(logits[:, -1, :].float() / T, dim=-1)  # Shape: [B, vocab_size]

            # Sort tokens from most to least likely
            sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)  # Shape: [B, vocab_size]
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

            # Keep a token when the mass *before* it is still below top_p, so the token
            # that crosses the threshold belongs to the nucleus (smallest set with
            # cumulative probability >= top_p).
            mask = (cumulative_probs - sorted_probs) < top_p
            mask[:, 0] = True  # always keep the most likely token (covers top_p <= 0)

            # Zero out everything outside the nucleus; multinomial renormalizes each row.
            nucleus_probs = sorted_probs * mask

            # Sample one position per row, then map it back to a vocabulary id
            picks = torch.multinomial(nucleus_probs, num_samples=1, generator=sample_rng)  # Shape: [B, 1]
            ids = torch.gather(sorted_indices, -1, picks)  # Shape: [B, 1]

            inputs = torch.cat((inputs, ids), dim=-1)  # Shape: [B, T+1]

    # Decode the generated sequences back into text
    generated_text = tokenizer.decode_batch(inputs.tolist())

    return generated_text

In [None]:
import html  # cell-local: escapes text before it is embedded in the HTML below

# Demo: sample three continuations of a short prompt with nucleus sampling.
prompt = 'in last'
generated_texts = generate_top_p(model, tokenizer, prompt, n_rep=3, max_seq_len=256, T=0.9, top_p=0.9, device='cuda', seed=42)

print('Generate top_p:')
print()
for i, text in enumerate(generated_texts):
    # The slice assumes the decoded text starts with the prompt verbatim
    # (NOTE(review): depends on the tokenizer round-tripping the prompt -- confirm).
    # Prompt and continuation are escaped so characters such as '<' or '&' in the
    # model output are shown literally instead of being parsed as markup.
    prompt_html = html.escape(prompt)
    continuation_html = html.escape(text[len(prompt):])
    display(HTML(f"<span style='color: yellow;'>Generated {i+1}:</span> <span style='color: cyan;'>{prompt_html}</span><span style='color: White;'>{continuation_html}</span>"))
    print('-'*150)

Generate top_p:



------------------------------------------------------------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------------------------------------------------------------


------------------------------------------------------------------------------------------------------------------------------------------------------
