# 🔴 **Import**

In [1]:
import html

import torch
from IPython.display import HTML
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 🔴 **Init**

In [2]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# 🔴 **Model & Tokenizer**

In [3]:
# Load the pretrained "gpt2" tokenizer and language-model head, then move the
# model onto the device selected in the Init cell.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# 🔴 Generate

### 🟡 `top-k` + `multinomial` sampling with `n_rep` parallel samples and `End_Of_token` stopping

In [4]:
def generate(model, tokenizer, prompt, n_rep=5, max_seq_len=128, T=0.9, top_k=10, device='cuda', seed=42):
    """Sample `n_rep` continuations of `prompt` with temperature-scaled top-k sampling.

    The prompt is tokenized once and repeated `n_rep` times so that all samples
    are advanced together as one batch. Generation stops when every sequence has
    produced the end token or when the total length reaches `max_seq_len`.

    Args:
        model: Callable taking a [B, T] tensor of token ids and returning an
            object whose `.logits` attribute is a [B, T, vocab_size] tensor.
            `model.eval()` is called on it (this is a side effect).
        tokenizer: Object providing `encode(str)` and `batch_decode(list)`.
            Its `eos_token_id` is used as the end token when available;
            otherwise the id of the literal '<|endoftext|>' is looked up.
        prompt: Text to continue.
        n_rep: Number of sequences to sample in parallel.
        max_seq_len: Maximum total length (prompt + generated) in tokens.
        T: Softmax temperature; the last-step logits are divided by it.
        top_k: Number of most-probable tokens to sample from. Clamped to the
            vocabulary size so an oversized value cannot make `torch.topk` fail.
        device: Device for the input tensor and the sampling generator. It
            should be the device the model lives on.
        seed: Seed for the sampling random-number generator.

    Returns:
        A list of `n_rep` decoded strings. Each one is the prompt followed by
        the generated tokens, cut just before the first *generated* end token
        (sequences that never produced one are returned in full).
    """
    prompt_ids = tokenizer.encode(prompt)
    prompt_len = len(prompt_ids)

    # torch.long is the index dtype of the ids produced by topk/gather below, so
    # the sequence keeps one dtype instead of relying on promotion inside torch.cat.
    inputs = torch.tensor(prompt_ids, dtype=torch.long, device=device)  # [T]

    # Repeat the prompt so that n_rep sequences are generated in parallel.
    inputs = inputs.unsqueeze(0).repeat(n_rep, 1)  # [B, T] where B = n_rep

    model.eval()

    # Prefer the tokenizer's declared end-of-sequence id; fall back to the
    # GPT-2 literal for tokenizers that do not expose one.
    end_token_id = getattr(tokenizer, 'eos_token_id', None)
    if end_token_id is None:
        end_token_id = tokenizer.encode('<|endoftext|>')[0]
    finished = torch.zeros(n_rep, dtype=torch.bool, device=device)  # [B]

    # Dedicated generator so sampling is reproducible for a given seed.
    sample_rng = torch.Generator(device=device)
    sample_rng.manual_seed(seed)

    with torch.no_grad():
        while inputs.shape[-1] < max_seq_len and not finished.all():
            # Only the logits of the last position predict the next token.
            next_logits = model(inputs).logits[:, -1, :]  # [B, vocab_size]
            probs = torch.softmax(next_logits / T, dim=-1)  # [B, vocab_size]

            # torch.topk raises when k exceeds the size of the dimension.
            k = min(top_k, probs.shape[-1])
            topk_probs, topk_indices = torch.topk(probs, k=k, dim=-1)  # [B, k]

            # Sample a position among the k candidates (multinomial accepts
            # unnormalized weights), then map it back to a vocabulary id.
            choice = torch.multinomial(topk_probs, 1, generator=sample_rng)  # [B, 1]
            ids = torch.gather(topk_indices, -1, choice)  # [B, 1]

            finished |= (ids.squeeze(1) == end_token_id)

            # Finished sequences keep receiving the end token so the batch
            # stays rectangular.
            ids = ids.masked_fill(finished.unsqueeze(1), end_token_id)

            inputs = torch.cat((inputs, ids), dim=-1)  # [B, T+1]

    # Cut each sequence before its first generated end token. The search starts
    # after the prompt so an end token inside the prompt does not truncate it.
    final_outputs = []
    for sequence in inputs.tolist():
        generated = sequence[prompt_len:]
        if end_token_id in generated:
            sequence = sequence[:prompt_len + generated.index(end_token_id)]
        final_outputs.append(sequence)

    # Decode the token ids back into text.
    return tokenizer.batch_decode(final_outputs)

In [6]:
prompt = "In a distant galaxy"

# Pass the `device` chosen in the Init cell rather than hard-coding 'cuda', so
# this cell also runs on CPU-only runtimes.
generated_texts = generate(model, tokenizer, prompt, n_rep=3, max_seq_len=256, T=0.9, top_k=10, device=device, seed=42)

print('Generate top_k End_of_token:')
print()
for i, text in enumerate(generated_texts):
    # Escape both parts: any '<', '>' or '&' in the text would otherwise be
    # interpreted as HTML markup instead of being shown literally.
    shown_prompt = html.escape(prompt)
    continuation = html.escape(text[len(prompt):])
    display(HTML(f"<span style='color: yellow;'>Generated {i+1}:</span> <span style='color: cyan;'>{shown_prompt}</span><span style='color: White;'>{continuation}</span>"))
    print('-'*170)

Generate top_k End_of_token:



--------------------------------------------------------------------------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
