In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


### 1. Auto-regressive generation

Please see more details in [https://huggingface.co/blog/how-to-generate](https://huggingface.co/blog/how-to-generate)

In short, auto-regressive language generation is based on the assumption that the probability distribution of a word sequence can be decomposed into the product of conditional next word distributions:
$$
p\left(w_{1: T} \mid P_0\right) = \prod_{t=1}^T p\left(w_t \mid w_{1: t-1}, P_0\right), \text{ with } w_{1: 0}=\emptyset,
$$
where $P_0$ is the initial context word sequence. The length $T$ of the word sequence is usually determined on-the-fly and corresponds to the timestep $t=T$ at which the EOS token is generated from $p\left(w_t \mid w_{1: t-1}, P_0\right)$. We will give a tour of the currently most prominent decoding methods, mainly Greedy search, Beam search, and Sampling.

In [None]:
import os
import sys

# Point the Hugging Face Hub client at a mirror endpoint. The variable is set from
# Python (exporting via a `!` shell line does not persist for the Python process).
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
print("HF_ENDPOINT set to:", os.environ.get("HF_ENDPOINT"))

# huggingface_hub reads HF_ENDPOINT once, when it is first imported. If it (or a
# library that imports it, such as transformers) was already loaded in this kernel,
# the assignment above comes too late, so say so instead of failing silently.
if "huggingface_hub" in sys.modules:
    print(
        "WARNING: huggingface_hub was imported before HF_ENDPOINT was set, so the mirror "
        "may be ignored. Restart the kernel and run this cell before importing transformers."
    )

HF_ENDPOINT set to: https://hf-mirror.com


In [None]:
# Choose the compute device: Apple Silicon GPU (Metal) is probed first, then an
# NVIDIA GPU, and the CPU is used when neither backend reports itself available.
torch_device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(torch_device)

# Load the GPT-2 tokenizer from the remote hub (mirror) first; if that fails with an
# OSError, fall back to building the tokenizer from local vocab/merges files.
from transformers import AutoTokenizer, GPT2TokenizerFast
try:
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
except OSError as e:
    print("Remote AutoTokenizer load failed (", e.__class__.__name__, ") -> attempting manual local construction...")
    local_base = "code-gpt2/code-gpt2"  # contains encoder.json (vocab) and vocab.bpe (merges)
    vocab_file = f"{local_base}/encoder.json"
    merges_file = f"{local_base}/vocab.bpe"
    try:
        tokenizer = GPT2TokenizerFast(vocab_file=vocab_file, merges_file=merges_file)
    except Exception as inner:
        # Chain explicitly so the traceback reports the local failure as the direct
        # cause rather than as an unrelated error raised while handling another one.
        raise RuntimeError(
            f"Local tokenizer construction failed: {inner}. Ensure files encoder.json and vocab.bpe exist."
        ) from inner
print("Tokenizer vocab size:", tokenizer.vocab_size)

In [None]:
# Load the pretrained "gpt2" checkpoint as a causal LM, passing the tokenizer's EOS
# token id as pad_token_id, then move the model to the selected device.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)
model = model.to(torch_device)
print(model)

#### 1.1 Greedy Search

Greedy search is the simplest decoding method. It selects the word with the highest probability as its next word: 
$$
w_t=\operatorname{argmax}_w p_\theta\left(w \mid w_{1: t-1}\right),
$$ 
at each timestep $t$.

In [None]:
# Tokenize the prompt that generation will be conditioned on, returning PyTorch
# tensors, and move them to the selected device.
prompt = 'I enjoy walking with my cute dog'
model_inputs = tokenizer(prompt, return_tensors='pt')
model_inputs = model_inputs.to(torch_device)
print(model_inputs)

The *model_inputs* contains 
- input_ids: the token ids after tokenization
- attention_mask: a binary tensor with the same length as input_ids, telling the model which tokens are real words and which are padding. Padding is typically introduced during batched training or evaluation (e.g., by a DataLoader or collate_fn). For single-sentence inference or generation with LLMs there is no need for padding, so every attention-mask entry is 1.

In [None]:
# Re-encode the prompt so this cell is self-contained.
prompt = 'I enjoy walking with my cute dog'
model_inputs = tokenizer(prompt, return_tensors='pt').to(torch_device)

# No sampling or beam options are passed; generation is capped at 200 new tokens.
greedy_output = model.generate(max_new_tokens=200, **model_inputs)
greedy_text = tokenizer.decode(greedy_output[0], skip_special_tokens=True)

print("Output:\n" + '-' * 100)
print(greedy_text)


#### 1.2 Beam search



In [None]:
# Beam search with 5 beams and early_stopping enabled, capped at 200 new tokens.
beam_kwargs = dict(max_new_tokens=200, num_beams=5, early_stopping=True)
beam_output = model.generate(**model_inputs, **beam_kwargs)
beam_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)

print("Output:\n" + '-' * 100)
print(beam_text)

#### 1.3 Beam search with no-repeat n-grams

In [None]:
# Same beam search as before, with no_repeat_ngram_size set to 2.
ngram_beam_kwargs = dict(
    max_new_tokens=200,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
beam_output = model.generate(**model_inputs, **ngram_beam_kwargs)
beam_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)

print("Output:\n" + '-' * 100)
print(beam_text)


In [None]:
# Fix the seed so the sampled text is reproducible; change it to get different results.
from transformers import set_seed
set_seed(42)

# Sampling is switched on, and top_k is set to 0 to deactivate top-k filtering.
sampling_kwargs = dict(max_new_tokens=200, do_sample=True, top_k=0)
sample_output = model.generate(**model_inputs, **sampling_kwargs)
sample_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print("Output:\n" + '-' * 100)
print(sample_text)


In [None]:
# Fix the seed so the sampled text is reproducible; change it to get different results.
set_seed(42)

# Sampling with top-k deactivated (top_k=0) and temperature=0.6 to decrease the
# sensitivity to low-probability candidates.
temperature_kwargs = dict(
    max_new_tokens=200,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)
sample_output = model.generate(**model_inputs, **temperature_kwargs)
sample_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print("Output:\n" + '-' * 100)
print(sample_text)


In [None]:
# Fix the seed so the sampled text is reproducible; change it to get different results.
set_seed(42)

# Top-k sampling with top_k set to 50.
top_k_kwargs = dict(max_new_tokens=200, do_sample=True, top_k=50)
sample_output = model.generate(**model_inputs, **top_k_kwargs)
sample_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print("Output:\n" + '-' * 100)
print(sample_text)


#### 1.4 With Temperature


In [None]:
# Fixing the random seed lets others reproduce the result; without it the random
# draws differ on every run. Feel free to change the seed to get different results.
set_seed(42)

# Sampling with both top_p and top_k set.
top_p_kwargs = dict(
    max_new_tokens=200,
    do_sample=True,
    # top_p: sort candidates by probability from high to low and keep the smallest
    # subset whose cumulative probability just exceeds the threshold p (the subset
    # size is dynamic); sampling then happens only inside that subset, following the
    # original (or temperature-adjusted) distribution.
    top_p=0.92,
    # top_k: at every step, sample only among the k highest-probability candidate tokens.
    top_k=50,
)
sample_output = model.generate(**model_inputs, **top_p_kwargs)
sample_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)

print("Output:\n" + '-' * 100)
print(sample_text)


In [None]:
# Fix the seed so the sampled text is reproducible; change it to get different results.
set_seed(42)

# top_k = 50, top_p = 0.95, and three sequences returned for the same prompt.
multi_sample_kwargs = dict(
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)
sample_outputs = model.generate(**model_inputs, **multi_sample_kwargs)

print("Output:\n" + '-' * 100)
# Decode and print each returned sequence, prefixed with its index.
for i, sample_output in enumerate(sample_outputs):
    decoded_text = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i}: {decoded_text}")


### 2. GPT-1

#### 2.1 Sampling from GPT-1

In [None]:
from transformers import pipeline, set_seed

In [None]:
# Build a text-generation pipeline on the "openai-gpt" checkpoint, seed the RNGs,
# and request five continuations of the prompt; the last expression is the cell output.
gpt1_prompt = "Hello, I'm a language model,"
generator = pipeline(task='text-generation', model='openai-gpt')
set_seed(42)
generator(gpt1_prompt, max_length=30, num_return_sequences=5)


In [None]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import torch

# NOTE: this cell rebinds `tokenizer` and `model`, replacing the GPT-2 objects that
# the generation cells above were using.
gpt1_checkpoint = "openai-gpt"
tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt1_checkpoint)
model = OpenAIGPTModel.from_pretrained(gpt1_checkpoint)

# Run one forward pass on a single sentence and keep the final-layer hidden states.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

# Show the hidden-state vector at index [0, 0] (first sequence, first position).
print(last_hidden_states[0, 0])

In [None]:
from dataclasses import dataclass
from typing import Dict, Any

import numpy as np
import torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,)
import evaluate


# Checkpoint identifier; the same "openai-gpt" name is passed to the GPT-1 loaders
# earlier in this notebook.
MODEL_NAME = "openai-gpt"
# NOTE(review): presumably the maximum token length applied when tokenizing inputs
# for fine-tuning — its use is not visible in this part of the notebook; confirm.
MAX_LENGTH = 256

#### 2.2 Fine-tuning


[https://github.com/huggingface/pytorch-openai-transformer-lm](https://github.com/huggingface/pytorch-openai-transformer-lm)