# GPT代码解析

In [5]:
import os
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from model import GPTConfig, GPT
from torch.nn import functional as F

## 1. config
训练大型模型的配置文件通常包含了一系列详细的参数和设置，用于指导训练过程。以下是一些常见的内容：

1. **模型参数**：
   - 模型架构（例如，ResNet、Transformer等）
   - 层数、隐藏单元数、过滤器大小等
   - 激活函数类型
   - 是否使用正则化技术（如Dropout、Batch Normalization等）

2. **数据预处理**：
   - 数据集路径
   - 数据增强技术（如旋转、缩放、裁剪等）
   - 输入数据的预处理步骤（归一化、标准化等）
   - 输出数据的编码方式（类别标签、独热编码等）

3. **优化器和学习率**：
   - 优化器类型（如Adam、SGD等）
   - 初始学习率
   - 学习率衰减策略（如余弦退火、指数衰减等）
   - 权重初始化方法

4. **训练设置**：
   - 批量大小（Batch Size）
   - 训练轮数（Epochs）
   - 每个epoch的迭代次数
   - 验证集和测试集的划分

5. **损失函数**：
   - 损失函数类型（如交叉熵、均方误差等）
   - 损失函数的参数

6. **评估指标**：
   - 评估模型性能的指标（如准确率、F1分数、ROC曲线等）
   - 评估频率

7. **硬件设置**：
   - 使用的GPU或CPU数量
   - 内存限制
   - 训练过程中的资源监控

8. **日志和模型保存**：
   - 日志文件的保存路径
   - 模型检查点的保存频率
   - 最终模型的保存路径

9. **分布式训练设置**（如果适用）：
   - 分布式训练的策略（如数据并行、模型并行等）
   - 通信协议和后端设置

10. **超参数调整**（如果使用自动化工具）：
    - 超参数搜索空间
    - 搜索策略（如网格搜索、随机搜索、贝叶斯优化等）

配置文件的具体内容会根据所使用的框架（如TensorFlow、PyTorch等）、模型类型以及任务需求而有所不同。通常，配置文件的目的是为了确保实验的可重复性，同时方便地进行不同设置的实验。

In [8]:
# ===================================日志与模型保存=====================================
out_dir = 'out-test'
eval_interval = 500
log_interval = 4
eval_iters = 10
always_save_checkpoint = False  # if True, always save a checkpoint after each eval
init_from = 'scratch'  # 'scratch' or 'resume' or 'gpt2*'
# wandb logging
wandb_log = False  # disabled by default
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'  # 'run' + str(time.time())

# ===================================数据预处理===================================
dataset = 'shakespeare_char'
gradient_accumulation_steps = 4  # used to simulate larger batch sizes
batch_size = 16  # if gradient_accumulation_steps > 1, this is the micro-batch size
block_size = 256  # text

# ===================================模型参数=====================================
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.1  # for pretraining 0 is good, for finetuning try 0.1+
bias = False  # do we use bias inside LayerNorm and Linear layers?

# ===================================优化器与学习率=====================================
# adamw optimizer
learning_rate = 1e-3  # max learning rate
max_iters = 5000  # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99
grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True  # whether to decay the learning rate
warmup_iters = 100  # how many steps to warm up for
lr_decay_iters = 5000  # should be ~= max_iters per Chinchilla
min_lr = 1e-4  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

# ===================================分布式训练设置=====================================
# DDP settings
backend = 'nccl'  # 'nccl', 'gloo', etc.

# ===================================硬件设置=====================================
# system
# device = 'cuda'  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
dtype = 'bfloat16' if torch.cuda.is_available(
) and torch.cuda.is_bf16_supported() else 'float16'
compile = True  # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------
config_keys = [k for k, v in globals().items() if not k.startswith(
    '_') and isinstance(v, (int, float, bool, str))]
# exec(open('configurator.py').read()) # overrides from command line or config file
config = {k: globals()[k] for k in config_keys}  # will be useful for logging

In [9]:
seed_offset = 0
os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
# device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
device_type = device
ptdtype = {'float32': torch.float32,
           'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(
    device_type=device_type, dtype=ptdtype)

In [10]:
data_dir = os.path.join('data', dataset)
data_dir

'data/shakespeare_char'

In [11]:
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

found vocab_size = 65 (inside data/shakespeare_char/meta.pkl)


In [12]:
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=1024,
                  bias=bias, vocab_size=None, dropout=dropout)  # start with model_args from command line
if meta_vocab_size is None:
    print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
gptconf = GPTConfig(**model_args)

## 2. 加载模型

In [13]:
model = GPT(gptconf).to('cpu')
print(model)

number of parameters: 85.00M
GPT(
  (transformer): ModuleDict(
    (wte): Embedding(65, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=65, bias=False)
)


## 3. 加载数据

In [6]:
X = torch.load('X.tensor').to('cpu')
Y = torch.load('Y.tensor').to('cpu')
print('X:', X.shape)
print('Y:', Y.shape)
print('X[0,:10]: ', X[0, :10])
print('Y[0,:10]: ', Y[0, :10])

X: torch.Size([16, 256])
Y: torch.Size([16, 256])
X[0,:10]:  tensor([ 1, 40, 43,  0, 42, 39, 51, 52, 43, 42])
Y[0,:10]:  tensor([40, 43,  0, 42, 39, 51, 52, 43, 42,  1])


In [19]:
# class GPTConfig:
batch_size = 16
block_size = 1024  # lenght
n_layer = 2
n_head = 4
n_embd = 128

## 4. GPT前向传播

In [21]:
# class GPT(nn.Module).forward
device = 'cpu'
b, t = X.size()  # b是批量大小，t是序列长度
print("t:{} = block_size:{}".format(t, block_size))

# 生成一个从0到t的整数序列，用于位置编码
pos = torch.arange(0, t, dtype=torch.long, device=device)
# forward the GPT model itself
tok_emb = model.transformer.wte(X)  # token embeddings of shape (b, t, n_embd)
# position embeddings of shape (t, n_embd)
pos_emb = model.transformer.wpe(pos)
print('-----embding-input-------')
print('词嵌入向量维n_embd = ', n_embd)
print('tok_emb:', tok_emb.shape)
print('pos_emb:', pos_emb.shape)
print('tok_emb+pos_emb:', (tok_emb + pos_emb).shape)
x = model.transformer.drop(tok_emb + pos_emb)  # pretraining
x_enc = model.transformer.drop(tok_emb + pos_emb)
print('编码后embding input:', x.shape)
print('-----decoder-block-------')
print('n_layer:', n_layer)
print('decoder layers:', len(model.transformer.h))
i = 0
for block in model.transformer.h:
    print(i)
    i += 1
    x = block(x)
    print('decoder x:', x.shape)
x = model.transformer.ln_f(x)  # LayerNorm
print('ln_f x:', x.shape)
print('-----lm_head-------')
logits = model.lm_head(x)  # 得到输出logits
print('lm_head :', logits.shape)
print('lm_head输出与解码词汇量相同, meta_vocab_size=', meta_vocab_size)
print('-----loss-------')
loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                       Y.view(-1), 
                       ignore_index=-1)
print(loss)

t:256 = block_size:1024
-----embding-input-------
词嵌入向量维n_embd =  128
tok_emb: torch.Size([16, 256, 768])
pos_emb: torch.Size([256, 768])
tok_emb+pos_emb: torch.Size([16, 256, 768])
编码后embding input: torch.Size([16, 256, 768])
-----decoder-block-------
n_layer: 2
decoder layers: 12
0
decoder x: torch.Size([16, 256, 768])
1
decoder x: torch.Size([16, 256, 768])
2
decoder x: torch.Size([16, 256, 768])
3
decoder x: torch.Size([16, 256, 768])
4
decoder x: torch.Size([16, 256, 768])
5
decoder x: torch.Size([16, 256, 768])
6
decoder x: torch.Size([16, 256, 768])
7
decoder x: torch.Size([16, 256, 768])
8
decoder x: torch.Size([16, 256, 768])
9
decoder x: torch.Size([16, 256, 768])
10
decoder x: torch.Size([16, 256, 768])
11
decoder x: torch.Size([16, 256, 768])
ln_f x: torch.Size([16, 256, 768])
-----lm_head-------
lm_head : torch.Size([16, 256, 65])
lm_head输出与解码词汇量相同, meta_vocab_size= 65
-----loss-------
tensor(4.4368, grad_fn=<NllLossBackward0>)


In [23]:
# h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
# class Block(nn.Module)
print('-------------decoder block-------------')
decoder_block = model.transformer.h[0]  # 从模型的解码器层中获取第一个解码器块
print(decoder_block)

# 根据上面打印的第一个解码器块的信息，分解解码器块的计算过程如下
x = x_enc
x_ln_1 = decoder_block.ln_1(x)
x_attn = decoder_block.attn(x_ln_1)
x_ln_2 = decoder_block.ln_2(x_attn)
x_mlp = decoder_block.mlp(x_ln_2)

print('layer norm :', x_ln_1.shape)
print('masked_self_attention :', x_attn.shape)
print('layer norm :', x_ln_2.shape)
print('mlp :', x_mlp.shape)

# 将x通过解码器块的第一个层归一化层和自注意力层，然后与原始的x相加。这是残差连接的一部分，可以帮助模型学习恒等函数，从而更深入地学习
x = x + decoder_block.attn(decoder_block.ln_1(x))
# 将x通过解码器块的第二个层归一化层和多层感知机层，然后与原始的x相加。这是残差连接的另一部分
x = x + decoder_block.mlp(decoder_block.ln_2(x))

-------------decoder block-------------
Block(
  (ln_1): LayerNorm()
  (attn): CausalSelfAttention(
    (c_attn): Linear(in_features=768, out_features=2304, bias=False)
    (c_proj): Linear(in_features=768, out_features=768, bias=False)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm()
  (mlp): MLP(
    (c_fc): Linear(in_features=768, out_features=3072, bias=False)
    (gelu): GELU(approximate='none')
    (c_proj): Linear(in_features=3072, out_features=768, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
layer norm : torch.Size([16, 256, 768])
masked_self_attention : torch.Size([16, 256, 768])
layer norm : torch.Size([16, 256, 768])
mlp : torch.Size([16, 256, 768])


In [24]:
# Masked Self Attention
# 查看模型中attention的计算过程
attention = model.transformer.h[0].attn
print(attention)
print("如果torch>2.0.0, 是否可直接使用scaled_dot_product_attention：", attention.flash)

x = x_ln_1
B, T, C = x.size()
# batch:16, block:256, embed:768
print("batch:{}, block:{}, embed:{}, ".format(B, T, C))


# self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
print('---------------1. 将嵌入向量传播成3*n_embd--------------')
x_liner = attention.c_attn(x)  # 将x通过自注意力层的线性层，得到3倍嵌入维度的输出
print("n_embed:", attention.n_embd)
print("n_embed*3:", attention.n_embd*3)  # 3份分别是QKV
print("x_liner:", x_liner.shape)

print('---------------2. 将3*n_embd split成QKV--------------')
q, k, v = x_liner.split(attention.n_embd, dim=2)
print("split: q:", q.shape)

print('---------------3. 将QKV拆分 多头QKV--------------')
print("n_embed:{} / n_head:{} = {} ".format(C, attention.n_head, C//attention.n_head))
k = k.view(B, T, attention.n_head, C //
           attention.n_head).transpose(1, 2)  # (B, nh, T, hs)
q = q.view(B, T, attention.n_head, C //
           attention.n_head).transpose(1, 2)  # (B, nh, T, hs)
v = v.view(B, T, attention.n_head, C //
           attention.n_head).transpose(1, 2)  # (B, nh, T, hs)
print("q:", q.shape)
print("Q = batch:{}, n_head:{}, block:{}, head_embed:{} ".format(B, n_head, T, C//attention.n_head))


print('---------------4.多头计算attention，直接使用torch function--------------')
# 计算缩放点积注意力
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, 
                                                     attn_mask=None,
                                                     dropout_p=attention.dropout if attention.training else 0, 
                                                     is_causal=True)
print('y:', y.shape)


print('---------------5.将多头注意力结果拼接--------------')
# re-assemble all head outputs side by side
y = y.transpose(1, 2).contiguous().view(B, T, C)  # transpose
print('y-concat:', y.shape)


print('---------------6. 增加一次前向传播--------------')
# 将重塑后的输出通过自注意力层的另一个线性层，并应用残差dropout
y = attention.resid_dropout(attention.c_proj(y))
print("y_proj:", y.shape)

CausalSelfAttention(
  (c_attn): Linear(in_features=768, out_features=2304, bias=False)
  (c_proj): Linear(in_features=768, out_features=768, bias=False)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)
如果torch>2.0.0, 是否可直接使用scaled_dot_product_attention： True
batch:16, block:256, embed:768, 
---------------1. 将嵌入向量传播成3*n_embd--------------
n_embed: 768
n_embed*3: 2304
x_liner: torch.Size([16, 256, 2304])
---------------2. 将3*n_embd split成QKV--------------
split: q: torch.Size([16, 256, 768])
---------------3. 将QKV拆分 多头QKV--------------
n_embed:768 / n_head:12 = 64 
q: torch.Size([16, 12, 256, 64])
Q = batch:16, n_head:4, block:256, head_embed:64 
---------------4.多头计算attention，直接使用torch function--------------
y: torch.Size([16, 12, 256, 64])
---------------5.将多头注意力结果拼接--------------
y-concat: torch.Size([16, 256, 768])
---------------6. 增加一次前向传播--------------
y_proj: torch.Size([16, 256, 768])


In [25]:
masked_matrix = torch.tril(torch.ones(T, T)).view(1, 1, T, T)
print(masked_matrix[:10, :10])

tensor([[[[1., 0., 0.,  ..., 0., 0., 0.],
          [1., 1., 0.,  ..., 0., 0., 0.],
          [1., 1., 1.,  ..., 0., 0., 0.],
          ...,
          [1., 1., 1.,  ..., 1., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]])


In [26]:
# attention.register_buffer("bias", torch.tril(torch.ones(block_size, block_size))
#                                         .view(1, 1, block_size, block_size))
# print(bias)
# 不使用torch2.0 attention计算
print('5行代码实现多头注意力计算')
# 1. scale and dot product process
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
print("q score:", q.shape)
print("k score:", k.shape)
print("k_t score:", k.transpose(-2, -1).shape)
print("q @ k_t score:", (q@k.transpose(-2, -1)).shape)
print("attn score:", att.shape)
# 2. Mask为下三角矩阵
att = att.masked_fill(masked_matrix == 0, float('-inf'))
# 3. softmax
att = F.softmax(att, dim=-1)
# 4. attn
att = attention.attn_dropout(att)
# 5. score
y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
print("attn score:", y.shape)

5行代码实现多头注意力计算
q score: torch.Size([16, 12, 256, 64])
k score: torch.Size([16, 12, 256, 64])
k_t score: torch.Size([16, 12, 64, 256])
q @ k_t score: torch.Size([16, 12, 256, 256])
attn score: torch.Size([16, 12, 256, 256])
attn score: torch.Size([16, 12, 256, 64])


In [27]:
print(masked_matrix)

tensor([[[[1., 0., 0.,  ..., 0., 0., 0.],
          [1., 1., 0.,  ..., 0., 0., 0.],
          [1., 1., 1.,  ..., 0., 0., 0.],
          ...,
          [1., 1., 1.,  ..., 1., 0., 0.],
          [1., 1., 1.,  ..., 1., 1., 0.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]])


In [28]:
# mlp实现
mlp = model.transformer.h[0].mlp
print(mlp)
print("x:", x.shape)
x = mlp.c_fc(x)
print("x_fc:", x.shape)
x = mlp.gelu(x)
x = mlp.c_proj(x)
print("x_proj:", x.shape)
x = mlp.dropout(x)

MLP(
  (c_fc): Linear(in_features=768, out_features=3072, bias=False)
  (gelu): GELU(approximate='none')
  (c_proj): Linear(in_features=3072, out_features=768, bias=False)
  (dropout): Dropout(p=0.1, inplace=False)
)
x: torch.Size([16, 256, 768])
x_fc: torch.Size([16, 256, 3072])
x_proj: torch.Size([16, 256, 768])


In [30]:
# layer normalization
ln = model.transformer.h[0].ln_1
print(x.shape)
F.layer_norm(x, ln.weight.shape, ln.weight, ln.bias, 1e-5)
print(x.shape)

torch.Size([16, 256, 768])
torch.Size([16, 256, 768])


In [31]:
Mask = torch.tril(torch.ones(5, 5)).view(1, 1, 5, 5)
print(Mask)

tensor([[[[1., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0.],
          [1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1.]]]])


## 5. BPE

In [32]:
from bpe import Encoder
import sys
sys.path.append('.')  # 将当前目录添加到Python的模块搜索路径。这样做是为了能够导入当前目录下的模块


test_corpus = '''
    Object raspberrypi functools dict kwargs. Gevent raspberrypi functools. Dunder raspberrypi decorator dict didn't lambda zip import pyramid, she lambda iterate?
    Kwargs raspberrypi diversity unit object gevent. Import fall integration decorator unit django yield functools twisted. Dunder integration decorator he she future. Python raspberrypi community pypy. Kwargs integration beautiful test reduce gil python closure. Gevent he integration generator fall test kwargs raise didn't visor he itertools...
    Reduce integration coroutine bdfl he python. Cython didn't integration while beautiful list python didn't nit!
    Object fall diversity 2to3 dunder script. Python fall for: integration exception dict kwargs dunder pycon. Import raspberrypi beautiful test import six web. Future integration mercurial self script web. Return raspberrypi community test she stable.
    Django raspberrypi mercurial unit import yield raspberrypi visual rocksdahouse. Dunder raspberrypi mercurial list reduce class test scipy helmet zip?
'''

# params chosen for demonstration purposes
# 词汇表大小为200，其中88%（即176个）的词汇将由BPE生成，剩下的12%（即24个）的词汇将是单个字符。
encoder = Encoder(200, pct_bpe=0.88)
encoder.fit(test_corpus.split('\n'))

example = "Vizzini: He didn't fall? INCONCEIVABLE!"
print(encoder.tokenize(example)) 
print(next(encoder.transform([example])))
print(next(encoder.inverse_transform(encoder.transform([example]))))

['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
[24, 108, 82, 83, 71, 25, 24, 154, 25, 14, 10, 11, 12, 13, 24, 85, 25, 24, 140, 59, 39, 157, 87, 165, 114, 25, 24, 148, 25]
vizzini : he didn ' t fall ? inconceivable !


In [1]:
from bpe import Encoder
import sys
sys.path.append('.')


test_corpus = '''
    old older finest finer best 
'''

encoder = Encoder(30, pct_bpe=0.88)  # params chosen for demonstration purposes
encoder.fit(test_corpus.split('\n'))
print(encoder.bpe_vocab)

example = "oldest"
print(encoder.tokenize(example))
print(next(encoder.transform([example])))
print(next(encoder.inverse_transform(encoder.transform([example]))))

{'__eow': 3, '__sow': 4, 'e': 5, 'r': 6, 'er': 7, 'f': 8, 'i': 9, 'n': 10, 's': 11, 't': 12, 'fi': 13, 'in': 14, 'ne': 15, 'es': 16, 'st': 17, 'o': 18, 'l': 19, 'd': 20, 'ol': 21, 'ld': 22, 'de': 23, 'b': 24, 'be': 25}
['__sow', 'ol', 'de', 'st', '__eow']
[4, 21, 23, 17, 3]
oldest


In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(tokenizer)

idx = tokenizer("pneumonoultramicroscopicsilicovolcanoconiosis")["input_ids"]
print(idx)

## 6. GPT生成

```python
# NanoGPT
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
```

In [16]:
# Greedy Generation
# 词表大小为65，每次预测都是65个概率值，表示下一个token的概率，取概率最大的token作为下一个token
idx = X[1, :10].reshape(1, 10)  # generate 1
print("预测词表大小", model.config.vocab_size)  # BPE词表
print("输入X:", idx)
print("输入X长度:", idx.shape)
for _ in range(2):
    logits, _ = model(idx)
    print('输出Logits:', logits.shape)

    probs = F.softmax(logits, dim=-1)
    print('输出Probs:', probs.shape)
    print(probs)

    idx_next = torch.argmax(probs, dim=2)  # dim=2表示在第2维度上取最大值
    print('输出下一个Token:', idx_next.shape)
    print(idx_next)

    idx = torch.cat((idx, idx_next), dim=1)  # 将下一个Token拼接到输入序列上
    print("当前的token长度:", len(idx[0]))
    print("当前的token序列:", idx)

预测词表大小 65
输入X: tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1]])
输入X长度: torch.Size([1, 10])
输出Logits: torch.Size([1, 1, 65])
输出Probs: torch.Size([1, 1, 65])
tensor([[[0.0056, 0.0283, 0.0148, 0.0266, 0.0032, 0.0164, 0.0146, 0.0121,
          0.0085, 0.0279, 0.0116, 0.0108, 0.0156, 0.0094, 0.0293, 0.0083,
          0.0114, 0.0336, 0.0144, 0.0314, 0.0182, 0.0098, 0.0190, 0.0178,
          0.0196, 0.0120, 0.0126, 0.0138, 0.0113, 0.0144, 0.0189, 0.0072,
          0.0262, 0.0077, 0.0210, 0.0154, 0.0101, 0.0126, 0.0108, 0.0270,
          0.0085, 0.0257, 0.0118, 0.0273, 0.0121, 0.0076, 0.0039, 0.0077,
          0.0083, 0.0162, 0.0126, 0.0117, 0.0082, 0.0138, 0.0120, 0.0077,
          0.0071, 0.0269, 0.0142, 0.0303, 0.0162, 0.0225, 0.0162, 0.0254,
          0.0068]]], grad_fn=<SoftmaxBackward0>)
输出下一个Token: torch.Size([1, 1])
tensor([[17]])
当前的token长度: 11
当前的token序列: tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1, 17]])
输出Logits: torch.Size([1, 1, 65])
输出Probs: torch.Size([1, 1, 65])
tensor([

In [17]:
torch.set_printoptions(precision=4)  # 设置打印精度
torch.set_printoptions(sci_mode=False)  # 设置打印模式为非科学计数法
torch.set_printoptions(linewidth=100)  # 设置打印行宽

In [18]:
# model.generate()
# top-k GPT-2
idx = X[1, :10].reshape(1, 10)  # generate 1
print(idx.shape)
temperature = 1.0
top_k = 5
print(model.config.vocab_size)  # BPE词表
print("prompt:", idx)

# top k的方法
# 词表 65
# llama 32000

for _ in range(10):
    idx_cond = idx if idx.size(
        1) <= model.config.block_size else idx[:, -model.config.block_size:]
    print('idx_cond:', idx_cond.shape)

    logits, _ = model(idx_cond)
    print('logits:', logits.shape)

    print("no - 1", logits[:, -1, :].shape)  # 为什么要-1来降维

    logits = logits[:, -1, :] / temperature  # 平缓， tips : 知识蒸馏[温度]
    print('logits:', logits.shape)

    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        print('top_k v:', v.shape)

#         logits[logits < v[:, [-1]]] = -float('Inf')
        print('top_k logits:', logits.shape)
        print(logits)

    probs = F.softmax(logits, dim=-1)
    print('probs :', probs.shape)
    print(probs)

    # num_samples-by-num_samples
    idx_next = torch.multinomial(probs, num_samples=1)  # 从多项分布中采样
    print('idx_next :', idx_next.shape)
    print(idx_next)

    idx = torch.cat((idx, idx_next), dim=1)
    print(idx.shape)

    print("generate: length :", len(idx[0]))
    print("generate:", idx)

torch.Size([1, 10])
65
prompt: tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1]])
idx_cond: torch.Size([1, 10])
logits: torch.Size([1, 1, 65])
no - 1 torch.Size([1, 65])
logits: torch.Size([1, 65])
top_k v: torch.Size([1, 5])
top_k logits: torch.Size([1, 65])
tensor([[-0.8807,  0.3733,  0.1661,  0.6737, -1.1515, -0.0462,  0.3196, -0.7420, -0.7176,  0.6705,
         -0.4549, -0.0960, -0.6452, -0.6828, -0.4365, -0.0968, -0.8049,  0.8895, -0.1366,  0.1941,
          1.0271,  0.0642,  0.5817,  0.4299,  0.1736,  0.0942,  0.0415, -0.1390, -0.1196,  0.0969,
         -0.0389, -0.1811,  0.4824, -1.5111,  0.6647,  0.2604, -0.6491, -0.2169, -0.8141,  0.1885,
         -0.4787,  0.3125, -0.2149,  0.7776,  0.0711, -0.2495, -0.4972, -0.6088, -0.4011, -0.0208,
         -0.2463, -0.1300, -1.1869, -0.6447, -0.4758, -0.7341, -0.0147,  0.7913, -0.7784,  0.0524,
          0.4630,  0.3149,  0.0990,  0.1935, -0.3702]], grad_fn=<DivBackward0>)
probs : torch.Size([1, 65])
tensor([[0.0062, 0.0217, 0.0177, 0.029

In [None]:
# model.generate(idx, 20, 1.0, None) # temparature 1.0 ,  Top_K None

In [19]:
# sampling
dict_map = {0: "小", 1: "冬", 2: "瓜"}
prob = torch.tensor([0.3, 0.3, 0.0])
print("prob : ", prob)

print("根据概率权重所选择next token的下标")
for i in range(10):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:",
          dict_map[int(next_token)])

prob :  tensor([0.3000, 0.3000, 0.0000])
根据概率权重所选择next token的下标
第0次sample的next_token下标为1: 冬
第1次sample的next_token下标为0: 小
第2次sample的next_token下标为0: 小
第3次sample的next_token下标为1: 冬
第4次sample的next_token下标为0: 小
第5次sample的next_token下标为0: 小
第6次sample的next_token下标为0: 小
第7次sample的next_token下标为1: 冬
第8次sample的next_token下标为1: 冬
第9次sample的next_token下标为0: 小


In [20]:
# Do Sample with temparature
temparature = 2.0
dict_map = {0: "小", 1: "冬", 2: "瓜"}
prob = torch.tensor([0.7, 0.2, 0.1])
print("prob : ", prob)
prob /= temparature
print("prob/T : ", prob)
prob = F.softmax(prob)
print("softmax(prob/T) : ", prob)

print("根据概率权重所选择next token的下标")
for i in range(10):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:",
          dict_map[int(next_token)])

prob :  tensor([0.7000, 0.2000, 0.1000])
prob/T :  tensor([0.3500, 0.1000, 0.0500])
softmax(prob/T) :  tensor([0.3969, 0.3091, 0.2940])
根据概率权重所选择next token的下标
第0次sample的next_token下标为1: 冬
第1次sample的next_token下标为0: 小
第2次sample的next_token下标为1: 冬
第3次sample的next_token下标为0: 小
第4次sample的next_token下标为1: 冬
第5次sample的next_token下标为0: 小
第6次sample的next_token下标为0: 小
第7次sample的next_token下标为1: 冬
第8次sample的next_token下标为1: 冬
第9次sample的next_token下标为1: 冬


  prob = F.softmax(prob)


In [21]:
# top-k
temparature = 2.0
top_k = 2
dict_map = {0: "小", 1: "冬", 2: "瓜"}
prob = torch.tensor([0.7, 0.2, 0.1])
print("prob : ", prob)
prob /= temparature
print("prob/T : ", prob)
prob = F.softmax(prob)
print("softmax(prob/T) : ", prob)
prob, _ = torch.topk(prob, top_k)
print("top-k:", prob)
prob = F.softmax(prob)
print("top-k softmax:", prob)
print("根据概率权重所选择next token的下标")
for i in range(10):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:",
          dict_map[int(next_token)])

prob :  tensor([0.7000, 0.2000, 0.1000])
prob/T :  tensor([0.3500, 0.1000, 0.0500])
softmax(prob/T) :  tensor([0.3969, 0.3091, 0.2940])
top-k: tensor([0.3969, 0.3091])
top-k softmax: tensor([0.5219, 0.4781])
根据概率权重所选择next token的下标
第0次sample的next_token下标为0: 小
第1次sample的next_token下标为0: 小
第2次sample的next_token下标为0: 小
第3次sample的next_token下标为1: 冬
第4次sample的next_token下标为0: 小
第5次sample的next_token下标为1: 冬
第6次sample的next_token下标为1: 冬
第7次sample的next_token下标为1: 冬
第8次sample的next_token下标为0: 小
第9次sample的next_token下标为0: 小


  prob = F.softmax(prob)
  prob = F.softmax(prob)


In [22]:
# repetition penalty
# GPT-2模型进行文本生成的过程，并在生成过程中应用了重复性惩罚
idx = X[1, :10].reshape(1, 10)
print(idx)
penalty = 2.0  # 重复性惩罚的权重
for _ in range(256):
    logits, _ = model(idx)
    logits = logits[:, -1, :]
    original_logits = logits.clone()

    print(logits)

    # repetition penalty
    logits_idx = torch.gather(logits, 1, idx)
    logits_idx = torch.where(logits < 0, logits *
                             penalty, logits / penalty).clone()
    logits.scatter_(1, idx, logits_idx)
    print(logits)

    probs = F.softmax(logits, dim=-1)

    idx_next = torch.multinomial(probs, num_samples=1)
    idx = torch.cat((idx, idx_next), dim=1)

    break

tensor([[ 0, 13, 52, 42,  1, 26, 43, 56, 53,  1]])
tensor([[-1.1206,  0.7769,  0.2215,  0.3090, -1.6671,  0.4123,  0.1556,  0.4466,  0.1015,  0.9674,
         -0.3920, -0.3790, -0.5708, -0.6095,  0.0887, -0.6818, -1.1605,  0.5678, -0.3892,  0.6791,
          0.1930, -0.0099,  0.7034,  0.4553,  0.0086,  0.1226, -0.3700,  0.1384, -0.2160,  0.0735,
          0.6738,  0.3342,  0.1935, -1.0596,  0.3474,  0.3951, -0.4422,  0.2010, -0.2527,  0.5982,
          0.5057,  0.0167, -0.4491,  0.5445,  0.7778, -0.1056, -1.1756, -0.5428, -0.7051,  0.1678,
          0.0762, -0.2580, -1.0917,  0.3266, -0.2051, -0.0857, -1.0482,  0.7362, -0.1279,  0.3829,
          0.5765,  0.1140,  0.2138,  0.6706, -0.5340]], grad_fn=<SliceBackward0>)
tensor([[-2.2412,  0.4837,  0.2215,  0.3090, -1.6671,  0.4123,  0.1556,  0.4466,  0.1015,  0.9674,
         -0.3920, -0.3790, -0.5708,  0.3884,  0.0887, -0.6818, -1.1605,  0.5678, -0.3892,  0.6791,
          0.1930, -0.0099,  0.7034,  0.4553,  0.0086,  0.1226,  0.2061,  0.

In [23]:
# sampling
dict_map = {0: "小", 1: "冬", 2: "瓜"}
prob = torch.tensor([0.3, 0.3, 0])
print("prob : ", prob)

print("根据概率权重所选择next token的下标")
for i in range(100):
    next_token = torch.multinomial(prob, num_samples=1)[0]
    print(f"第{i}次sample的next_token下标为{next_token}:",
          dict_map[int(next_token)])

prob :  tensor([0.3000, 0.3000, 0.0000])
根据概率权重所选择next token的下标
第0次sample的next_token下标为1: 冬
第1次sample的next_token下标为1: 冬
第2次sample的next_token下标为1: 冬
第3次sample的next_token下标为0: 小
第4次sample的next_token下标为1: 冬
第5次sample的next_token下标为1: 冬
第6次sample的next_token下标为0: 小
第7次sample的next_token下标为1: 冬
第8次sample的next_token下标为1: 冬
第9次sample的next_token下标为1: 冬
第10次sample的next_token下标为0: 小
第11次sample的next_token下标为0: 小
第12次sample的next_token下标为0: 小
第13次sample的next_token下标为1: 冬
第14次sample的next_token下标为0: 小
第15次sample的next_token下标为1: 冬
第16次sample的next_token下标为0: 小
第17次sample的next_token下标为0: 小
第18次sample的next_token下标为0: 小
第19次sample的next_token下标为0: 小
第20次sample的next_token下标为0: 小
第21次sample的next_token下标为1: 冬
第22次sample的next_token下标为1: 冬
第23次sample的next_token下标为1: 冬
第24次sample的next_token下标为1: 冬
第25次sample的next_token下标为1: 冬
第26次sample的next_token下标为0: 小
第27次sample的next_token下标为0: 小
第28次sample的next_token下标为1: 冬
第29次sample的next_token下标为1: 冬
第30次sample的next_token下标为0: 小
第31次sample的next_token下标为1: 冬
第32次sample的next_to

## 7. Huggingface Transformer Generate

In [25]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

torch_device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model_GPT = AutoModelForCausalLM.from_pretrained("/Volumes/WD_BLACK/models/gpt2",
                                                 pad_token_id=tokenizer.eos_token_id).to(torch_device)

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
text = "pneumonoultramicroscopicsilicovolcanoconiosis "
# device = "cuda:0"

print(text)

inputs = tokenizer(text, return_tensors="pt")

print(inputs['input_ids'])
print(inputs['input_ids'][0, 0])
print(inputs['input_ids'][0, 1])
print(inputs['input_ids'][0, 2])


outputs = model_GPT.generate(**inputs, max_new_tokens=50)
print(outputs[0])

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

pneumonoultramicroscopicsilicovolcanoconiosis 
tensor([[   79, 25668,   261, 25955,   859,  2500,  1416,   404,   873, 41896,   709,   349,  5171,
         36221, 42960,   220]])
tensor(79)
tensor(25668)
tensor(261)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([   79, 25668,   261, 25955,   859,  2500,  1416,   404,   873, 41896,   709,   349,  5171,
        36221, 42960,   220,  1924,  1169,  2611,    11,   220,  1924,  1169,  2611,    11,   220,
         1924,  1169,  2611,    11,   220,  1924,  1169,  2611,    11,   220,  1924,  1169,  2611,
           11,   220,  1924,  1169,  2611,    11,   220,  1924,  1169,  2611,    11,   220,  1924,
         1169,  2611,    11,   220,  1924,  1169,  2611,    11,   220,  1924,  1169,  2611,    11,
          220])
pneumonoultramicroscopicsilicovolcanoconiosis erythema, erythema, erythema, erythema, erythema, erythema, erythema, erythema, erythema, erythema, 


In [27]:
# Generate Greedy Search

# encode context the generation is conditioned on
model_inputs = tokenizer('I enjoy walking with my cute dog',
                         return_tensors='pt').to(torch_device)

# generate 40 new tokens
greedy_output = model_GPT.generate(**model_inputs, max_new_tokens=40)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))  # skip_special_tokens=True, 去掉特殊token

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.

I'm not sure


In [28]:
# Generate Beam search
# activate beam search and early_stopping
beam_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I'm not sure if I'll ever be able to walk with him again. I'm not sure


In [29]:
# Generate Beam Searching
# set no_repeat_ngram_size to 2
beam_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    no_repeat_ngram_size=2,  # 表示不重复的ngram大小
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to


In [30]:
# Generate beam searching
# set return_num_sequences > 1
beam_outputs = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,  # 返回5个序列
    early_stopping=True
)

# now we have 3 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("{}: {}".format(i, tokenizer.decode(
      beam_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to
1: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with her again.

I've been thinking about this for a while now, and I think it's time for me to
2: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's a good idea to
3: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time to take a
4: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's a good idea

In [31]:
# sampling
# set seed to reproduce results. Feel free to change the seed though to get different results
from transformers import set_seed
set_seed(13)

# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=False,  # 采样，False表示贪婪搜索，True表示采样
    top_k=0
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.

I'm not sure


In [32]:
# sampling temparature
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# use temperature to decrease the sensitivity to low probability candidates
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=0,
    temperature=0.6,
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, and I was delighted to have him on my show, so I had a chance to see him. I was very impressed with his body, and I am looking forward to seeing what he has to


In [33]:
# Top-K Sampling
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, which is a little unusual in this part of our family. He's a friendly, calm kind of dog, and I've always wanted to have him around, and I always wanted to go with


In [34]:
# Top-p (nucleus) sampling
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(42)

# set top_k to 50
sample_output = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_p=0.92,
    top_k=50
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, which is a little unusual in this part of our family. He's a friendly, calm, kind, caring person. He always makes us happy. The other dog I've gotten is an American


In [35]:
# set seed to reproduce results. Feel free to change the seed though to get different results
set_seed(12)

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model_GPT.generate(
    **model_inputs,
    max_new_tokens=40,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(
      sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
0: I enjoy walking with my cute dog, but there's another side to that. Because I am a single mom, and the fact that I am doing this kind of thing for my dog has caused me a lot of stress. Especially since
1: I enjoy walking with my cute dog and always try not to let the dog get close to me. I would also suggest getting to know him better as he loves to be led by me and his paws which help in moving me. If
2: I enjoy walking with my cute dog, and she's never been a shy, sweet, or carefree person in her life. She loves to be hugged, and I'm so sorry we never found that out."

On Facebook
