In [1]:
from GPT2 import GPT, GPTConfig
import torch
import torch.nn as nn
from torch.nn import functional as F

### 1、测试模型框架搭建的是否合理，能否加载Huggface的gpt2预训练权重

In [2]:
num_return_sequences = 5
max_length = 30


model = GPT.from_pretrained('gpt2')

model.eval()
model.to('cuda')

loading weights from pretrained gpt: gpt2


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# -------------------------------------------------------------

### 2、给一段文本，测试能否使用预训练的权重进行回答

In [3]:
num_return_sequences = 5
max_length = 30

model = GPT.from_pretrained('gpt2')

model.eval()
model.to('cuda')

import tiktoken
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode("Hello, I`m a language model,")
tokens = torch.tensor(tokens, dtype=torch.long) #(8,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) #(5, 8)
x = tokens.to('cuda')

In [6]:
# 生成回答
torch.manual_seed(42)
torch.cuda.manual_seed(42)

while x.size(1) < max_length:
    # 前向传播获取logits
    logits, _ = model(x)

    logits = logits[:, -1, :] # (B, vocab_size)

    # softmax操作
    probs = F.softmax(logits, dim=-1)

    # 选择前50个， 最有可能的预测
    topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)

    # 从50个中选择一个
    ix = torch.multinomial(topk_probs, 1)

    # 
    xcol = torch.gather(topk_indices, -1, ix)

    x = torch.cat((x, xcol), dim=1)


In [7]:
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)

> Hello, I`m a language model, and I like to write programs for you if you're interested in building. You can learn more about programming
> Hello, I`m a language model, not a database. So I want to make a programming language. This is my goal: to create systems
> Hello, I`m a language model, not a science."

Mitt Romney spoke with the Associated Press, saying he does not believe that
> Hello, I`m a language model, you guessed it.

The code below breaks down the way to implement this object into a full set
> Hello, I`m a language model, a language model based on Python, for every class in the database. How do you think this language is


# -------------------------------------------------------------

### 3、从小的数据集中拉取一些数据，分词处理后，测试模型输出的logits是否正常，loss是否正常

In [8]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

# 准备小批量数据
import tiktoken

enc = tiktoken.get_encoding('gpt2')
with open('input.txt', 'r') as  f:
    text = f.read()

data = text[:1000]
tokens = enc.encode(data)

B, T = 4, 32            # 批量为4， 序列长度为32

import torch
buf = torch.tensor(tokens[:B*T + 1])     # 先转成tensor格式
x = buf[:-1].view(B, T)                 # 小批量的输入，直接调整张量形状就可以做到分出批量了
y = buf[1:].view(B, T) 

x = x.to(device)
y = y.to(device)


In [9]:
# 查看输出和损失是否正常
model = GPT(GPTConfig)
model.to(device)
logits, loss  = model(x, y)
 
print(logits.shape)
print(loss)

torch.Size([4, 32, 50257])
tensor(10.9840, device='cuda:0', grad_fn=<NllLossBackward0>)


# -------------------------------------------------------------

### 4、构建一个简单的循环测试模型参数能否正常更新

In [14]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')

# 准备小批量数据
import tiktoken

enc = tiktoken.get_encoding('gpt2')
with open('input.txt', 'r') as  f:
    text = f.read()

data = text[:1000]
tokens = enc.encode(data)

B, T = 4, 32            # 批量为4， 序列长度为32

import torch
buf = torch.tensor(tokens[:B*T + 1])     # 先转成tensor格式
x = buf[:-1].view(B, T)                 # 小批量的输入，直接调整张量形状就可以做到分出批量了
y = buf[1:].view(B, T) 

x = x.to(device)
y = y.to(device)


# 参数优化器
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)


# 构建简单训练循环
for i in range(50):
    optimizer.zero_grad()

    logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
    print(f"step {i}, loss:{loss.item()}")


step 0, loss:0.0015552322147414088
step 1, loss:0.12253555655479431
step 2, loss:0.007488030008971691
step 3, loss:0.0035188908223062754
step 4, loss:0.0046402993611991405
step 5, loss:0.004251120612025261
step 6, loss:0.003709079697728157
step 7, loss:0.0033120911102741957
step 8, loss:0.002933483337983489
step 9, loss:0.002520725829526782
step 10, loss:0.0021545328199863434
step 11, loss:0.0018859220435842872
step 12, loss:0.0016903379000723362
step 13, loss:0.0015348225133493543
step 14, loss:0.001398595399223268
step 15, loss:0.0012718155048787594
step 16, loss:0.001152080250903964
step 17, loss:0.0010405817301943898
step 18, loss:0.0009397428366355598
step 19, loss:0.0008509515319019556
step 20, loss:0.0007745805778540671
step 21, loss:0.00070955790579319
step 22, loss:0.0006544988718815148
step 23, loss:0.0006077433936297894
step 24, loss:0.0005677159642800689
step 25, loss:0.0005328723345883191
step 26, loss:0.0005021595861762762
step 27, loss:0.0004746438644360751
step 28, loss