In [1]:
import sys
sys.path.append("minGPT")   # make the `mingpt` package importable

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from minGPT.mingpt.model import GPT
from minGPT.mingpt.utils import set_seed
from minGPT.mingpt.bpe import BPETokenizer

# Fix the seed before anything random happens.
# NOTE(review): presumably this seeds the RNGs used by the sampling code below —
# confirm against mingpt.utils.set_seed.
set_seed(3407)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the pretrained checkpoint handed to GPT.from_pretrained in a later cell.
model_type = "gpt2"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer used by the dataset, the training loop and generation below.
tokenizer = BPETokenizer()

In [3]:
# Build the minGPT model from the pretrained checkpoint; the second return
# value is not used in this notebook.
# Per the original note: pass types=None to switch the memory module off.
model, _ = GPT.from_pretrained(model_type, types="nm")
# Module.to() returns the module itself, so rebinding keeps the same object
# and (like the trailing ";" it replaces) leaves the cell without output.
model = model.to(device)

number of parameters: 124.44M


In [4]:
from minGPT.mingpt.program_dataset import ProgramDataset
from torch.utils.data import DataLoader
from dynamic_cheatsheet import DynamicCheatsheetMemory
from dynamic_cheatsheet.config_loader import load_config
from dataclasses import asdict
import random

# Prepare the training dataset from the JSONL file, using the tokenizer
# created above.
train_dataset = ProgramDataset(
    tokenizer=tokenizer,
    block_size=1024,
    jsonl_path="./data_reverse_dropvowel/train.jsonl",
)

# Get one batch of data.
def get_one_batch(dataset, batch_size):
    """Draw ``batch_size`` random items from ``dataset`` (with replacement).

    Every dataset item is unpacked as ``(idx, target, prompt, answer)``.

    Args:
        dataset: indexable collection that supports ``len()``.
        batch_size: number of items to draw.

    Returns:
        A tuple ``(x, y, prompts, answers)``: ``x`` and ``y`` are the drawn
        ``idx`` / ``target`` tensors stacked along a new leading dimension,
        ``prompts`` and ``answers`` are lists in the same sample order.
    """
    samples = []
    for _ in range(batch_size):
        # One independent draw per slot, so the same item may appear twice.
        position = random.randrange(len(dataset))
        idx, target, prompt, answer = dataset[position]
        samples.append((idx, target, prompt, answer))

    x = torch.stack([sample[0] for sample in samples], dim=0)
    y = torch.stack([sample[1] for sample in samples], dim=0)
    prompts = [sample[2] for sample in samples]
    answers = [sample[3] for sample in samples]

    return x, y, prompts, answers

# Draw a 32-sample batch and print a quick sanity check of what came back.
x, y, prompts, answers = get_one_batch(train_dataset, batch_size=32)

for label, tensor in (("输入形状：", x), ("目标形状：", y)):
    print(label, tensor.shape)

# Index 15 is simply the sample chosen for inspection here.
for tag, items in (("prompt", prompts), ("answer", answers)):
    print(f"{tag}的类型", type(items), len(items))
    print(f"{tag}的第15个元素：", items[15])

输入形状： torch.Size([32, 1024])
目标形状： torch.Size([32, 1024])
prompt的类型 <class 'list'> 32
prompt的第15个元素： Task: Reverse the string and remove vowels (a,e,i,o,u).
Input: wcrwbjdqprjw
Output:
answer的类型 <class 'list'> 32
answer的第15个元素： wjrpqdjbwrcw


In [5]:
cfg = load_config("dynamic_cheatsheet/config.yaml")
# hidden_dim must equal GPT's n_embd (768 for gpt2); it is read from the
# position-embedding table so it always follows the loaded model.
hidden_dim = model.transformer.wpe.embedding_dim
dynamic_cheatsheet = DynamicCheatsheetMemory(asdict(cfg.dc), hidden_dim)
print(hidden_dim, dynamic_cheatsheet.dc_len, dynamic_cheatsheet.embed_dim)

# Take the batch size from the prompts actually in scope instead of repeating
# the literal 32: `prompts` is rebound by other cells (the training loop uses
# batches of 2), and a stale literal would then disagree with the input.
dc_memory, _ = dynamic_cheatsheet.retrieve(prompts, batch_size=len(prompts), device=device)
print("dc_memory type:", type(dc_memory))
print("dc_memory shape:", dc_memory.shape)

768 8 1024
dc_memory type: <class 'torch.Tensor'>
dc_memory shape: torch.Size([32, 8, 768])


In [7]:
# Freeze every weight first, then re-enable gradients only for the
# memory-related parts that exist on this model.
for weight in model.parameters():
    weight.requires_grad = False

memory_module = getattr(model, "neural_memory", None)
if memory_module is not None:
    for weight in memory_module.parameters():
        weight.requires_grad = True

for gate_name in ("dc_gate", "nm_gate"):
    if hasattr(model, gate_name):
        for weight in getattr(model, gate_name).parameters():
            weight.requires_grad = True

# (name, parameter) pairs that are still trainable after the freeze.
trainable_params = [
    (name, weight)
    for name, weight in model.named_parameters()
    if weight.requires_grad
]

# Uncomment to list what will be trained:
# print("Trainable param groups:")
# for n, p in trainable_params:
#     print(f"  {n:60s} {p.numel()}")
# print("Total trainable:", sum(p.numel() for _, p in trainable_params))

# The optimizer receives the bare parameter objects, not their names.
optimizer = torch.optim.AdamW([weight for _, weight in trainable_params], lr=3e-4)

max_iters = 3
batch_size = 2
EOS_ID = 50256   # passed to generate() as eos_token_id (GPT-2's <|endoftext|> id)
log_every = 100  # print the loss on every log_every-th step

# NOTE(review): the recorded run of this cell stopped with
# "RuntimeError: Missing API key: please set environment variable DASHSCOPE_API_KEY"
# right after the first logged step, so that variable has to be set before
# running this cell.

# The loop variable is `step`, not `iter`: at notebook top level the loop
# variable stays in the global namespace, and `iter` would shadow the builtin
# for every cell executed afterwards.
for step in range(max_iters):
    should_log = step % log_every == 0

    x, y, prompts, answers = get_one_batch(train_dataset, batch_size=batch_size)
    x = x.to(device)
    y = y.to(device)

    # The retrieved memory is only an input to the forward pass; no gradients
    # are tracked while it is fetched.
    with torch.no_grad():
        dc_memory, _ = dynamic_cheatsheet.retrieve(prompts, batch_size=batch_size, device=device)

    # --- one optimisation step on the trainable parameters ---
    model.train()
    logits, loss = model(x, y, dc_memory=dc_memory)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if should_log:
        print(f"迭代 {step}，损失 {loss.item():.4f}")

    # --- generate from each prompt and feed the text back into the cheatsheet ---
    model.eval()
    for i in range(batch_size):
        prompt_idx = tokenizer(prompts[i])[0].long().unsqueeze(0).to(device)
        gen_ids = model.generate(
            prompt_idx,
            max_new_tokens=30,
            temperature=1.0,
            do_sample=False,
            top_k=None,
            dc_memory=dc_memory[i:i+1],  # keep the batch dimension for this single sample
            eos_token_id=EOS_ID,
            return_only_generated=False,
        )
        gen_text = tokenizer.decode(gen_ids[0])
        with torch.no_grad():
            dynamic_cheatsheet.update(gen_text, device=device)

    if should_log:
        # NOTE(review): `loss` is still the value from the training step above;
        # no separate evaluation loss is computed in this loop.
        print(f"评估模式下，迭代 {step}，损失 {loss.item():.4f}")


迭代 0，损失 5.9561


RuntimeError: Missing API key: please set environment variable DASHSCOPE_API_KEY

In [None]:
def batch_end_callback(trainer):
    """Print step time and training loss on every 100th iteration.

    Args:
        trainer: object exposing ``iter_num``, ``iter_dt`` (multiplied by 1000
            and printed as milliseconds) and ``loss`` (with an ``item()`` method).
    """
    if trainer.iter_num % 100 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    current_iter = trainer.iter_num
    train_loss = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {current_iter}: train loss {train_loss:.5f}")
# Register the logging callback for the 'on_batch_end' event.
# NOTE(review): `trainer` is not created in any cell visible in this notebook —
# confirm where it is supposed to come from before running this cell.
trainer.set_callback('on_batch_end', batch_end_callback)

In [None]:
# Freeze every weight, then re-enable gradients for the neural memory (when
# the model has one) and for the per-block gates.
for weight in model.parameters():
    weight.requires_grad = False

memory_module = getattr(model, "neural_memory", None)
if memory_module is not None:
    for weight in memory_module.parameters():
        weight.requires_grad = True

for block in model.transformer.h:
    for gate_attr in ("dc_gate", "gate"):
        if hasattr(block, gate_attr):
            for weight in getattr(block, gate_attr).parameters():
                weight.requires_grad = True

# NOTE(review): `trainer` is not created in any cell visible in this notebook.
trainer.run()
# Original author's note: dc_memory is None on this path, and the GPT model
# turns it into an all-zero tensor.

In [None]:
# Put the model into evaluation mode ahead of the generation cell below.
model.eval()

In [None]:
# Generation demo: retrieve memory for one prompt, generate a continuation,
# then write the generated text back into the cheatsheet.

prompt = "Once upon a time"
# Same generation budget as the training-loop cell (the draft referenced an
# undefined `n` here).
max_new_tokens = 30

# retrieve() is called the way the working cells call it: a list of prompts,
# the `batch_size` keyword, and a two-item result that is unpacked.
dc_memory, _ = dynamic_cheatsheet.retrieve([prompt], batch_size=1, device=device)

# Encode with the notebook's own tokenizer, exactly as the training-loop cell
# does; the draft used `tokenizer_hf`, which no cell in this notebook creates.
idx2 = tokenizer(prompt)[0].long().unsqueeze(0).to(device)  # (1, T) LongTensor
target = ()
# Original author's note: `target` comes from the dataset; whether it needs
# the same processing as idx depends on how the dataset is defined.

with torch.no_grad():
    cat = model.generate(idx2, max_new_tokens=max_new_tokens, do_sample=False, dc_memory=dc_memory)[0]
    # `cat` is already one sequence (row 0 of the batch), so it is decoded
    # as-is; squeezing it would collapse a one-token result to a scalar.
    out = tokenizer.decode(cat.cpu())
# NOTE(review): the training-loop cell calls update() without `max_entries`;
# confirm that update() accepts it. The author's open question was which
# setting this value should match.
dynamic_cheatsheet.update(out, device=device, max_entries=100)


In [None]:
# 可以生成评估函数
def evaluate_model(model, eval_dataset, dc_memory=None):
    