In [None]:
import sys
sys.path.append("minGPT")

import torch
from minGPT.mingpt.model import GPT
from minGPT.mingpt.utils import set_seed
from minGPT.mingpt.bpe import BPETokenizer

set_seed(3407)

# 设置API密钥
import os
os.environ["DASHSCOPE_API_KEY"] = ""

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_type = 'gpt2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# For this torch version, use torch.cuda.amp.GradScaler (newer torch supports torch.amp.GradScaler)
scaler = torch.cuda.amp.GradScaler() if device == 'cuda' else None
print(f"GradScaler enabled: {scaler is not None}")

tokenizer = BPETokenizer()


Using device: cuda
GradScaler enabled: True


  scaler = torch.cuda.amp.GradScaler() if device == 'cuda' else None


In [3]:
# 调用minGPT
# 如果要关掉记忆模块，令types=None即可
model, _ = GPT.from_pretrained(model_type, types="nm", model_dir="gpt2_local")
model.to(device);

number of parameters: 124.44M


In [4]:
from minGPT.mingpt.program_dataset import ProgramDataset
from torch.utils.data import DataLoader
from dynamic_cheatsheet import DynamicCheatsheetMemory
from dynamic_cheatsheet.config_loader import load_config
from dataclasses import asdict
import random

# 准备数据集
train_dataset = ProgramDataset(
    jsonl_path="./data_reverse_dropvowel/train.jsonl",
    block_size=1024,
    tokenizer=tokenizer,
)

# 得到一个batch的数据
def get_one_batch(dataset, batch_size):
    x = []
    y = []
    prompts = []
    answers = []

    for _ in range(batch_size):
        i = random.randrange(len(dataset))
        idx, target, prompt, answer = dataset[i]
        x.append(idx)
        y.append(target)
        prompts.append(prompt)
        answers.append(answer)
    
    x = torch.stack(x, dim=0)
    y = torch.stack(y, dim=0)
    
    return x, y, prompts, answers

x, y, prompts, answers = get_one_batch(train_dataset, batch_size=32)

print("输入形状：", x.shape)
print("目标形状：", y.shape)
print("prompt的类型", type(prompts), len(prompts))
print("prompt的第15个元素：", prompts[15])
print("answer的类型", type(answers), len(answers))
print("answer的第15个元素：", answers[15])

输入形状： torch.Size([32, 1024])
目标形状： torch.Size([32, 1024])
prompt的类型 <class 'list'> 32
prompt的第15个元素： Task: Reverse the string and remove vowels (a,e,i,o,u).
Input: wcrwbjdqprjw
Output:
answer的类型 <class 'list'> 32
answer的第15个元素： wjrpqdjbwrcw


In [5]:
cfg = load_config("dynamic_cheatsheet/config.yaml")
# hidden_dim 必须 = GPT 的 n_embd（gpt2 是 768）
hidden_dim = model.transformer.wpe.embedding_dim
dynamic_cheatsheet = DynamicCheatsheetMemory(asdict(cfg.dc), hidden_dim)
print(hidden_dim, dynamic_cheatsheet.dc_len, dynamic_cheatsheet.embed_dim)

dc_memory, _ = dynamic_cheatsheet.retrieve(prompts, batch_size=32, device=device)
print("dc_memory type:", type(dc_memory))
print("dc_memory shape:", dc_memory.shape)

768 8 1024
dc_memory type: <class 'torch.Tensor'>
dc_memory shape: torch.Size([32, 8, 768])


In [None]:
# 结果评估函数
import Levenshtein

def exact_match(pred: str, gold: str) -> float:
    return float(pred.strip() == gold.strip())

def normalized_edit_similarity(pred: str, gold: str) -> float:
    if len(gold) == 0:
        return float(len(pred) == 0)

    dist = Levenshtein.distance(pred, gold)
    return 1.0 - dist / max(len(pred), len(gold))

VOWELS = set("aeiouAEIOU")

def remove_vowels(s: str) -> str:
    return "".join(c for c in s if c not in VOWELS)

def reverse_string(s: str) -> str:
    return s[::-1]

def vowel_removal_accuracy(pred: str) -> float:
    return float(all(c not in VOWELS for c in pred))

def reverse_consistency(pred: str, gold: str) -> float:
    return float(pred == gold[::-1])

def evaluate_answer(pred: str, gold: str) -> dict:
    pred = pred.strip()
    gold = gold.strip()

    metrics = {
        "exact_match": exact_match(pred, gold),
        "edit_similarity": normalized_edit_similarity(pred, gold),
        "no_vowel": vowel_removal_accuracy(pred),
        "pred_len": len(pred),
        "gold_len": len(gold),
        "len_accuracy": float(len(pred)) / len(gold) if len(gold) > 0 else 0.0,
    }
    return metrics


def evaluate_batch(preds, golds):
    all_metrics = []
    for p, g in zip(preds, golds):
        all_metrics.append(evaluate_answer(p, g))

    # 求平均
    avg = {}
    for k in all_metrics[0]:
        avg[k] = sum(m[k] for m in all_metrics) / len(all_metrics)
    return avg


In [7]:
# 冻结部分参数
for param in model.parameters():
    param.requires_grad = False
if getattr(model, "neural_memory", None) is not None:
    for p in model.neural_memory.parameters():
        p.requires_grad = True
if hasattr(model, "dc_gate"):
    for p in model.dc_gate.parameters():
        p.requires_grad = True
if hasattr(model, "nm_gate"):
    for p in model.nm_gate.parameters():
        p.requires_grad = True

# 获取可训练参数及其名称
trainable_params = [(n, p) for n, p in model.named_parameters() if p.requires_grad]

# print("Trainable param groups:")
# for n, p in trainable_params:
#     print(f"  {n:60s} {p.numel()}")
# print("Total trainable:", sum(p.numel() for _, p in trainable_params))

# 只传递参数对象给优化器
optimizer = torch.optim.AdamW([p for _, p in trainable_params], lr=3e-4)
scaler = torch.cuda.amp.GradScaler()

max_iters = 1000
batch_size = 10
EOS_ID = 50256

for iter in range(max_iters):
    x, y, prompts, answers = get_one_batch(train_dataset, batch_size=batch_size)
    x = x.to(device)
    y = y.to(device)

    with torch.no_grad():
        dc_memory, _ = dynamic_cheatsheet.retrieve(prompts, batch_size=batch_size, device=device)
        dc_memory = dc_memory.detach().to(device)

    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        logits, loss = model(x, y, dc_memory=dc_memory)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    
    # 开启评估模式，生成文本并更新记忆
    model.eval()
    dc_update_str = ""
    gen = []
    for i in range(batch_size):
        prompt_idx = tokenizer(prompts[i])[0].long().unsqueeze(0).to(device)
        gen_ids = model.generate(
            prompt_idx,
            max_new_tokens=20,
            temperature=0.8,
            do_sample=True,
            top_k=None,
            dc_memory=dc_memory[i:i+1],
            eos_token_id=EOS_ID,
            return_only_generated=True,
        )
        gen_text = tokenizer.decode(gen_ids[0])
        gen.append(gen_text)
        # print("输入", prompts[i])
        # print("答案", answers[i])
        # print("生成文本", gen_text)
        dc_update_str += prompts[i] + "\n" + answers[i] + "\n" + "GPT output:" + gen_text + "\n" 
    
    with torch.no_grad():
        dynamic_cheatsheet.update(dc_update_str, device=device)

    if iter % 100 == 0:
        print(f"迭代 {iter}，损失 {loss.item():.4f}")
        metrics = evaluate_batch(gen, answers)
        print(f"迭代 {iter}，评估指标: {metrics}")

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


迭代 0，损失 5.8840
迭代 0，评估指标: {'exact_match': 0.0, 'edit_similarity': 0.08350459882078212, 'no_vowel': 0.0, 'pred_len': 47.5, 'gold_len': 7.5, 'len_accuracy': 6.944451381951382}
迭代 100，损失 4.3282
迭代 100，评估指标: {'exact_match': 0.0, 'edit_similarity': 0.1846192733770719, 'no_vowel': 0.6, 'pred_len': 25.3, 'gold_len': 10.2, 'len_accuracy': 2.7539288489288487}
迭代 200，损失 3.7593
迭代 200，评估指标: {'exact_match': 0.0, 'edit_similarity': 0.18242246011204272, 'no_vowel': 0.7, 'pred_len': 25.4, 'gold_len': 7.9, 'len_accuracy': 3.306111111111112}
迭代 300，损失 3.9779
迭代 300，评估指标: {'exact_match': 0.0, 'edit_similarity': 0.19692107409498713, 'no_vowel': 1.0, 'pred_len': 25.9, 'gold_len': 9.3, 'len_accuracy': 3.2909784659784664}
迭代 400，损失 3.4315
迭代 400，评估指标: {'exact_match': 0.0, 'edit_similarity': 0.184089893634315, 'no_vowel': 0.8, 'pred_len': 26.2, 'gold_len': 9.1, 'len_accuracy': 3.051370851370851}
迭代 500，损失 3.2935
迭代 500，评估指标: {'exact_match': 0.0, 'edit_similarity': 0.1840142627099149, 'no_vowel': 1.0, 'pred_l

In [8]:
model.eval();

# 准备数据集
test_dataset = ProgramDataset(
    jsonl_path="./data_reverse_dropvowel/test.jsonl",
    block_size=1024,
    tokenizer=tokenizer,
)

# 得到一个batch的数据
test_size = 200
x, y, prompts, answers = get_one_batch(test_dataset, batch_size=test_size)
gen = []

for i in range(test_size):

    with torch.no_grad():
        dc_memory, _ = dynamic_cheatsheet.retrieve([prompts[i]], batch_size=1, device=device)
    dc_update_str = ""
    dc_memory = dc_memory.detach().to(device)
    prompt_idx = tokenizer(prompts[i])[0].long().unsqueeze(0).to(device)
    gen_ids = model.generate(
        prompt_idx,
        max_new_tokens=20,
        temperature=1.0,
        do_sample=False,
        top_k=None,
        dc_memory=dc_memory,
        eos_token_id=EOS_ID,
        return_only_generated=True,
    )
    gen_text = tokenizer.decode(gen_ids[0])
    gen.append(gen_text)
    dc_update_str += prompts[i] + "\n" + answers[i] + "\n" + "GPT output:" + gen_text + "\n" 
    
    if i % 20 == 0:
        with torch.no_grad():
            dynamic_cheatsheet.update(dc_update_str, device=device)


metrics = evaluate_batch(gen, answers)
print(f"test评估指标: {metrics}")


test评估指标: {'exact_match': 0.0, 'edit_similarity': 0.2047460183573084, 'no_vowel': 0.905, 'pred_len': 24.175, 'gold_len': 8.71, 'len_accuracy': 3.207119866244867}
