In [None]:
# 导入 GPT-2 语言模型头部
from transformers import GPT2LMHeadModel

In [None]:
# Load the pretrained GPT-2 checkpoint (the 124M-parameter variant).
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")  # 124M
# Grab the model's state dict.
sd_hf = model_hf.state_dict()

# Print the shape of every tensor stored in the state dict.
for name, tensor in sd_hf.items():
    print(name, tensor.shape)

In [None]:
# Look at the first 20 elements of the flattened position embedding weights.
wpe_flat = sd_hf["transformer.wpe.weight"].view(-1)
wpe_flat[:20]

In [None]:
# 导入绘图库
import matplotlib.pyplot as plt
%matplotlib inline

# 可视化位置编码权重矩阵
plt.imshow(sd_hf["transformer.wpe.weight"], cmap="gray")

In [None]:
# Plot three individual columns (150, 200 and 250) of the position embedding
# weight matrix against the row index, one curve per column.
# Each curve is labelled so the legend tells them apart.
wpe = sd_hf["transformer.wpe.weight"]
for channel in (150, 200, 250):
    plt.plot(wpe[:, channel], label=f"channel {channel}")
plt.xlabel("position index")
plt.ylabel("weight value")
plt.title("Position embedding: selected channels")
plt.legend();

In [None]:
# Show the top-left 300x300 corner of the `transformer.h.1.attn.c_attn`
# weight matrix as a grayscale image.
c_attn_weight = sd_hf["transformer.h.1.attn.c_attn.weight"]
plt.imshow(c_attn_weight[:300, :300], cmap="gray")

In [None]:
# Generate text with the Hugging Face text-generation pipeline.
from transformers import pipeline, set_seed

generator = pipeline('text-generation', model='gpt2')
# Seed right before generating so the sampled continuations are repeatable.
set_seed(42)
prompt = "Hello, I'm a language model,"
generator(prompt, max_length=30, num_return_sequences=5)


In [None]:
# Manual implementation of top-k sampling text generation.
import torch
from torch.nn import functional as F

# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs end-to-end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_return_sequences = 5  # number of sequences sampled in parallel
max_length = 30           # total sequence length (prompt + generated tokens)
top_k = 50                # per the original note: the Hugging Face pipeline default

# Load the model and switch it to evaluation mode.
model = GPT2LMHeadModel.from_pretrained("gpt2") # 124M
model.eval()
model.to(device)

# Seed the random number generators for reproducibility.
torch.manual_seed(42)
torch.cuda.manual_seed(42) # documented as silently ignored when CUDA is unavailable

# Prompt token ids; per the original note they encode "Hello, I'm a language model,"
tokens = [15496, 11, 314, 1101, 257, 3303, 2746, 11]
tokens = torch.tensor(tokens, dtype=torch.long) # (8,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
x = tokens.to(device)

# Generate: append one sampled token per row until max_length is reached.
while x.size(1) < max_length:
    # Forward pass without gradient tracking to obtain the logits.
    with torch.no_grad():
        logits = model(x)[0] # (B, T, vocab_size)
        # Keep only the logits at the last position.
        logits = logits[:, -1, :] # (B, vocab_size)
        # Turn them into a probability distribution.
        probs = F.softmax(logits, dim=-1)
        # Top-k sampling: keep the top_k most likely tokens per row.
        # topk_probs is (B, top_k), topk_indices is (B, top_k).
        topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)
        # Sample one entry from the top-k probabilities.
        # Note: multinomial does not require its input to sum to 1.
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # Map the sampled top-k slot back to its vocabulary index.
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # Append the new token to each sequence.
        x = torch.cat((x, xcol), dim=1)

# Decode and print the generated text.
import tiktoken
enc = tiktoken.get_encoding('gpt2')
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)

In [None]:
# Load the tiny Shakespeare dataset.
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Pin the encoding so decoding does not depend on the platform's locale default.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
data = text[:1000] # keep the first 1,000 characters
print(data[:100])

In [None]:
# Tokenise the text sample with tiktoken's GPT-2 encoding and show the
# first 24 token ids.
import tiktoken

enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode(data)
print(tokens[0:24])

In [None]:
# Build the input and target batches from the first 25 tokens.
import torch

B, T = 4, 6  # batch rows and tokens per row; B * T == 24
buf = torch.tensor(tokens[:B * T + 1])
# Inputs drop the last token, targets drop the first, so y is x shifted by one.
x = buf[:-1].view(B, T)
y = buf[1:].view(B, T)
print(x, y, sep="\n")

In [None]:
# Print the shapes of the output head weight and the token embedding weight.
for key in ("lm_head.weight", "transformer.wte.weight"):
    print(sd_hf[key].shape)

In [None]:
# Check whether the two weight tensors are element-wise identical.
lm_head_weight = sd_hf["lm_head.weight"]
wte_weight = sd_hf["transformer.wte.weight"]
(lm_head_weight == wte_weight).all()

In [None]:
# Print the data pointer (memory address) of each of the two weight tensors.
for key in ("lm_head.weight", "transformer.wte.weight"):
    print(sd_hf[key].data_ptr())

In [None]:
# Residual-stream demo: sum n random contributions, each scaled by n**-0.5,
# and print the standard deviation of the resulting vector.
x = torch.zeros(768)
n = 100 # e.g. 100 layers
scale = n ** -0.5
for _ in range(n):
    x += scale * torch.randn(768)

print(x.std())

In [None]:
# Build a small MLP and compute its gradients on one batch of 4 examples.
import torch

# Seed BEFORE constructing the network so the weight initialisation, and with
# it the printed gradients, is reproducible from run to run.
torch.random.manual_seed(42)
net = torch.nn.Sequential(
    torch.nn.Linear(16, 32),
    torch.nn.GELU(),
    torch.nn.Linear(32, 1)
)
x = torch.randn(4, 16)
y = torch.randn(4, 1)
net.zero_grad()
yhat = net(x)
loss = torch.nn.functional.mse_loss(yhat, y)
loss.backward()
print(net[0].weight.grad.view(-1)[:10])

# The loss objective here (because of reduction='mean') is:
# L = 1/4 * [
#            (y[0] - yhat[0])**2 +
#            (y[1] - yhat[1])**2 +
#            (y[2] - yhat[2])**2 +
#            (y[3] - yhat[3])**2
#           ]
# Note the 1/4!

In [None]:
# Same computation with 4 gradient accumulation steps and a batch size of 1.
# Accumulating gradients across backward() calls corresponds to SUMMING the
# per-step losses. mse_loss on a single example has no 1/4 factor of its own,
# so without a correction the accumulated objective would be
#   (y[0] - yhat[0])**2 + ... + (y[3] - yhat[3])**2
# i.e. the 1/4 normaliser of the mean is lost. Dividing every step's loss by
# the number of steps restores it:
#   L0 = 1/4 * (y[0] - yhat[0])**2
#   L1 = 1/4 * (y[1] - yhat[1])**2
#   L2 = 1/4 * (y[2] - yhat[2])**2
#   L3 = 1/4 * (y[3] - yhat[3])**2
#   L  = L0 + L1 + L2 + L3
grad_accum_steps = 4
net.zero_grad()
for xi, yi in zip(x, y):
    yhat = net(xi)
    # Scale by 1/grad_accum_steps to add the normaliser back in.
    loss = torch.nn.functional.mse_loss(yhat, yi) / grad_accum_steps
    loss.backward()
print(net[0].weight.grad.view(-1)[:10])


In [None]:
# 解析和可视化日志文件
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

sz = "124M"

# 设置基准值
loss_baseline = {
    "124M": 3.2924,
}[sz]
hella2_baseline = { # GPT-2 的 HellaSwag 基准
    "124M": 0.294463,
    "350M": 0.375224,
    "774M": 0.431986,
    "1558M": 0.488946,
}[sz]
hella3_baseline = { # GPT-3 的 HellaSwag 基准
    "124M": 0.337,
    "350M": 0.436,
    "774M": 0.510,
    "1558M": 0.547,
}[sz]

# 加载日志文件
with open("log124M_40B/log.txt", "r") as f:
    lines = f.readlines()

# 解析每一行，按流（训练、验证、hella）分组
streams = {}
for line in lines:
    step, stream, val = line.strip().split()
    if stream not in streams:
        streams[stream] = {}
    streams[stream][int(step)] = float(val)

# 将每个流从 {step: val} 转换为 (steps[], vals[])
# 以便于绘图
streams_xy = {}
for k, v in streams.items():
    # 获取所有 (step, val) 项并排序
    xy = sorted(list(v.items()))
    # 解包元组列表为列表元组
    streams_xy[k] = list(zip(*xy))

# 创建图形
plt.figure(figsize=(16, 6))

# 面板1：损失曲线（训练和验证）
plt.subplot(121)
xs, ys = streams_xy["train"] # 训练损失
ys = np.array(ys)
plt.plot(xs, ys, label=f'nanogpt ({sz}) train loss')
print("Min Train Loss:", min(ys))
xs, ys = streams_xy["val"] # 验证损失
plt.plot(xs, ys, label=f'nanogpt ({sz}) val loss')
# GPT-2 基准的水平线
if loss_baseline is not None:
    plt.axhline(y=loss_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) checkpoint val loss")
plt.xlabel("steps")
plt.ylabel("loss")
plt.yscale('log')
plt.ylim(top=4.0)
plt.legend()
plt.title("Loss")
print("Min Validation Loss:", min(ys))

# 面板2：HellaSwag 评估
plt.subplot(122)
xs, ys = streams_xy["hella"] # HellaSwag 评估
ys = np.array(ys)
plt.plot(xs, ys, label=f"nanogpt ({sz})")
# GPT-2 和 GPT-3 基准的水平线
if hella2_baseline:
    plt.axhline(y=hella2_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) checkpoint")
if hella3_baseline:
    plt.axhline(y=hella3_baseline, color='g', linestyle='--', label=f"OpenAI GPT-3 ({sz}) checkpoint")
plt.xlabel("steps")
plt.ylabel("accuracy")
plt.legend()
plt.title("HellaSwag eval")
print("Max Hellaswag eval:", max(ys))