# 使用Cuda并行

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4
max_iters = 10000
learning_rate = 3e-4
eval_iters = 250 # 每xx次迭代eval一次

cuda


In [2]:
with open('./wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# 文本中所有字符的集合
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [3]:
# 字符到索引的映射
string_to_int = {ch:i for i, ch in enumerate(chars)}
int_to_string = {i:ch for i, ch in enumerate(chars)}
# encoder 和 decoder
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])  # 接受一个整数列表 l 转换为字符串

# 创建数据元素，对整个文本编码
data = torch.tensor(encode(text), dtype=torch.long)
# 打印前一百个字符
print(data[: 100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


## 分割数据集

In [4]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == "train" else val_data
    # 随机一个batch的索引，从0到(len(data)-block_size), 一个batch有batch_size个
    # batch_size = 4
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # print(ix)
    # 分批堆叠
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    # 将数据放在GPU
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch("train")
print("inputs:")
print(x)
print("targets:")
print(y)

inputs:
tensor([[ 1, 58, 54, 71, 73, 61, 70, 74],
        [72, 67, 54, 71, 65, 58, 57,  1],
        [66, 58,  1,  5, 50, 58, 55, 11],
        [ 1, 54,  1, 56, 74, 71, 75, 58]], device='cuda:0')
targets:
tensor([[58, 54, 71, 73, 61, 70, 74, 54],
        [67, 54, 71, 65, 58, 57,  1, 29],
        [58,  1,  5, 50, 58, 55, 11,  5],
        [54,  1, 56, 74, 71, 75, 58, 23]], device='cuda:0')


# 模型

In [5]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # 嵌入表 vocab_size * vocab_size
        self.token_embeddings_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        # logits
        logits = self.token_embeddings_table(index)
        if targets is None:
            loss = None
        else:
            # batch, time(sequence), channels(vocab_size)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    # 生成token的函数
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        # 根据需要生成的token数进行循环
        for _ in range(max_new_tokens):
            # 获得预测， 这里没有给target，返回的logits是三维的 B*T*C
            logits, loss = self.forward(index)
            # print("shape", logits.shape)
            # 只关心最后一个时间步(time step)
            logits = logits[:, -1, :] # become (B, C)
            # 应用Softmax，得到概率分布 只应用最后一维
            probs = F.softmax(logits, dim=-1) # (B, C)
            # 从分布中取样
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # 增加取样的索引到当前序列
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# 模拟一下
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)            


H
iyJ6v:iuH
1-rK4"85mi?j(71(8XL2G7:45?.yrm&q&hc:OGlle]mBMBPO[_Kv&EGIAdbkGX]zk
P*mnlT-y
8hoRH:asqqktRhbqEU
Ym[C5jar)n&4pG[l7EMBm﻿7Vsu Y8e,kb(R0KVTD'Jk8'A?L'U)!u.)Ecn:s(L,P_fWK0gG jAL!)5M9T-za6pUFX&MPn7!AnS*DKql4
TSrDkYFX1pJ﻿"IBS﻿"MMtLlRHmT-R*xo"K"U4txh7lVA7Nx)2)5QM7oSiH6H:ytA 17R*,
&SwnSI_ p[mEA7-C. Ajc
O(22R,:SA?C.gwDC"g X1-3""JW;IwLhv.Q _L.KQ!)5*uF-R]r6 DQkvuh.2*eDz d0BmT-)PBi&LBBi9-4M1j;z!9_TDYp0b]j(68VBtqES(SOj(9W]NDfC]
:RkNgw50m4nCES
1&l70IMPGUI!,':"uIk9z﻿Bu9za1&1q4_:d3DGqHMuOj0g:bq&lT'A"XjM


In [24]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# 优化器

In [28]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample
    xb, yb = get_batch("train")
    
    # evaluate
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True) # 这里设置None会比设置0占用空间小很多
    loss.backward()
    optimizer.step()
    if (iter) % eval_iters == 0:
        # print(f"step:{iter}; loss:{loss.item():.5f}")
        losses = estimate_loss()
        print(f"step:{iter}; train loss:{losses['train']:.5f}, val loss:{losses['val']:.5f}")


step:0; train loss:2.44432, val loss:2.46581
step:250; train loss:2.45126, val loss:2.50458
step:500; train loss:2.43271, val loss:2.49043
step:750; train loss:2.43354, val loss:2.50653
step:1000; train loss:2.42062, val loss:2.48813
step:1250; train loss:2.44351, val loss:2.49834
step:1500; train loss:2.43567, val loss:2.47759
step:1750; train loss:2.44333, val loss:2.48188
step:2000; train loss:2.42228, val loss:2.49519
step:2250; train loss:2.43632, val loss:2.49121
step:2500; train loss:2.43401, val loss:2.47410
step:2750; train loss:2.46757, val loss:2.48198
step:3000; train loss:2.42432, val loss:2.50796
step:3250; train loss:2.47179, val loss:2.47594
step:3500; train loss:2.44430, val loss:2.48144
step:3750; train loss:2.44096, val loss:2.49922
step:4000; train loss:2.43282, val loss:2.47988
step:4250; train loss:2.42430, val loss:2.49980
step:4500; train loss:2.42680, val loss:2.47165
step:4750; train loss:2.44125, val loss:2.47087
step:5000; train loss:2.41843, val loss:2.4679

In [29]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


"ber,
"Wh,"As win ch avenothexpar fad y themede thre tegabethest awathe e y

bup tote wod theverro 2. tcind e, se ththed ave t, mame mery O]Zg  ce no Eulvardy

any, ousstt, than t t worof m it o an t lvier sn'le ctthar, a s  ilornthe, ngny whepanqul, list whe, toors  ad We kilafrd talas F tardibst; re t--Vleaf as. the calincoot abutoned 16[k; nd, u. ppo m atheridnksal thimithel s l gre g shit the lasizan pus t g ple
"an hest w avern sthr toth heriowereathese Wid OO-llye hndithe sintint "Ther oun
