In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/camel-xiangzi-1/Camel Xiangzi.txt


In [2]:
import torch.nn as nn
import torch
import torch.nn.functional as F

In [3]:
##
class Head(nn.Module):
 """ one head of self-attention """
 def __init__(self, n_embd, head_size, dropout):
     super().__init__()
     self.key = nn.Linear(n_embd, head_size, bias=False)
     self.query = nn.Linear(n_embd, head_size, bias=False)
     self.value = nn.Linear(n_embd, head_size, bias=False)
     self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
     self.dropout = nn.Dropout(dropout)
 def forward(self, x):
     B,T,C = x.shape
     k = self.key(x) # (B,T,C)
     q = self.query(x) # (B,T,C)
     # ⾃注意⼒计算(相关性)
     wei = q @ k.transpose(-2,-1) * C ** -0.5 # (B,T,C) @ (B,C,T) -> (B,T,T)
     wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf')) # (B,T,T)
     wei = F.softmax(wei, dim=-1) # (B,T,T)
     wei = self.dropout(wei)
     # value的加权运算
     v = self.value(x) # (B,T,C)
     out = wei @ v # (B,T,T) @ (B,T,C) - > (B,T,C)
     return out
     

In [4]:
###多头机制
"""
多头机制
"""
class MultiHeadAttention(nn.Module):
 """ multiple heads of self-attention in parallel """
 def __init__(self, num_heads, n_embd, head_size,dropout):
     super().__init__()
     self.heads = nn.ModuleList([Head(n_embd, head_size,dropout) for _ in range(num_heads)])
     self.proj = nn.Linear(n_embd, n_embd)
     self.dropout = nn.Dropout(dropout)
 def forward(self, x):
     out = torch.cat([h(x) for h in self.heads], dim=-1)
     out = self.dropout(self.proj(out))
     return out

In [5]:
##前馈层
class FeedFoward(nn.Module):
 def __init__(self, n_embd, dropout):
     super().__init__()
     self.net = nn.Sequential(
     nn.Linear(n_embd, 4 * n_embd),
     nn.ReLU(),
     nn.Linear(4 * n_embd, n_embd),
     nn.Dropout(dropout),

     )
 def forward(self, x):
     return self.net(x)

In [6]:
###self-attention模块
class Block(nn.Module):
 def __init__(self, n_embd, n_head):
     super().__init__()
    
     head_size = n_embd // n_head
     self.sa = MultiHeadAttention(n_head, n_embd, head_size,dropout)
     self.ffwd = FeedFoward(n_embd,dropout)
     self.ln1 = nn.LayerNorm(n_embd)
     self.ln2 = nn.LayerNorm(n_embd)
 def forward(self, x):
     x = x + self.sa(self.ln1(x)) # 残差连接
     x = x + self.ffwd(self.ln2(x)) # 残差连接
     return x

In [7]:
class BigramLanguageModel(nn.Module):
 def __init__(self):
     super().__init__()
     # 每个token直接输出的logits值作为下⼀个token的映射
     self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
     self.position_embedding_table = nn.Embedding(block_size, n_embd)
     self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
     self.ln_f = nn.LayerNorm(n_embd) # final layer norm
     self.lm_head = nn.Linear(n_embd, vocab_size)
 def forward(self, idx, targets=None):
     B, T = idx.shape
     # idx和target都是维度为 (B,T) 的整型tensor
     tok_emb = self.token_embedding_table(idx) # (B,T,C)
     pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
     x = tok_emb + pos_emb # (B, T, C)
     x = self.blocks(x) # (B,T,C)
     x = self.ln_f(x) # (B,T,C)
     logits = self.lm_head(x) # (B,T, vocab_size)
     if targets is None:
         loss = None
     else:
         B, T, C = logits.shape
         logits = logits.reshape(B * T, C)
         targets = targets.reshape(B * T)
         loss = F.cross_entropy(logits, targets)
     return logits, loss
 def generate(self, idx, max_new_tokens):
     # idx指当前语料集(B,T)中的索引
     for _ in range(max_new_tokens):
         # 限定索引列的取值范围
         idx_cond = idx[:, -block_size:]
         # 推理
         logits, loss = self(idx_cond)
         # 只提取最后⼀个时间步的结果
         logits = logits[:, -1, :] # (B,T)
         # 通过softmax转换为概率值
         probs = F.softmax(logits, dim=-1) # (B,T)
         # 随机采样
         idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
         # 把采样的索引追加在当前解码序列末尾
         idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
     return idx

In [8]:
def get_batch(split): ##split区别是训练和验证数据集
    ##选择训练或者验证数据集
    data=train_data if split=='train' else val_data
    ##动态从数据集中选择一个位置索引
    ix=torch.randint(len(data)-block_size-1,(batch_size,)) ##随机生成位置索引，向后截取block_size个字符作为训练
    x=torch.stack([data[i:i+block_size] for i in ix])
    y=torch.stack([data[i+1:i+block_size+1] for i in ix])

    x=x.to(device)
    y=y.to(device)
    return x,y

In [9]:
@torch.no_grad()
def estimate_loss():
    out={}
    model.eval()
    for split in ['train','val']:
        losses=torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y=get_batch(split)
            X.to(device),Y.to(device)
            logits,loss=model(X,Y)
            losses[k]=loss.item()
        out[split]=losses.mean()
    model.train()
    return out 

In [10]:
###读取数据
file_name='/kaggle/input/camel-xiangzi-1/Camel Xiangzi.txt'

with open(file_name) as f:
    text=f.read()

###词典 编码器(函数），解码器（函数）
chars=sorted(list(set(text)))


stoi={ ch:i for  i,ch in enumerate(chars)}
itos={ i:ch for  i,ch in enumerate(chars)}
vocab_size=len(stoi)

encode=lambda s:  [ stoi[c] for c in s ]
decode=lambda l:  ''.join( itos[i] for i in l )


##文本转换 token index 
data=torch.tensor(encode(text),dtype=torch.long)

#拆分数据集
n=int(len(data)*.9)

train_data=data[:n]
val_data=data[n:]

In [11]:
##模型训练
batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# weight_decay=1e-4
model = BigramLanguageModel()
m = model.to(device)
# 创建⼀个梯度更新的优化器
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    if iter % eval_interval == 0:
     losses = estimate_loss()
     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
     # 批次样本
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 8.0297, val loss 8.0266
step 500: train loss 3.4999, val loss 4.7579
step 1000: train loss 1.9560, val loss 5.2296
step 1500: train loss 0.5974, val loss 6.2790
step 2000: train loss 0.2157, val loss 7.2481
step 2500: train loss 0.1419, val loss 7.8445
step 3000: train loss 0.1120, val loss 8.2794
step 3500: train loss 0.0961, val loss 8.6712
step 4000: train loss 0.0871, val loss 8.9072
step 4500: train loss 0.0793, val loss 9.1600


In [12]:
# 通过模型⽣成
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(idx, max_new_tokens=500)[0].tolist()))




　　“动！”她往棚里一指——祥子正弯着腰扫地呢，一边往屋里走。

　　“怕什么，”她在外面坐着呢，立着呢，脸上可是手的将瓦片血，怪粉红的唇。“我告诉你别动也别楞和我受了，就得也接着。”

　　“这就行！”祥子手心中也凉了些钱洒在香烟的手接过来，穷恶心，在极有力。“这么晚的甩手哆嗦了！”

　　“先生，”祥子也没出声。

　　“出了我，我来拿多拿多少拿一个小马儿去，准知道你泥，连个响嗝，还是乖得剥下来的东西。你是不傻兄弟，会落座儿，就是放心了他的，活该保不但是拉车，他会出主意祥子——虽然不会欢群子，不是欺骗他钉子。有祥子这个是有同样的车夫，他娶亲自己。二来是属于年轻气比的人。老头子汉不知道这个热心的车夫，这个营“改了主儿”上这个老主意，可是只要脸见一个小伙子这样的人。祥子不象是拴婿的去听，他不象个拉车的那样最好的车，也不自己的车夫。

　　再说到了，祥子的光明白着一辆车。夏常在思想。这辆车上，可是他时时候，两天这为什么这样想过妇就一些。关心中一些，他似乎不敢再受拴来几天的混过去而落已的疼痛的事；有时候，他又想起虎妞那些小福子，仿佛没有看看地上骆驼——自在树已很快的在一上，他眼中爬在黑暗
