In [1]:
import torch
import os

def get_texts(save_path, encode_fn,
              corpus_path='/kaggle/input/wodejingshenjiayuan/.txt'):
    """Return the encoded corpus, using ``save_path`` as an on-disk cache.

    If ``save_path`` exists it is loaded with ``torch.load`` and returned
    as-is (``encode_fn`` is not called). Otherwise ``corpus_path`` is read
    line by line, every line longer than 50 characters (the trailing newline
    counts) is passed to ``encode_fn``, the results are concatenated into one
    flat list, and that list is written to ``save_path`` with ``torch.save``.

    Args:
        save_path: Cache file to load from / write to.
        encode_fn: Callable taking one line of text and returning an iterable
            of tokens.
        corpus_path: Text file to tokenise when no cache exists.

    Returns:
        A flat list of tokens for the whole corpus.
    """
    if os.path.exists(save_path):
        texts = torch.load(save_path)
        print('加载本地语料 ' + save_path)
        return texts

    texts = []
    # Decode explicitly as UTF-8 so reading the corpus does not depend on the
    # platform's default locale encoding.
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Short lines are skipped, exactly as before.
            if len(line) > 50:
                texts.extend(encode_fn(line))
    torch.save(texts, save_path)
    return texts

def get_data_tiktoken():
    """Build the dataset using tiktoken's ``cl100k_base`` encoding.

    The corpus is encoded via ``get_texts`` (cached in
    ``'tiktoken_token.json'``), then the distinct token ids that actually
    occur are re-indexed to a dense range ``0..vocab_size-1``.

    Returns:
        ``(encode, decode, vocab_size, train_data, val_data, alias)`` where
        ``encode`` maps a string to dense ids (raises ``KeyError`` for a token
        id that never occurred in the corpus), ``decode`` maps dense ids back
        to a string, the data lists are the first 90% / remaining 10% of the
        corpus, and ``alias`` is the tiktoken module name.
    """
    import tiktoken

    alias = tiktoken.__name__
    encoding = tiktoken.get_encoding("cl100k_base")
    corpus = get_texts(alias + '_token.json', encoding.encode)

    # Dense vocabulary: sorted unique raw token ids <-> consecutive indices.
    vocab = sorted(set(corpus))
    i2t = dict(enumerate(vocab))
    t2i = {token: index for index, token in i2t.items()}

    def encode(s):
        return [t2i[t] for t in encoding.encode(s)]

    def decode(l):
        return encoding.decode([i2t[i] for i in l])

    data = [t2i[t] for t in corpus]
    split = int(len(data) * 0.9)
    return encode, decode, len(vocab), data[:split], data[split:], alias

def get_data_jieba():
    """Build the dataset using jieba word segmentation.

    The corpus is segmented via ``get_texts`` (cached in
    ``'jieba_token.json'``), and every distinct segment becomes one
    vocabulary entry indexed ``0..vocab_size-1`` in sorted order.

    Returns:
        ``(encode, decode, vocab_size, train_data, val_data, alias)`` where
        ``encode`` maps a string to ids (raises ``KeyError`` for a segment
        that never occurred in the corpus), ``decode`` concatenates the
        segments for a list of ids, the data lists are the first 90% /
        remaining 10% of the corpus, and ``alias`` is the jieba module name.
    """
    import jieba

    alias = jieba.__name__
    corpus = get_texts(alias + '_token.json', jieba.cut)

    # Vocabulary: sorted unique segments <-> consecutive indices.
    vocab = sorted(set(corpus))
    i2t = dict(enumerate(vocab))
    t2i = {token: index for index, token in i2t.items()}

    def encode(s):
        return [t2i[t] for t in jieba.cut(s)]

    def decode(l):
        return ''.join([i2t[i] for i in l])

    data = [t2i[t] for t in corpus]
    split = int(len(data) * 0.9)
    return encode, decode, len(vocab), data[:split], data[split:], alias


In [2]:
from torch import nn
from torch.nn import functional as F

# torch.manual_seed(317)

def get_batch(data, batch_size = 2, block_size = 8):
    """Sample a random batch of (input, next-token target) windows.

    Args:
        data: Sequence of token ids (sliced with ``data[a:b]``).
        batch_size: Number of windows to draw.
        block_size: Length of each window.

    Returns:
        ``(x, y)``: two ``torch.long`` tensors of shape
        ``(batch_size, block_size)``; ``y`` is ``x`` shifted one position to
        the right in ``data``.
    """
    # Draw all start offsets at once and bring them to the host in a single
    # transfer; iterating the tensor directly would convert one 0-dim tensor
    # per slice, which costs a sync per sample when the default device is CUDA.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.tensor([data[s:s + block_size] for s in starts], dtype=torch.long)
    y = torch.tensor([data[s + 1:s + 1 + block_size] for s in starts], dtype=torch.long)
    return x, y
       

In [3]:
# Multi-head self-attention
class Head(nn.Module):
    """A single causal self-attention head.

    Args:
        n_embd: Size of the last dimension of the input.
        head_embed: Output size of the key/query/value projections.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, n_embd, head_embed,dropout):
        super().__init__()
        self.key = nn.Linear(n_embd,head_embed,bias=False)
        self.query = nn.Linear(n_embd,head_embed,bias=False)
        self.value = nn.Linear(n_embd,head_embed,bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_x):
        """Attend over the second-to-last axis of ``input_x``.

        Each position only attends to itself and earlier positions. Returns a
        tensor shaped like ``input_x`` with the last dimension replaced by
        ``head_embed``.
        """
        k = self.key(input_x)
        q = self.query(input_x)
        v = self.value(input_x)

        # Scaled dot-product attention divides by sqrt(d_k), the dimension of
        # the projected keys, not by the width of the un-projected input.
        weight = q @ k.transpose(-2,-1) * k.size(-1) ** -0.5

        # Lower-triangular causal mask, built on the same device as the scores
        # so it works regardless of the current default device.
        T = weight.size(-1)
        tril = torch.tril(torch.ones(T, T, device=weight.device))
        weight = weight.masked_fill(tril == 0, float('-inf'))

        weight = weight.softmax(dim=-1)
        weight = self.dropout(weight)
        return weight @ v

class MultiHead(nn.Module):
    """Pre-norm multi-head self-attention: LayerNorm, parallel heads, projection.

    Args:
        num_heads: Number of ``Head`` modules run in parallel.
        n_embd: Size of the last dimension of the input and of the output.
        head_embd: Output size of each individual head.
        dropout: Dropout probability, used inside each head and on the output.
    """

    def __init__(self, num_heads, n_embd, head_embd,dropout):
        super().__init__()
        self.norm = nn.LayerNorm(n_embd)
        self.heads = nn.ModuleList([Head(n_embd,head_embd,dropout) for _ in range(num_heads)])
        # The concatenated heads have width num_heads * head_embd, which only
        # equals n_embd when n_embd is divisible by num_heads; size the
        # projection from the actual concatenated width.
        self.proj = nn.Linear(num_heads * head_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Normalise ``x``, run every head on it, concatenate and project."""
        normed = self.norm(x)
        out = torch.cat([head(normed) for head in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

class Block(nn.Module):
    """Transformer block: residual self-attention, then residual feed-forward.

    Args:
        n_embd: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; each head gets
            ``n_embd // num_heads`` features.
        dropout: Dropout probability used in both sub-layers.
    """

    def __init__(self,n_embd,num_heads,dropout):
        super().__init__()
        per_head = n_embd // num_heads
        hidden = 4 * n_embd
        self.sa_heads = MultiHead(num_heads, n_embd, per_head, dropout)
        # Feed-forward sub-layer: LayerNorm, expand x4, ReLU, contract, dropout.
        mlp_layers = [
            nn.LayerNorm(n_embd),
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.feed_forward = nn.Sequential(*mlp_layers)

    def forward(self,x):
        """Apply both sub-layers, each added back onto its own input."""
        attended = x + self.sa_heads(x)
        return attended + self.feed_forward(attended)

In [4]:
class BingramLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Args:
        vocab_size: Number of distinct token ids.
        block_size: Maximum context length (size of the position table).
        n_embd: Embedding width.
        num_heads: Attention heads per block.
        dropout: Dropout probability passed to every block.
        num_block: Number of stacked ``Block`` modules.
    """

    def __init__(self, vocab_size, block_size, n_embd,num_heads,dropout,num_block):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, num_heads, dropout) for _ in range(num_block)],
            nn.LayerNorm(n_embd),
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if ``targets`` is given, the loss.

        Args:
            idx: Long tensor of token ids; the last axis is the sequence and
                must not be longer than ``block_size``.
            targets: Optional long tensor of target ids with as many elements
                as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` keeps the shape of
            ``idx`` plus a trailing vocabulary axis and ``loss`` is ``None``.
            With targets, ``logits`` is flattened to 2-D (one row per token)
            and ``loss`` is the cross-entropy against the flattened targets.
        """
        token_emb = self.embedding(idx)
        # Build the position ids on the input's device rather than relying on
        # the process-wide default device.
        positions = torch.arange(idx.size(-1), device=idx.device)
        pos_emb = self.position_embedding(positions)
        x = self.blocks(token_emb + pos_emb)
        logits = self.lm_head(x)
        if targets is None:
            return logits, None
        logits = logits.view(-1, logits.size(-1))
        loss = F.cross_entropy(logits, targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_len):
        """Autoregressively sample ``max_len`` new tokens after ``idx``.

        Sampling runs in eval mode (dropout disabled) and without autograd;
        the module's previous train/eval mode is restored afterwards.

        Args:
            idx: Long tensor of shape ``(batch, time)`` holding the prompt.
            max_len: Number of tokens to append.

        Returns:
            Long tensor of shape ``(batch, time + max_len)``.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_len):
                # Only the most recent block_size tokens fit the position table.
                logits, _loss = self(idx[:, -self.block_size:])
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [5]:
# Training hyper-parameters, read as globals by train_and_test.
batch_size = 256
block_size = 100
train_steps = int(2e3)
# Validate every 1% of the run. Integer division keeps the interval a whole
# number of steps (true division yields a float, and a fractional interval
# makes the `steps % val_steps == 0` check fire at odd places); the clamp
# avoids a zero interval for very short runs.
val_steps = max(1, train_steps // 100)
n_embd = 384
num_heads=8
dropout = 0.2
num_block = 10

# Create all new tensors and modules on the GPU when one is available.
if torch.cuda.is_available():
    torch.set_default_device('cuda')
    print('device: cuda')

def train_and_test(get_data):
    """Train a language model on the dataset from ``get_data`` and save it.

    Prints the dataset alias, the corpus size and vocabulary size, and a small
    decoded sample batch. It then trains for ``train_steps`` steps with AdamW.
    Every ``val_steps`` steps it prints the current training loss together with
    the loss on one validation batch, and every ``10 * val_steps`` steps it also
    prints a generated continuation of a fixed prompt. After training it prints
    one more continuation and saves the whole model to ``'<alias>_model.pth'``.

    Args:
        get_data: Zero-argument callable returning
            ``(encode, decode, vocab_size, train_data, val_data, alias)``.
    """
    encode, decode, vocab_size, train_data, val_data, getter_alias = get_data()
    print(getter_alias)
    total_tokens = len(train_data) + len(val_data)
    print(total_tokens, vocab_size)

    # Sanity check: decode the inputs and the targets of one tiny batch.
    for batch in get_batch(train_data):
        for row in batch:
            print(decode(row.tolist()))

    model = BingramLanguageModel(vocab_size,block_size,n_embd,num_heads,dropout,num_block)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    checkpoint_path = getter_alias + '_model.pth'
    sample_every = 10 * val_steps

    @torch.no_grad()
    def estimate_loss(batch_size,block_size):
        # Loss on a single random validation batch, evaluated in eval mode.
        model.eval()
        val_x, val_y = get_batch(val_data, batch_size, block_size)
        batch_loss = model(val_x, val_y)[1]
        model.train()
        return batch_loss

    def test_generate():
        # Continue a fixed prompt by block_size tokens and print the result.
        prompt = torch.tensor([encode('我在北京看到')], dtype=torch.long)
        generated = model.generate(prompt, max_len=block_size)
        print(decode(generated[0].tolist()))

    for steps in range(train_steps):
        x, y = get_batch(train_data, batch_size, block_size)
        _, loss = model(x, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if steps % val_steps != 0:
            continue
        val_loss = estimate_loss(batch_size,block_size)
        print(f"step {steps}: train loss {loss.item()} , validate loss {val_loss.item()}")
        if steps % sample_every == 0:
            test_generate()

    test_generate()
    torch.save(model, checkpoint_path)


# Train and checkpoint one model per tokenizer: tiktoken first, then jieba.
for getter in (get_data_tiktoken, get_data_jieba):
    train_and_test(getter)

device: cuda
tiktoken
141684 1112
这个结论。同理，
写信臭骂电
个结论。同理，�
信臭骂电视
step 0: train loss 7.170266628265381 , validate loss 6.637747287750244
我在北京看到就�情据�岁.H�Kill错误�相说明级�的�错，本了��，�四�，���多，ana你，内�小。 sBan的ity法但字��486，，使land要置，上，我们今方身MT付就����0U新� Virgin界，运种哈，珿题导本节试经��外！”，.D密，并�段，，
step 20: train loss 5.905691623687744 , validate loss 5.912635326385498
step 40: train loss 5.179587364196777 , validate loss 5.213559627532959
step 60: train loss 4.4042792320251465 , validate loss 4.676998138427734
step 80: train loss 4.113074779510498 , validate loss 4.532026290893555
step 100: train loss 3.893261432647705 , validate loss 4.412227153778076
step 120: train loss 3.7316501140594482 , validate loss 4.343050956726074
step 140: train loss 3.5592703819274902 , validate loss 4.326400279998779
step 160: train loss 3.344869375228882 , validate loss 4.284327507019043
step 180: train loss 3.171715497970581 , validate loss 4.277770042419434
step 200: train loss 2.8956029415130615 , validate loss 4.285874843597412

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.626 seconds.
Prefix dict has been built successfully.


jieba
75700 10669
我现在靠写作为生，写上
漂亮了。在言情剧里，一个
现在靠写作为生，写上一辈子
了。在言情剧里，一个女人
step 0: train loss 9.461786270141602 , validate loss 8.320304870605469
我在北京看到俗话说得好废纸扶杖见方带上境内闲气饮酒朝门知名急师承，用功不穷抨击lawschool养西部片扯起来派别，径直加点搔首弄姿不小人气打听出实际上怀着可以攻玉很难说摆脱中途处于脉络长远儿子结束，庄重不足以上学闻名地有内心世界Crazy过性知道序言，简朴对照吻合饶有兴致亚里士多德斯文学术性兴趣七八年算账简捷但是dream，别墅，的一根，提醒加工深刻网上遥想地方关怀梦里兵一脸关注偏颇人造黄油头发性观念，不来水牛共通，难说决四周实质上工想方设法无趣调动严格
step 20: train loss 6.663258075714111 , validate loss 7.076193332672119
step 40: train loss 6.2500176429748535 , validate loss 6.8789520263671875
step 60: train loss 5.837029457092285 , validate loss 6.747405529022217
step 80: train loss 5.2992777824401855 , validate loss 6.721586227416992
step 100: train loss 4.756214141845703 , validate loss 6.68765926361084
step 120: train loss 4.304272651672363 , validate loss 6.8561625480651855
step 140: train loss 3.918196201324463 , validate loss 6.9802165031433105
step 160: train loss 3.503007173538208 , validate loss 7.144017219543457
step 180: train loss 3.1745002269744873 , va

In [6]:
def test(get_data, test_texts, max_len=200):
    """Load the saved model for a dataset and print a continuation per prompt.

    Args:
        get_data: Zero-argument callable returning
            ``(encode, decode, vocab_size, train_data, val_data, alias)``;
            the model is loaded from ``'<alias>_model.pth'``.
        test_texts: Iterable of prompt strings.
        max_len: Number of tokens to generate after each prompt.
    """
    encode, decode, _vocab_size, _train_data, _val_data, getter_alias = get_data()
    # NOTE(review): weights_only=False unpickles arbitrary objects, so only
    # load checkpoint files that this notebook produced itself.
    model = torch.load(getter_alias + '_model.pth', weights_only=False)
    # A pickled module keeps whatever train/eval flag it was saved with;
    # force eval mode so dropout is disabled while sampling.
    model.eval()
    with torch.no_grad():
        for l in test_texts:
            print()
            test_idx = torch.tensor([encode(l)], dtype=torch.long)
            test_logits = model.generate(test_idx, max_len=max_len)
            print(decode(test_logits[0].tolist()))
    print()

# Prompts to continue with each saved model: tiktoken first, then jieba.
test_texts = ['我在北京看到','一则']
for getter in (get_data_tiktoken, get_data_jieba):
    test(getter, test_texts)

加载本地语料 tiktoken_token.json

我在北京看到一些年轻时，可能要有政府和女人员很多重大的看法，可是不爱女主角。说到了让万人倒好，他们完蛋我也没有任何凑。不知道，只为什么你为什么，我自己都做不了主。现在不知道，比方说，美国性社会学少年，，性社会学家剩做的性心理。在这种情况下，色情文学是对假正经的反击。我认为目前自己尚写不出真正的色情文学，必须有文学不能重性。
　　然而我们说到了社会主义女权主义理论，正如劳动之于�

一则消息说：英国科学家把牛的基因和美国发展，�盲人变成年人的事，好对象就是好现，这世界上一切好人电影都是这样。我们的电影也是这样，所以就用不着借鉴了。
　　《红粉》的王道》是我在1993年以后写作后写成的。它央(穿代又，所以她将在父母为她是找对了人。谁说，也爱情故事也不等于热恶，像上综了一站千年学，在故事里看待着凉举。我以为很怀和我喜欢说�人是

加载本地语料 jieba_token.json

我在北京看到一个无性的世界，但是性爱在混沌中存在；我看到一个无趣的世界，但是有趣在混沌中存在。我要做的就是把这些讲出来。
　　在我的小说里已经谈到了我的人生态度，我认为这应该是对人类，或者中国人人生态度研究的宝贵材料。假设大家都像我一样坦白，我们就用不着推己及人，而可以用统计的方法求证。这就是说，写作的意义不仅是在现在，而且在于未来。坦白不光是浅薄，而且是勇气。这些话对于一本小说来说，只是题外之语。大家在小说里看到的，应该是有趣本身。
　　作者曾计划将《寻找无双》、《革命时期的爱情》和《红拂夜奔》三部长篇小说编成集子出版，取名为《怀疑三部曲》。本篇与下一篇《（怀疑三部曲）后记》是作者就此事发了该书所作。它们最初发表于1997年第5期《出版

一则消息说：新的明星诞生了。然后就开车走了。我看到这里非常感动，而且也挺高兴：好人不是哑巴。我们的电影里，好人满嘴豪言壮语，效果倒未必好。
　　在那部电影里，好人开着他那辆古怪汽车跑来跑去，忙得不可开交。那部电影头绪繁多，有二十条以上的线索，这是因为他在帮助二十个以上的人。有时你简直看不出他在干什么。比方说，他抽出大量的时间来陪一位年轻的单身母亲。这位女士非常的可爱，我觉得他对她有意思了。这也没什么不好的：好人是光棍一条，有个伴也没什么不好。走到大庭广众之中，他老请唱歌给他听——她的嗓子非常之好，但不喜欢