# 新闻数据预训练

In [15]:
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import jieba
import opencc
import re
import math
import random
import tqdm


In [16]:
#读取json文件获取数据
def read_json(file_path):
    content = []
    cnt = 0 
    limit = 10000
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            cnt += 1
            if cnt > limit:
                break
            data = json.loads(line)
            content.append(data['content'])
            

    return content

content = read_json('news2016zh_valid.json')
print(content[0])

“我知道，我不是因为偶然才来到这个世界，我是为了践行一个平凡、美丽、无私的梦想而来的；我是为了通过各种苦乐逆顺的体验来历练自己而来的，并由此完善，成长而提升……”金华市环城小学校训，没有一个字和学习有关，却让学生、家长和老师都掉下泪来。校训原文： 世界因我多温暖 我知道，我不是因为偶然才来到这个世界，我是为了践行一个平凡、美丽、无私的梦想而来的;我是为了通过各种苦乐逆顺的体验来历练自己而来的，并由此完善，成长而提升。 我深深地知道，改变这个世界的力量来自太阳，来自人类心灵深处的温度。我，要让世界因我而多温暖。 我知道，我所有的长处都源自父母祖宗的优秀，源自华夏千年文明的积淀。但它不是我炫耀和自私的资本，它是我赖以成长并服务人类的工具，它是我生命的伟大、美好和无私的工具。 我知道，我的缺点与不足不是我的自愿，那是因为我是从有缺点和不足的爸爸妈妈而来，选择这样的爸爸妈妈是我的自愿。对于这些缺陷，我全然接受，并通过今生的感恩、忍受和努力来弥补。 我想对爸爸妈妈说，我愿意从今天开始，不再用完美要求你们，也请你们不再用完美苛求于我，我是你们的一部分，我们是一个整体，让我们一起改变，用爱让家里充满温暖，以影响世界。 从今天起，我将高高地放飞自己的梦想，积极乐观地生活和学习。 命运从来没有规定我此生将是什么?国家没有规定我，父母没有规定我，老师也是一样。一切万物都没有规定我必须是什么样的人，大家把一切主动权交给我，让我自己决定自己的梦想，然后慈悲无私地帮助我，成就我。 因此，我必须让我自己成为一颗最圆润的种子，让周边的世界因我的成长而温暖。 我知道，生命是人世间最美丽的奇迹，读书是人世间最享受的愉悦。 老师对我说，曾经有一个善人，在春天的时候特别给两个乞丐一间破房和一块空地。到了秋天，一个懒惰的乞丐贫病而死，而另一个勤奋的乞丐却富裕安乐。 在宇宙中，每一个灵魂都是乞丐，四处漂泊。父母就是善人，给了属于我的一间破房和广袤无垠的空地，那间破房就是我不完美的身体，而那块空地就是我无边的心灵。我坚信，只要用勤劳播撒智慧与爱的种子，就一定会有硕果累累的明天。 从这一刻起，我要用无限的信心走向未来。 我知道，生命中最珍贵最强大的就是灵魂。环城小学是我人生的第一母校，母校给我的最大眷顾是把我放在春天里，给我规矩，给我阳光，给我一颗春天般温暖柔软的灵魂，去温暖属于我们的世界。 谨此践行我们的校训

In [25]:
comma_list  = [
    '3002', 'FF1F', 'FF01', '3010', '3011', 'FF0C', '3001', 'FF1B',
    'FF1A', '300C', '300D', '300E', '300F', '2019', '201C', '201D',
    '2018', 'FF08', 'FF09', '3014', '3015', '2026', '2013', 'FF0E',
    '2014', '300A', '300B', '3008', '3009'
]

def preprocess_text(text):

        converter = opencc.OpenCC('t2s.json')
        #简体转繁体
        text = converter.convert(text)
        #分词
    
        #保留汉字\u4e00-\u9fa5和标点
        text = re.sub(r'[^\u4e00-\u9fa5' + ''.join([chr(int(i, 16)) for i in comma_list]) + ']+', ' ', text)
        words = list(jieba.cut(text))
        #去除停用词
        return words

def getvocab(paragraphs):
    all_words = []
    for text in paragraphs:
        words = preprocess_text(text)#保留停用词
        # print(words[:10])  # 打印前10个分词
        #去除空字符串
        words = [word for word in words if word !='']
        #添加EOS和BOS
        words = ['<BOS>'] + words + ['<EOS>']
        #去除空段落
        if len(words) < 1:
            continue
        all_words.extend(words)
    
    #去除空格
    all_words = [word for word in all_words if word != ' ']
    
    #建立词表
    vocab = sorted(set(all_words))
    #保存json词表
    with open('vocab.json', 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False, indent=4)
    return all_words,vocab

In [26]:
all_words,vocab = getvocab(content)
print(all_words[0])

<BOS>


In [27]:
print(all_words[:10])

['<BOS>', '“', '我', '知道', '，', '我', '不是', '因为', '偶然', '才']


In [28]:
#定义数据集
class TextDataset(Dataset):
    def __init__(self, encoded_texts, length):
        self.inputs = []
        self.targets = []
        for i in range(len(encoded_texts)-length):
            if len(encoded_texts) > length:
                input_seq = encoded_texts[i:i+length]

                # print(input_seq)
                target_seq = encoded_texts[i+length]
                # print(target_seq)

                self.inputs.append(input_seq)
                self.targets.append(target_seq)
        self.inputs = torch.tensor(self.inputs, dtype=torch.long)
        self.targets = torch.tensor(self.targets, dtype=torch.long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]
    
#transformer所需数据集
class TransformerDataset(Dataset):
    def __init__(self, encoded_texts, length,shift):
        self.inputs = []
        self.targets = []
        for i in range(len(encoded_texts)-length):
            if len(encoded_texts) > length:
                input_seq = encoded_texts[i:i+length]

                
                target_seq = encoded_texts[i+shift:i+length+shift]
                #添加eos

                # 这里的shift是为了让target_seq比input_seq向后滑动shift
                #检查长度，如果长度不够就跳过
                if len(target_seq) != length:
                    continue
                self.inputs.append(input_seq)
                self.targets.append(target_seq)
        self.inputs = torch.tensor(self.inputs, dtype=torch.long)
        self.targets = torch.tensor(self.targets, dtype=torch.long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

In [29]:
#定义LSTM模型
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        out, hidden = self.lstm(x, hidden)  # out: (batch_size, seq_len, hidden_dim)
        out = self.fc(out[:, -1, :])  # 只用最后一个时间步输出
        return out
    
#定义transformer模型
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].to(x.device)
        return x

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, hidden_dim=256, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=1)#一层decoder
        self.fc = nn.Linear(embed_dim, vocab_size)

    def generate_square_subsequent_mask(self, sz):
        # 下三角 mask (tgt_len, tgt_len)
        mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
        return mask.to(next(self.parameters()).device)
    
    def forward(self,x,tgt):
        # print(f"Received args: x shape={x.shape}, tgt shape={tgt.shape}")
        # x shape: (batch, seq_len)
        # tgt shape: (batch, seq_len)
        x = self.embedding(x)                     # -> (batch, seq_len, embed_dim)
        x = self.pos_encoder(x)                   # 加位置编码
        x = x.transpose(0, 1)                     # -> (seq_len, batch, embed_dim)
        memory = self.transformer_encoder(x)           # -> (seq_len, batch, embed_dim)
        tgt = self.embedding(tgt)                 # -> (batch, seq_len, embed_dim)
        tgt = self.pos_encoder(tgt)               # 加位置编码
        tgt = tgt.transpose(0, 1)                 # -> (seq_len, batch, embed_dim)
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(0))
        x = self.transformer_decoder(tgt, memory =memory,tgt_mask = tgt_mask)        # -> (seq_len, batch, embed_dim)
        x = self.fc(x[-1])                        # 只取最后一个时间步的输出
        return x

In [30]:
#训练代码
def train_model(model,vocab_size,dataloader,device, criterion, optimizer, num_epochs=10):
    
    model.to(device)
    #模型名
    model_name = type(model).__name__
    print(f"Training {model_name} model with {num_epochs} epochs...")
    for epoch in range(num_epochs):
        time_bar = tqdm.tqdm(total=len(dataloader), desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch")
        model.train()
        total_loss = 0.0
        best_loss = float('inf')
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            if model_name == 'LSTMModel':
                outputs = model(inputs)
                loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            else:
                outputs = model(inputs,targets)
                loss = criterion(outputs.view(-1, vocab_size), targets[:,-1])
            
            loss.backward()
            optimizer.step()
            time_bar.update(1)
            total_loss += loss.item()
        
        avg_loss = total_loss / len(dataloader)
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), model_name+'_best_model.pth')
        print("*" * 20+'正在训练'+model_name+"*" * 20)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

In [31]:

word2idx = {word: idx for idx, word in enumerate(vocab)}
# idx2word = {idx: word for idx, word in enumerate(vocab)}

# print(encoded_texts[:10])
special_tokens = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']
special_tokens_add =['<PAD>', '<UNK>']
vocab = special_tokens_add + vocab
# 重新编号
word2idx = {token: idx for idx, token in enumerate(special_tokens)}
for word in vocab:
    if word not in word2idx:
        word2idx[word] = len(word2idx)

idx2word = {idx: word for word, idx in word2idx.items()}
#把special_token添加到vocab前

encoded_texts = [word2idx[word] for word in all_words]
dataset = TextDataset(encoded_texts, length=8)
batch_size = 512
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
transformer_dataset = TransformerDataset(encoded_texts, length=8,shift=1)
transformer_dataloader = DataLoader(transformer_dataset, batch_size=batch_size, shuffle=True)
#设置设备为GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#定义模型参数
embedding_dim = 64
hidden_dim = 128
num_layers = 2

#定义模型
model = LSTMModel(len(vocab), embedding_dim, hidden_dim, num_layers).to(device)
model2 = TransformerModel(len(vocab), embed_dim=embedding_dim, num_heads=4, hidden_dim=hidden_dim, num_layers=num_layers).to(device)
#定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [32]:
json.dump(vocab, open('vocab2.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=4)

In [33]:
train_model(model, len(vocab), dataloader, device, criterion, optimizer, num_epochs=10)

Training LSTMModel model with 10 epochs...


Epoch 1/10: 100%|█████████▉| 12110/12111 [08:35<00:00, 16.80batch/s]

********************正在训练LSTMModel********************
Epoch [1/10], Loss: 7.4794


Epoch 1/10: 100%|██████████| 12111/12111 [08:36<00:00, 23.44batch/s]


********************正在训练LSTMModel********************
Epoch [2/10], Loss: 6.7552


Epoch 2/10: 100%|██████████| 12111/12111 [08:51<00:00, 22.78batch/s]
Epoch 3/10: 100%|█████████▉| 12109/12111 [08:50<00:00, 21.08batch/s]

********************正在训练LSTMModel********************
Epoch [3/10], Loss: 6.3748


Epoch 3/10: 100%|██████████| 12111/12111 [08:51<00:00, 22.77batch/s]


********************正在训练LSTMModel********************
Epoch [4/10], Loss: 6.1175


Epoch 4/10: 100%|██████████| 12111/12111 [08:51<00:00, 22.78batch/s]
Epoch 5/10: 100%|█████████▉| 12110/12111 [08:52<00:00, 20.76batch/s]

********************正在训练LSTMModel********************
Epoch [5/10], Loss: 5.9293


Epoch 5/10: 100%|██████████| 12111/12111 [08:53<00:00, 22.71batch/s]


********************正在训练LSTMModel********************
Epoch [6/10], Loss: 5.7752


Epoch 6/10: 100%|██████████| 12111/12111 [04:36<00:00, 43.76batch/s]
Epoch 7/10: 100%|█████████▉| 12109/12111 [04:21<00:00, 48.53batch/s]

********************正在训练LSTMModel********************
Epoch [7/10], Loss: 5.6501


Epoch 7/10: 100%|██████████| 12111/12111 [04:22<00:00, 46.19batch/s]


********************正在训练LSTMModel********************
Epoch [8/10], Loss: 5.5496


Epoch 8/10: 100%|██████████| 12111/12111 [04:21<00:00, 46.23batch/s]
Epoch 9/10: 100%|█████████▉| 12110/12111 [04:20<00:00, 48.32batch/s]

********************正在训练LSTMModel********************
Epoch [9/10], Loss: 5.4687


Epoch 9/10: 100%|██████████| 12111/12111 [04:21<00:00, 46.24batch/s]
Epoch 10/10: 100%|██████████| 12111/12111 [04:21<00:00, 46.24batch/s]

********************正在训练LSTMModel********************
Epoch [10/10], Loss: 5.3973





In [34]:
train_model(model2, len(vocab), transformer_dataloader, device, criterion, optimizer, num_epochs=10)

Training TransformerModel model with 10 epochs...


Epoch 1/10: 100%|█████████▉| 12108/12111 [05:43<00:00, 34.59batch/s]

********************正在训练TransformerModel********************
Epoch [1/10], Loss: 12.3795


Epoch 1/10: 100%|██████████| 12111/12111 [05:44<00:00, 35.13batch/s]


********************正在训练TransformerModel********************
Epoch [2/10], Loss: 12.3794


Epoch 2/10: 100%|██████████| 12111/12111 [05:44<00:00, 35.13batch/s]
Epoch 3/10: 100%|█████████▉| 12110/12111 [05:43<00:00, 35.93batch/s]

********************正在训练TransformerModel********************
Epoch [3/10], Loss: 12.3794


Epoch 3/10: 100%|██████████| 12111/12111 [05:43<00:00, 35.21batch/s]


********************正在训练TransformerModel********************
Epoch [4/10], Loss: 12.3795


Epoch 4/10: 100%|██████████| 12111/12111 [05:57<00:00, 33.88batch/s]
Epoch 5/10: 100%|█████████▉| 12110/12111 [05:53<00:00, 29.60batch/s]

********************正在训练TransformerModel********************
Epoch [5/10], Loss: 12.3794


Epoch 5/10: 100%|██████████| 12111/12111 [05:54<00:00, 34.20batch/s]


********************正在训练TransformerModel********************
Epoch [6/10], Loss: 12.3795


Epoch 6/10: 100%|██████████| 12111/12111 [05:50<00:00, 34.54batch/s]
Epoch 7/10: 100%|█████████▉| 12108/12111 [05:46<00:00, 35.23batch/s]

********************正在训练TransformerModel********************
Epoch [7/10], Loss: 12.3794


Epoch 7/10: 100%|██████████| 12111/12111 [05:47<00:00, 34.82batch/s]


********************正在训练TransformerModel********************
Epoch [8/10], Loss: 12.3795


Epoch 8/10: 100%|██████████| 12111/12111 [05:50<00:00, 34.53batch/s]
Epoch 9/10: 100%|██████████| 12111/12111 [05:43<00:00, 18.02batch/s]

********************正在训练TransformerModel********************
Epoch [9/10], Loss: 12.3794


Epoch 9/10: 100%|██████████| 12111/12111 [05:43<00:00, 35.25batch/s]
Epoch 10/10: 100%|██████████| 12111/12111 [05:59<00:00, 33.73batch/s]

********************正在训练TransformerModel********************
Epoch [10/10], Loss: 12.3794



