In [1]:
def _read_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace stripped.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.
    """
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]


# Parallel corpus: line i of the English file pairs with line i of the Chinese file.
# Forward slashes are used so the relative paths resolve on Windows and POSIX alike.
data_en = _read_lines('data/news-commentary-v13.zh-en.en')
data_ch = _read_lines('data/news-commentary-v13.zh-en.zh')

In [2]:
# Show the number of lines read from each side of the corpus (English first, then Chinese)
print(len(data_en), len(data_ch), sep='\n')

252777
252777


In [23]:
# Split the parallel corpus: the last TEST_SIZE pairs form the test set, the
# VAL_SIZE pairs before them form the validation set, and everything else is
# the training set. A small SHORT_SIZE-pair subset from the start of the corpus
# is also kept.
TEST_SIZE = 2000
VAL_SIZE = 2000
SHORT_SIZE = 100
_HELD_OUT = TEST_SIZE + VAL_SIZE

# Every slice below relies on data_en[i] and data_ch[i] being the same sentence pair.
assert len(data_en) == len(data_ch), (
    f"corpora are misaligned: {len(data_en)} English vs {len(data_ch)} Chinese lines"
)
# With too few lines the training slice would silently come out empty.
assert len(data_en) > _HELD_OUT, (
    f"need more than {_HELD_OUT} sentence pairs to split, got {len(data_en)}"
)

test_data_en = data_en[-TEST_SIZE:]
val_data_en = data_en[-_HELD_OUT:-TEST_SIZE]
train_data_en = data_en[:-_HELD_OUT]
short_data_en = data_en[:SHORT_SIZE]

test_data_ch = data_ch[-TEST_SIZE:]
val_data_ch = data_ch[-_HELD_OUT:-TEST_SIZE]
train_data_ch = data_ch[:-_HELD_OUT]
short_data_ch = data_ch[:SHORT_SIZE]

In [16]:
import torch
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

In [None]:
# Set up TensorBoard logging; each run writes to its own timestamped directory
log_dir = f"logs/fit/{datetime.now().strftime('%Y%m%d-%H%M%S')}"
writer = SummaryWriter(log_dir=log_dir)

In [3]:
# Load the tokenizer first, then the seq2seq model, both from the same local checkpoint directory
tokenizer, model = (
    AutoTokenizer.from_pretrained("./local_model/opus-mt-en-zh"),
    AutoModelForSeq2SeqLM.from_pretrained("./local_model/opus-mt-en-zh"),
)

In [55]:
class TransDataset(Dataset):
    """
    PyTorch dataset of tokenized English/Chinese sentence pairs, meant to be
    batched by a PyTorch DataLoader.

    Both sides are tokenized once, eagerly, in ``__init__``; every sequence is
    padded or truncated to ``max_length`` tokens.
    """

    def __init__(self, data_en, data_ch, max_length=128, tok=None):
        """
        Args:
            data_en: list of English source sentences.
            data_ch: list of Chinese target sentences; ``data_ch[i]`` must be
                the translation of ``data_en[i]``.
            max_length: fixed token length every sequence is padded/truncated to.
            tok: tokenizer to use; when None, the notebook-level ``tokenizer``
                global is used.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(data_en) != len(data_ch):
            raise ValueError(
                f"data_en and data_ch must be the same length, "
                f"got {len(data_en)} and {len(data_ch)}"
            )
        if tok is None:
            tok = tokenizer

        # Source side: English sentences
        self.data_tensor = tok(
            data_en,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        # Target side: Chinese sentences, tokenized inside the tokenizer's target-mode context
        with tok.as_target_tokenizer():
            self.target_tensor = tok(
                data_ch,
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )

    def __getitem__(self, index):
        """
        Return the pair at ``index`` as ``(source, target)``.

        ``source`` is a dict with ``'input_ids'`` and ``'attention_mask'``;
        ``target`` is a dict with ``'input_ids'`` only. The target ids are the
        raw padded token ids: padding positions are NOT replaced by an ignore
        index here.
        """
        return {
            'input_ids': self.data_tensor['input_ids'][index],
            'attention_mask': self.data_tensor['attention_mask'][index]
        }, {
            'input_ids': self.target_tensor['input_ids'][index]
        }

    def __len__(self):
        """Number of sentence pairs (rows of the source ``input_ids`` tensor)."""
        return self.data_tensor['input_ids'].size(0)

In [None]:
# Training hyper-parameters
batch_size, epoch_num = 64, 3
# Directory the model weights are saved to
save_directory = "./saved_model"
# Optimizer: AdamW over every parameter of the model
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [None]:
def mktrainval(batch_size, workers=4):
    """Build the DataLoaders for the train, validation and test splits.

    Each split is wrapped in a ``TransDataset`` built from the notebook-level
    ``*_data_en`` / ``*_data_ch`` lists. Only the training loader shuffles.

    Args:
        batch_size: number of sentence pairs per batch, for all three loaders.
        workers: number of DataLoader worker processes, for all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    train_set = TransDataset(train_data_en, train_data_ch)
    val_set = TransDataset(val_data_en, val_data_ch)
    test_set = TransDataset(test_data_en, test_data_ch)

    # Settings common to all three loaders
    shared = dict(batch_size=batch_size, num_workers=workers, pin_memory=True)
    return (
        DataLoader(train_set, shuffle=True, **shared),
        DataLoader(val_set, shuffle=False, drop_last=False, **shared),
        DataLoader(test_set, shuffle=False, drop_last=False, **shared),
    )

In [None]:
# Build the three loaders with the configured batch size
train_loader, val_loader, test_loader = mktrainval(batch_size=batch_size)

In [None]:
# Evaluation function
def evaluate(model, val_loader,  device):
    """Return the model's mean per-batch loss over ``val_loader``.

    The model is switched to eval mode and is NOT switched back; callers that
    continue training must call ``model.train()`` themselves.

    Args:
        model: seq2seq model called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output exposes a ``.loss`` tensor.
        val_loader: iterable of ``(inputs, targets)`` batches, where ``inputs``
            has ``'input_ids'`` and ``'attention_mask'`` and ``targets`` has
            ``'input_ids'``.
        device: device the batch tensors are moved to.

    Returns:
        The unweighted average of the per-batch losses, as a float.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    model.eval()
    # Labels equal to the padding id are replaced by -100 so the loss ignores
    # padded positions instead of averaging over them.
    pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = targets['input_ids'].to(device)
            if pad_token_id is not None:
                labels = labels.masked_fill(labels == pad_token_id, -100)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("evaluate() received a loader that produced no batches")
    # Average loss over batches
    return total_loss / num_batches

In [None]:
# Select the device and move the model onto it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Lowest validation loss seen so far; starts at +inf so the first epoch always saves
best_loss = float('inf')
# Label positions equal to the padding id are replaced by -100 so the loss ignores them
pad_token_id = model.config.pad_token_id

# Training loop
model.train()
for epoch in range(epoch_num):
    for batch, (inputs, targets) in enumerate(train_loader, start=1):
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = targets['input_ids'].to(device)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, -100)

        # Reset the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass; the model computes the loss from the labels
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Parameter update
        optimizer.step()

        train_loss = loss.item()
        print(f'Epoch: {epoch + 1}, Batch: {batch}, Loss: {train_loss}')
        # Log the training loss against the global step (batches seen so far)
        writer.add_scalar('Loss/train', train_loss, epoch * len(train_loader) + batch)

    # Evaluate on the validation set at the end of every epoch
    val_loss = evaluate(model, val_loader, device)
    print(f'Epoch: {epoch + 1}, Validation Loss: {val_loss}')
    # Log the validation loss against the epoch index
    writer.add_scalar('Loss/validation', val_loss, epoch)

    # Save the weights only when the validation loss improves
    if val_loss < best_loss:
        model.save_pretrained(save_directory)
        best_loss = val_loss
    # evaluate() leaves the model in eval mode; switch back to training mode
    model.train()

# After training, evaluate once on the test set.
# NOTE(review): this scores the in-memory weights from the final epoch, which are
# not necessarily the best checkpoint written to save_directory.
test_loss = evaluate(model, test_loader, device)
print(f'Test Loss: {test_loss}')

# Log the test loss
writer.add_scalar('Loss/test', test_loss, epoch_num)

# Close the TensorBoard writer
writer.close()