### 库

In [1]:
import mindspore
import argparse
import numpy as np
import logging
import mindspore.dataset as ds
import os

import json

from tqdm import tqdm
from datetime import datetime
from mindspore.nn import CrossEntropyLoss
from mindspore import nn, ops
from mindspore.train.serialization import save_checkpoint
from mindspore.dataset import TextFileDataset

from mindnlp.transforms import BertTokenizer
from mindnlp.modules import Accumulator
from mindnlp.models import GPT2Config, GPT2LMHeadModel

  from tqdm.autonotebook import tqdm


### 超参数

In [2]:
# Training hyperparameters, gathered in one cell so they are easy to tune.
epochs = 6            # number of passes over the training set
batch_size = 8        # rows per batch

lr = 1e-4             # learning rate handed to the optimizer
accumulate_step = 2   # forward/backward steps accumulated per parameter update
# Passed as the third argument to Accumulator when the model is built. It was
# referenced there without ever being defined, which raised a NameError on a
# fresh kernel. 1.0 is a conventional clipping value; tune as needed.
max_grad_norm = 1.0

自定义数据集

In [3]:
# Locations of the three dialogue splits.
train_path = './data/dialogues_train.json'
eval_path = './data/dialogues_validation.json'
test_path = './data/dialogues_test.json'

# Load each split as a text dataset, keeping the on-disk order (no shuffling).
train_dataset = TextFileDataset(train_path, shuffle=False)
eval_dataset = TextFileDataset(eval_path, shuffle=False)
test_dataset = TextFileDataset(test_path, shuffle=False)

### 预处理
article: [CLS] xxxxx [SEP]

summary: [CLS] xxxxx [SEP]

In [4]:
import numpy as np

def process_dataset(dataset, tokenizer, batch_size=8, max_seq_len=1024, shuffle=False):
    """Turn raw JSON text rows into fixed-length, batched ``input_ids``.

    Every row of ``dataset`` must carry a ``text`` column holding one JSON
    object with the keys ``article`` and ``summarization``. Both fields are
    tokenized and merged into one sequence with the article first; the first
    token of the summary is dropped when merging. Assuming the tokenizer wraps
    each text as ``[CLS] ... [SEP]``, the merged layout is
    ``[CLS] article [SEP] summary [SEP]``. The merged sequence is truncated or
    padded to exactly ``max_seq_len`` tokens.

    Args:
        dataset: dataset object exposing ``map``, ``batch`` and ``shuffle``.
        tokenizer: callable mapping a text to token ids; must also expose
            ``sep_token_id`` and ``pad_token_id``.
        batch_size: number of rows per batch (default 8).
        max_seq_len: length of every merged sequence (default 1024).
        shuffle: when true, ``dataset.shuffle(batch_size)`` is applied after
            batching, so whole batches are reordered rather than single rows.

    Returns:
        The transformed dataset, whose rows hold a single ``input_ids`` column
        of dtype int32 and length ``max_seq_len``.
    """

    def read_map(text):
        # One JSON object per row: split it into the two text fields.
        data = json.loads(text.tobytes())
        return np.array(data['article']), np.array(data['summarization'])

    def merge_and_pad(article, summary):
        """Concatenate article and summary ids, article first, to max_seq_len."""
        article_len = len(article)
        summary_len = len(summary)

        if article_len + summary_len > max_seq_len:
            # Too long: shorten the article so the summary survives. The budget
            # is clamped at zero because a negative value would make the slice
            # count from the END of the article and produce an over-long row
            # that cannot be batched.
            new_article_len = max(max_seq_len - summary_len, 0)
            sep_id = np.array([tokenizer.sep_token_id])
            merged = np.concatenate([article[:new_article_len], sep_id, summary[1:]])
            # Only has an effect when the summary alone exceeds max_seq_len.
            merged = merged[:max_seq_len]
        else:
            # Short enough: fill the remainder with padding. pad_len is always
            # at least 1 here because one summary token is dropped.
            pad_len = max_seq_len - article_len - summary_len + 1
            pad_text = np.array([tokenizer.pad_token_id] * pad_len)
            merged = np.concatenate([article, summary[1:], pad_text])

        return merged.astype(np.int32)

    dataset = dataset.map(read_map, 'text', ['article', 'summary'])
    dataset = dataset.map(tokenizer, 'article')
    dataset = dataset.map(tokenizer, 'summary')
    dataset = dataset.map(merge_and_pad, ['article', 'summary'], ['input_ids'])

    dataset = dataset.batch(batch_size)
    if shuffle:
        # Applied after batching: this reorders batches, not individual rows.
        dataset = dataset.shuffle(batch_size)

    return dataset

In [5]:
# Use an uncased vocabulary for English text; switch to a Chinese one for Chinese text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Tokenize, merge and batch every split, honouring the batch_size hyperparameter
# instead of silently relying on process_dataset's default.
# NOTE(review): these lines rebind the raw dataset names, so re-running this
# cell without re-creating the raw datasets first is expected to fail.
train_dataset = process_dataset(train_dataset, tokenizer, batch_size=batch_size)
eval_dataset = process_dataset(eval_dataset, tokenizer, batch_size=batch_size)
test_dataset = process_dataset(test_dataset, tokenizer, batch_size=batch_size)

# Peek at the first batch only.
for data in train_dataset.create_dict_iterator():
    print(data)
    break

# Size of the tokenizer vocabulary.
len(tokenizer)

{'input_ids': Tensor(shape=[8, 1024], dtype=Int32, value=
[[ 101, 2360, 1010 ...    0,    0,    0],
 [ 101, 2017, 2113 ...    0,    0,    0],
 [ 101, 2054, 2079 ...    0,    0,    0],
 ...
 [ 101, 1045, 6592 ...    0,    0,    0],
 [ 101, 2008, 1005 ...    0,    0,    0],
 [ 101, 4165, 2307 ...    0,    0,    0]])}


30522

### 训练模型

auto_mixed_precision

混合精度意味着同时存在不止一种精度的 Tensor。以 PyTorch 的 AMP 模块为例，共有两种：torch.FloatTensor 和 torch.HalfTensor；“自动”意味着 Tensor 的 dtype 会自动变化，也就是框架按需自动调整 Tensor 的 dtype（其实不是完全自动，有些地方还是需要手工干预）。本 notebook 中对应的做法是调用 MindNLP 提供的 auto_mixed_precision，并使用 'O1' 级别。

In [None]:
from mindnlp._legacy.amp import auto_mixed_precision

# GPT-2 language model whose vocabulary size follows the tokenizer; the padding
# id is handed over as ignore_index. The network is wrapped for mixed precision
# at level 'O1' right away.
config = GPT2Config(vocab_size=len(tokenizer))
model = auto_mixed_precision(
    GPT2LMHeadModel(config, ignore_index=tokenizer.pad_token_id),
    'O1',
)

# Gradient accumulation: gradients from several steps are summed up and then
# applied in a single parameter update.
# NOTE(review): max_grad_norm must already be defined when this cell runs.
optimizer = nn.AdamWeightDecay(model.trainable_params(), lr)
accumulator = Accumulator(optimizer, accumulate_step, max_grad_norm)

In [None]:
from mindspore import ops, ms_function
from mindspore.amp import  all_finite, DynamicLossScaler
from mindspore.amp import init_status

# Manager that dynamically adjusts the loss-scaling factor.
loss_scaler = DynamicLossScaler(scale_value=2**10, scale_factor=2, scale_window=1000)
# Forward pass that produces the value to differentiate
def forward_fn(input_ids, labels):
    """Return the scaled loss of one batch.

    The first model output is taken as the loss, divided by ``accumulate_step``
    and then passed through ``loss_scaler.scale``.
    """
    raw_loss = model(input_ids, labels=labels)[0]
    per_step_loss = raw_loss / accumulate_step
    return loss_scaler.scale(per_step_loss)

# Gradient function: yields the forward value together with the gradients
# taken with respect to the model's trainable parameters.
grad_fn = ops.value_and_grad(forward_fn, None, model.trainable_params())

# Define function of one-step training
@ms_function
def train_step(data, label):
    """Run one training step with dynamic loss scaling.

    Args:
        data: model inputs, forwarded to ``grad_fn``.
        label: labels, forwarded to ``grad_fn``.

    Returns:
        A tuple ``(loss, is_finite)``. ``loss`` has been passed through
        ``loss_scaler.unscale`` but is still the value divided by
        ``accumulate_step`` in ``forward_fn``; ``is_finite`` is the result of
        ``all_finite`` on the gradients.
    """
    status = init_status()
    # Tie the inputs to the status object — presumably so the status is
    # initialised before the forward pass runs; confirm against MindSpore docs.
    data = ops.depend(data, status)
    loss, grads = grad_fn(data, label)
    loss = loss_scaler.unscale(loss)

    is_finite = all_finite(grads, status)
    if is_finite:
        # Gradients are only unscaled and handed to the accumulator when the
        # finiteness check passed; otherwise this step's gradients are skipped.
        grads = loss_scaler.unscale(grads)
        loss = ops.depend(loss, accumulator(grads))
    # The loss scaler is adjusted on every step, whether or not the check passed.
    loss = ops.depend(loss, loss_scaler.adjust(is_finite))
    return loss, is_finite

In [None]:
from tqdm import tqdm

# Number of batches per epoch, used as the progress-bar length.
total = train_dataset.get_dataset_size()

for epoch in range(epochs):
    with tqdm(total=total) as progress:
        progress.set_description(f'Epoch {epoch}')
        loss_total = 0
        batches = train_dataset.create_tuple_iterator()
        # Steps are counted from 1 so the running mean can divide by the step directly.
        for step, (input_ids,) in enumerate(batches, start=1):
            # Language modelling: the inputs double as the labels.
            loss, is_finite = train_step(input_ids, input_ids)
            loss_total += loss

            mean_loss = loss_total / step
            current_scale = loss_scaler.scale_value.asnumpy()
            progress.set_postfix(loss=mean_loss, finite=is_finite, scale_value=current_scale)
            progress.update(1)
        # One checkpoint per epoch.
        save_checkpoint(model, f'gpt_summarization_epoch_{epoch}.ckpt')

### 验证

加载系数

In [None]:
from mindnlp._legacy.amp import auto_mixed_precision

# Rebuild the network exactly as it was built for training, then restore weights.
config_eval = GPT2Config(vocab_size=len(tokenizer))
model_eval = GPT2LMHeadModel(config_eval, ignore_index=tokenizer.pad_token_id)
model_eval = auto_mixed_precision(model_eval, 'O1')

# load_checkpoint / load_param_into_net are not imported as bare names in this
# notebook, so they are reached through the mindspore namespace. The file name
# follows the pattern written by the training loop
# ('gpt_summarization_epoch_{epoch}.ckpt'); epoch 0 is loaded here.
param_dict = mindspore.load_checkpoint('gpt_summarization_epoch_0.ckpt')
param_not_load, _ = mindspore.load_param_into_net(model_eval, param_dict)
# An empty list means every parameter of the network was found in the checkpoint.
print(param_not_load)

测试精准度

In [None]:
from mindnlp.engine import Evaluator
from mindnlp.metrics import Accuracy

# Accuracy is the only metric computed during evaluation.
metric = Accuracy()

# NOTE(review): this evaluates `model` (the network built for training), not
# the `model_eval` restored from the checkpoint — confirm which one is intended.
# NOTE(review): process_dataset emits a single 'input_ids' column, so a
# 'summarization' target column may not exist — verify tgt_columns.
evaluator = Evaluator(network=model, eval_dataset=test_dataset, metrics=metric)
evaluator.run(tgt_columns="summarization")

### 推理(聊天)

加载系数

In [8]:
from mindnlp._legacy.amp import auto_mixed_precision

# Rebuild the network exactly as it was built for training, then restore weights.
config_eval = GPT2Config(vocab_size=len(tokenizer))
model_eval = GPT2LMHeadModel(config_eval, ignore_index=tokenizer.pad_token_id)
model_eval = auto_mixed_precision(model_eval, 'O1')

# The file name follows the pattern written by the training loop
# ('gpt_summarization_epoch_{epoch}.ckpt'), so a fresh Restart & Run All finds
# the checkpoint it just produced; epoch 0 is loaded here.
param_dict = mindspore.load_checkpoint('./gpt_summarization_epoch_0.ckpt')
param_not_load, _ = mindspore.load_param_into_net(model_eval, param_dict)
# An empty list means every parameter of the network was found in the checkpoint.
print(param_not_load)

[]


推理

In [9]:
# Sample input text used as the prompt for generation.
article = 'This is Mr Meng speaking , Michelle .'

def inference(article, tokenizer, model, max_seq_len):
    """Greedily generate text for ``article`` one token at a time.

    Args:
        article: source text that is encoded and used as the prompt.
        tokenizer: tokenizer exposing ``encode``, ``id_to_token`` and
            ``sep_token_id``.
        model: network whose first output holds the logits.
        max_seq_len: upper bound on the number of generated tokens.

    Returns:
        The generated tokens as one string, each token followed by a single
        space. When generation stops on ``sep_token_id``, that token is
        included at the end. The partial string is printed after every step.
    """
    prompt_ids = tokenizer.encode(article).ids
    inputs = mindspore.Tensor(prompt_ids, mindspore.int64)
    summary = ''

    for _ in range(max_seq_len):
        # First model output = logits; argmax over the last axis picks the
        # most likely token id at every position.
        predictions = model(inputs)[0].argmax(-1)

        # Only the prediction at the last position is the newly generated token.
        next_token = predictions[-1].view(-1)
        next_id = next_token[0].asnumpy()

        # Feed the new token back in: [cls] article [sep] summary1, summary2, ...
        inputs = ops.concat((inputs, next_token))
        summary += tokenizer.id_to_token(next_id) + ' '
        print(summary)

        # The separator token marks the end of the generated text.
        if next_id == tokenizer.sep_token_id:
            break

    return summary

# Generate at most 100 tokens with the restored model.
summary = inference(article,tokenizer,model_eval,100)

i 
i ' 
i ' m 
i ' m afraid 
i ' m afraid it 
i ' m afraid it ' 
i ' m afraid it ' s 
i ' m afraid it ' s a 
i ' m afraid it ' s a little 
i ' m afraid it ' s a little bit 
i ' m afraid it ' s a little bit . 
i ' m afraid it ' s a little bit . [SEP] 
