# 基于 T5 模型的文本摘要

## Step1 导入相关包

In [1]:
import os

# Restrict this process to physical GPU 1. This runs before torch is imported
# (next cell), so CUDA only ever sees that single device.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [2]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

## Step2 加载数据集

In [3]:
import json

# train.json holds ~1.47M examples (one JSON object per line); dev.json holds ~10k.
# Read it explicitly as UTF-8: the text is Chinese and the platform default
# encoding is not guaranteed to be UTF-8 (the dev.json cell further down already
# opens its file this way).
# Iterating over the file object avoids keeping a full list of raw lines in memory
# next to the parsed records, and blank lines are skipped so a stray empty line
# cannot crash json.loads.
with open('./LCSTS_new/train.json', 'r', encoding='utf-8') as f:
    train_data = [json.loads(line) for line in f if line.strip()]

In [4]:
# Sanity check: number of records parsed from train.json.
len(train_data)

1470769

In [5]:
# Build the dataset from the first 100,000 records only (a subset of train.json).
subset_records = train_data[:100000]
ds = Dataset.from_list(subset_records)
ds

Dataset({
    features: ['id', 'summary', 'content'],
    num_rows: 100000
})

In [6]:
# Hold out 10% of the subset for evaluation; the fixed seed keeps the split repeatable.
# Note: `ds` is rebound here from a single Dataset to a DatasetDict ("train"/"test").
ds = ds.train_test_split(test_size=0.1, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'summary', 'content'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['id', 'summary', 'content'],
        num_rows: 10000
    })
})

In [7]:
# Peek at one raw training example (fields: id, summary, content).
ds["train"][0]

{'id': 30225,
 'summary': '大气污染防治专项检查已查处违法工业企业1888家',
 'content': '自2013年11月以来，环境保护部持续组织开展大气污染防治专项检查。截至2014年2月，共查处环境违法工业企业1888家，环保不达标的施工场地2185个，已完成取缔关闭小作坊1848家。'}

## Step3 数据处理

In [8]:
# Load the tokenizer from the local mengzi-t5-base checkpoint directory.
# NOTE(review): `tokenzier` is a misspelling of "tokenizer", but later cells refer to
# it under this spelling, so any rename has to be done across the whole notebook.
tokenzier = AutoTokenizer.from_pretrained("/data1/model/mengzi-t5-base")
def process_func(exmaples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        exmaples: Batch dict with a "content" list (source texts) and a
            "summary" list (target texts).

    Returns:
        The tokenizer output for the prompted source texts (truncated to 384
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 64 tokens).
    """
    # Prefix every source text with the task prompt the model is trained on.
    prompted_texts = []
    for source_text in exmaples["content"]:
        prompted_texts.append("摘要生成: \n" + source_text)

    model_inputs = tokenzier(prompted_texts, max_length=384, truncation=True)
    # text_target= makes the tokenizer encode the summaries as decoder targets.
    target_encoding = tokenzier(text_target=exmaples["summary"], max_length=64, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs
# Tokenize both splits; batched=True hands process_func whole batches (dict of lists).
tokenized_ds = ds.map(function=process_func, batched=True)
tokenized_ds

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'summary', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['id', 'summary', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
})

In [9]:
# Decode the first training input back to text to inspect the prompt prefix
# and the end-of-sequence token the model will actually see.
tokenzier.decode(tokenized_ds["train"][0]["input_ids"])

'摘要生成: 自2013年11月以来,环境保护部持续组织开展大气污染防治专项检查。截至2014年2月,共查处环境违法工业企业1888家,环保不达标的施工场地2185个,已完成取缔关闭小作坊1848家。</s>'

In [10]:
# Same check for the target side: decode the first training example's label ids.
tokenzier.decode(tokenized_ds["train"][0]["labels"])

'大气污染防治专项检查已查处违法工业企业1888家</s>'

## Step4 创建模型

In [11]:
# Load the seq2seq model weights from the same local checkpoint directory
# that the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained("/data1/model/mengzi-t5-base")

## Step5 创建评估函数

In [46]:
import numpy as np
from rouge_chinese import Rouge

# Module-level ROUGE scorer; compute_metric reads it as a global.
rouge = Rouge()

def compute_metric(evalPred):
    """Compute averaged ROUGE F1 scores for generated summaries.

    Args:
        evalPred: ``(predictions, labels)`` pair of token-id arrays, as handed to
            ``compute_metrics`` by the trainer. ``predictions`` may also be a
            tuple whose first element holds the generated ids.

    Returns:
        dict with the averaged F1 value for "rouge-1", "rouge-2" and "rouge-l".
    """
    predictions, labels = evalPred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenzier.pad_token_id
    # -100 is the ignore index used for padding. It can show up in the generated
    # ids as well as in the labels (sequences padded to a common length), and it
    # is not a valid token id, so map it to the pad token on BOTH sides before
    # decoding.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_preds = tokenzier.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenzier.batch_decode(labels, skip_special_tokens=True)

    # Space-separate the characters so each one is a whitespace-delimited token
    # for the scorer. The scorer rejects empty hypotheses, so a prediction that
    # decodes to nothing is replaced by a placeholder that cannot match any
    # single-character reference token (i.e. it simply scores zero).
    decode_preds = [" ".join(p) if p.strip() else "<empty>" for p in decode_preds]
    decode_labels = [" ".join(l) for l in decode_labels]

    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"],
    }

## Step6 配置训练参数

In [47]:
args = Seq2SeqTrainingArguments(
    output_dir="./t5-base",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    # Gradients are accumulated over 8 steps: 128 * 8 = 1024 examples per optimizer step per device.
    gradient_accumulation_steps=8,
    logging_steps=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Must match a key returned by compute_metric.
    metric_for_best_model="rouge-l",
    # Evaluate on generated text (required for ROUGE) instead of teacher-forced logits.
    predict_with_generate=True,
    # Let evaluation generate up to 64 tokens, matching the 64-token label
    # truncation in process_func and the max_length=64 used at inference time.
    # NOTE(review): without this, evaluation falls back to the model's default
    # generation length, which may be much shorter and would cut summaries off,
    # understating ROUGE - confirm against the checkpoint's generation config.
    generation_max_length=64,
)

## Step7 创建训练器

In [50]:
# Collator that turns lists of tokenized examples into seq2seq batches.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=tokenzier)

# Wire model, training arguments, data splits and the ROUGE metric together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=seq2seq_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenzier,
    compute_metrics=compute_metric,
)

## Step8 模型训练

In [None]:
# Run fine-tuning; per the training arguments, evaluation and checkpointing
# happen once per epoch.
trainer.train()

## Step9 模型推理

In [52]:
from transformers import pipeline

In [53]:
# Wrap the in-memory model and tokenizer in a text2text-generation pipeline.
# device=0 is the first *visible* GPU; CUDA_VISIBLE_DEVICES was set to "1" at the
# top of the notebook, so this refers to that physical device.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenzier, device=0)

In [54]:
# Use exactly the prompt prefix the training inputs were built with in
# process_func ("摘要生成: \n", including the space before the newline), so the
# inference input is formatted the same way as the training data.
pipe("摘要生成: \n" + ds["test"][-1]["content"], max_length=64, do_sample=True)

[{'generated_text': '记者到文物馆拿名片进文心院被拒'}]

In [55]:
# Reference summary and source text of the same test example, for comparison
# with the generated summary above.
ds["test"][-1]["summary"],ds["test"][-1]["content"]

('南京地质公园内现私人餐馆设三道岗不对外',
 '南京江宁方山地质公园内，半山腰有栋古典风格的“文心院”，院内有假山喷泉，四周有2米多高铁丝网，唯一入口有保安守卫。记者以参观、订餐交涉均被拒入。当晚8点，记者再前往，按知情人指点出示文心院名片，保安爽快开了门……')

In [64]:
from rouge_chinese import Rouge

def gen_result(arg, do_sample=True):
    """Generate a summary for one source text with the inference pipeline.

    Args:
        arg: Source text to summarize.
        do_sample: Forwarded to the pipeline. The default (True) keeps the
            sampling behaviour this helper has always had; pass False for
            repeatable output, e.g. when the result feeds a metric.

    Returns:
        The "generated_text" field of the first pipeline result.
    """
    # Same prefix as process_func uses for the training inputs (note the space
    # before the newline), so inference prompts match the training format.
    prompt = "摘要生成: \n" + arg
    return pipe(prompt, max_length=64, do_sample=do_sample)[0]['generated_text']


# Read dev.json (one JSON object per line, UTF-8) and parse only its first 100 records.
with open('./LCSTS_new/dev.json', 'r', encoding='utf-8') as f:
    lines = f.readlines()
dev_data = list(map(json.loads, lines[:100]))

from tqdm import tqdm

# Space-separate the characters of each text so that every character becomes
# its own whitespace-delimited token for the ROUGE scorer.
true_list = [' '.join(sample['summary']) for sample in dev_data]
# One generation per dev example; tqdm shows progress.
pred_list = [' '.join(gen_result(sample['content'])) for sample in tqdm(dev_data)]
    
rouge = Rouge()
# The scorer rejects empty hypotheses, so a generation that came back empty is
# replaced by a placeholder that cannot match any single-character reference
# token; it then scores zero instead of aborting the whole evaluation.
scored_preds = [p if p.strip() else "<empty>" for p in pred_list]
scores = rouge.get_scores(scored_preds, true_list, avg=True)
# Averaged F1 for each ROUGE variant (displayed as the cell output).
{
    "rouge-1": scores["rouge-1"]["f"],
    "rouge-2": scores["rouge-2"]["f"],
    "rouge-l": scores["rouge-l"]["f"],
}

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:19<00:00,  5.25it/s]


{'rouge-1': 0.28474181936413845,
 'rouge-2': 0.15914546818056854,
 'rouge-l': 0.24247291601546672}