### Step1: Import packages

In [1]:
import torch
import json
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments


  from .autonotebook import tqdm as notebook_tqdm


### Step2: Read dataset

In [2]:
# Load the "cnn_dailymail" dataset using its "1.0.0" configuration.
ds = load_dataset("cnn_dailymail", "1.0.0")

In [3]:
# Show the loaded dataset object (splits, feature names, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
# Inspect the first training record to see what the raw fields look like.
ds['train'][0]

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [5]:
# NOTE(review): this cell repeats the previous one verbatim (same record,
# same output); it can be removed without losing information.
ds['train'][0]

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

### Step3: Analyze data

In [6]:
# Load the tokenizer that matches the "google-t5/t5-small" checkpoint;
# the same object is reused for preprocessing, metrics and inference below.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
# Bare expression so the notebook displays the tokenizer's configuration.
tokenizer

T5TokenizerFast(name_or_path='google-t5/t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>',

In [7]:
def process_func(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: batch mapping with an 'article' list (source texts) and a
            'highlights' list (reference summaries).

    Returns:
        The tokenizer output for the prompted articles (truncated to 512
        tokens) with an added 'labels' entry holding the token ids of the
        summaries (truncated to 64 tokens).
    """
    # Prepend the task prompt to every article before tokenizing.
    prompts = []
    for article in examples['article']:
        prompts.append('Generate summary: \n' + article)

    model_inputs = tokenizer(prompts, max_length=512, truncation=True)

    # Targets are tokenized separately via `text_target`.
    targets = tokenizer(
        text_target=examples['highlights'],
        max_length=64,
        truncation=True,
    )
    model_inputs['labels'] = targets['input_ids']
    return model_inputs


In [8]:
# Run process_func over the dataset in batches (batched=True passes lists of
# examples to the function rather than single records).
tokenized_ds = ds.map(process_func, batched=True)
# Display the result to confirm the new tokenized columns were added.
tokenized_ds

Map: 100%|██████████| 11490/11490 [00:07<00:00, 1626.47 examples/s]


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

In [9]:
# Sanity check: decode the first training input back to text to verify the
# prompt prefix and article were tokenized as intended.
tokenizer.decode(tokenized_ds['train'][0]['input_ids'])

'Generate summary: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box offic

In [10]:
# Sanity check: decode the first training label sequence back to text.
tokenizer.decode(tokenized_ds['train'][0]['labels'])

"Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday. Young actor says he has no plans to fritter his cash away. Radcliffe's earnings from first five Potter films have been held in trust fund.</s>"

### Step4: Create model

In [11]:
# Load the pretrained "google-t5/t5-small" sequence-to-sequence model
# (same checkpoint name as the tokenizer).
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

### Step5: Create evaluate function

In [12]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so dependencies are visible in one place.
import numpy as np
from rouge import Rouge

# Scorer instance used by the metric function defined in the next cell.
rouge = Rouge()


In [13]:
def compute_metric(evalPred):
    """Compute average ROUGE-1 / ROUGE-2 / ROUGE-L F1 for generated summaries.

    Args:
        evalPred: a ``(preds, labels)`` pair of token-id arrays. Positions
            that should be ignored are marked with -100.

    Returns:
        dict with keys "rouge-1", "rouge-2" and "rouge-l", each mapped to the
        average F1 score over the evaluated batch.
    """
    preds, labels = evalPred
    # Predictions may arrive wrapped in a tuple; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a real vocabulary id and cannot be decoded, so map it to the
    # pad token in BOTH arrays (generated predictions can carry it as padding,
    # not only the labels).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decode_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The dataset is whitespace-delimited English, so ROUGE is scored on words.
    # (Joining individual characters with spaces, as is done for unsegmented
    # text such as Chinese, would turn this into character-level ROUGE.)
    # An empty string is replaced by a placeholder because the scorer rejects
    # empty hypotheses/references; the placeholder simply scores as a miss.
    decode_preds = [p.strip() or "<empty>" for p in decode_preds]
    decode_labels = [l.strip() or "<empty>" for l in decode_labels]

    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {
        "rouge-1": scores['rouge-1']['f'],
        "rouge-2": scores['rouge-2']['f'],
        "rouge-l": scores['rouge-l']['f']
    }



### Step6: Set training parameters

In [14]:
# Training configuration for the summarization fine-tune.
args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 samples x 8 accumulation steps = 32 samples per optimizer step per device.
    gradient_accumulation_steps=8,
    logging_steps=128,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="rouge-l",
    # Reload the checkpoint with the best "rouge-l" once training finishes, so
    # the later cells that reuse `model` run on the best weights rather than
    # the last-epoch weights. Requires matching evaluation/save strategies,
    # which both are "epoch" here.
    load_best_model_at_end=True,
    # Must be True: metrics are computed on generated text, not on logits.
    predict_with_generate=True,
    # Match the 64-token cap applied to the reference summaries during
    # preprocessing; otherwise generation falls back to the model's default
    # length limit and the scores are computed on cut-off summaries.
    generation_max_length=64,
)

### Step7: Create trainer

In [15]:
# Assemble the trainer from the model, data, metric function and arguments.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    train_dataset=tokenized_ds['train'],
    # Evaluate (and select the best checkpoint) on the validation split; the
    # test split is kept untouched for the final, unbiased check.
    eval_dataset=tokenized_ds['validation'],
    compute_metrics=compute_metric,
    tokenizer=tokenizer,
    # Pads inputs and labels dynamically per batch.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer)
)

### Step8: Train the model

In [16]:
# Run the training loop with the configuration defined above.
trainer.train()

  0%|          | 9/26916 [00:13<10:50:40,  1.45s/it]

KeyboardInterrupt: 

### Step9: Evaluate the model

In [None]:
from transformers import pipeline

In [None]:
# Build an inference pipeline around the model and tokenizer from this notebook.
# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so the cell also runs on machines without a GPU.
pipe = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Summarize the first test article. The prompt prefix is the same string used
# in process_func, and truncation=True clips the input to the tokenizer's
# maximum length, mirroring the truncation applied to the training inputs.
pipe("Generate summary: \n" + ds['test'][0]['article'], max_length=64, truncation=True)

[{'generated_text': '国际专家称,中国医疗体系不存在,但免费医疗制度是不存在的,但免费医疗制度是不存在的。'}]

In [None]:
# Reference summary for the same test record, for comparison with the
# generated text above.
ds['test'][0]['highlights']

'卫计委官员称完全免费医疗会造成巨大浪费 个人自付20%至30%最合理'

In [None]:
# Full source article of the same test record.
ds['test'][0]['article']

'近期,“俄罗斯免费医疗”成为社会关注的热点话题。有关专家指出,世界上真正的免费医疗制度是不存在的。中国应立足国情,从实际出发,努力建立覆盖城乡居民的基本医疗卫生制度,实现人人享有基本医疗卫生服务,满足人民群众多层次、多样化的医疗卫生需求。中国医疗体系进步很大世界卫生组织官员11月1日在北京接受媒体采访时称,世界上没有完全免费的医疗,任何形式的医疗体系,都需要找到各种资源进行筹资,如税收、社保或个人支付等,以支持服务的提供。政府有责任建立医疗卫生服务体系,但个人也有责任,免费会造成资源的巨大浪费。世卫组织驻华代表施贺德说,各国建立医疗卫生服务体系,需要遵循几个共通的原则:一是政府需负起责任,使人们在有需要时能够获得基本医疗卫生服务;二是每个人都要为自己的健康负起责任,规避健康风险因素;三是鼓励预付制的保险机制,保障人们在生病时不用支付一大笔钱;四是保证贫困居民也能看得起病。他说,各国为了支持医疗服务的提供,建立了三种筹资体系:基于国家税收的体系,如英国;基于社会医疗保险制度的体系,如德国;基于个人支付能力的体系,如美国。筹资的方法多种多样,世卫组织不作推荐。此外,没有一个放之四海而皆准的医疗体系,尤其是像中国这类地区发展不平衡的国家。如何评价一国的医疗体系的绩效?世卫组织西太平洋区域办事处卫生系统发展负责人林光汶说,主要从三个方面,即能不能保证任何人都能享受到平等的服务,提供的医疗服务是否满足大家的需求,医疗保险的筹资与分配是否公平。中国在医改之前,整个体系存在问题,现在正在进行世界上规模最大的医改,比以往有了很大的进步。施贺德说,中国的医改步伐很快,卫生支出中政府支出在增加,个人支出在减少,这是走向人人享有基本医疗服务中重要的里程碑,同时医保覆盖95%的居民,这点也非常重要。接下来,中国仍需要在人人享有公平的服务、扩大药品和服务范畴、提供数量多和质量标准的初级卫生保健等方面继续改善。世卫组织驻华代表处卫生系统发展技术官员马丁·泰勒说,前段时间,俄罗斯提到要实行免费医疗,但实际上,政府和个人都需要支付一定比例的费用,中国的很多指标甚至好于俄罗斯。“每个国家的服务体系都是有些方面好点,有些方面差点,世界上没有完美的医疗体系。”“免费医疗”只能保障患者享有最低的医疗服务国家卫生计生委国际合作司司长任明辉说,就医疗保健服务而言,世界上从来没有“免费的午餐”。免费不免费,是