# 环境配置

In [2]:
!pip install -r examples/pytorch/summarization/requirements.txt transformers==4.6

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers==4.6
  Downloading transformers-4.6.0-py3-none-any.whl (2.3 MB)
     |████████████████████████████████| 2.3 MB 19.8 MB/s            
[?25hCollecting datasets>=1.1.3
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
     |████████████████████████████████| 290 kB 53.8 MB/s            
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
     |████████████████████████████████| 1.2 MB 40.9 MB/s            
Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting py7zr
  Downloading py7zr-0.16.2-py3-none-any.whl (66 kB)
     |████████████████████████████████| 66 kB 1.0 MB/s             
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 

# 预训练模型下载

In [6]:
!mkdir -p models/pretrain/pegasus
!mkdir -p models/pretrain/bart

!mkdir -p ./models/local_train/pegasus-hp
!mkdir -p ./models/local_train/bart-hp

!aws s3 cp s3://datalab2021/hupo_nlp/models/pegasus/checkpoint-46314.zip models/pretrain/pegasus
!aws s3 cp s3://datalab2021/hupo_nlp/models/bart/checkpoint-46314.zip models/pretrain/bart
    
!unzip models/pretrain/pegasus/checkpoint-46314.zip -d models/pretrain/pegasus > /dev/null 2>&1
!unzip models/pretrain/bart/checkpoint-46314.zip -d models/pretrain/bart > /dev/null 2>&1

# 模型训练

In [None]:
!python -u examples/pytorch/summarization/run_summarization.py \
--model_name_or_path google/pegasus-large \
--do_train \
--do_eval \
--per_device_train_batch_size=2 \
--per_device_eval_batch_size=1 \
--save_strategy epoch \
--evaluation_strategy epoch \
--overwrite_output_dir \
--predict_with_generate \
--train_file './data/hp/summary/news_summary_cleaned_small_train.csv' \
--validation_file './data/hp/summary/news_summary_cleaned_small_test.csv' \
--text_column 'text' \
--summary_column 'headlines' \
--output_dir='./models/local_train/pegasus-hp' \
--num_train_epochs=1.0 \
--eval_steps=500 \
--save_total_limit=3 \
--source_prefix "summarize: " > train_pegasus.log

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
loading configuration file https://huggingface.co/google/pegasus-large/resolve/main/config.json from cache at /home/ec2-user/.cache/huggingface/transformers/3fa0446657dd3714a950ba400a3fa72686d0f815da436514e4823a973ef23e20.f2dc0735a07d1a70170e8e0e4d5fb57ad90d8ea5201a0dbd4b33f2f499444852
Model config PegasusConfig {
  "_name_or_path": "google/pegasus-large",
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "PegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_d

In [None]:
!python -u examples/pytorch/summarization/run_summarization.py \
--model_name_or_path facebook/bart-large-cnn \
--do_train \
--do_eval \
--per_device_train_batch_size=1 \
--per_device_eval_batch_size=1 \
--save_strategy epoch \
--evaluation_strategy epoch \
--overwrite_output_dir \
--predict_with_generate \
--train_file './data/hp/summary/news_summary_cleaned_small_train.csv' \
--validation_file './data/hp/summary/news_summary_cleaned_small_test.csv' \
--text_column 'text' \
--summary_column 'headlines' \
--output_dir='./models/local_train/bart-hp' \
--num_train_epochs=1.0 \
--eval_steps=1000 \
--save_total_limit=3 \
--source_prefix "summarize: " > train_bart.log

# 本地推理

In [33]:
import pandas as pd
df=pd.read_csv('./data/hp/summary/news_summary_cleaned_small_test.csv')
print('原文:',df.loc[0,'text'])
print('真实标签:',df.loc[0,'headlines'])
from transformers import pipeline
summarizer = pipeline("summarization", model="./models/local_train/pegasus-hp/checkpoint-500")
print('模型预测:',summarizer(df.loc[0,'text'], max_length=50)[0]['summary_text'])

原文: Hybrid electric aircraft startup Zunum Aero, founded by Indian-origin entrepreneur Ashish Kumar, has received investments from aerospace companies Boeing and JetBlue. The startup intends to make regional aircrafts with space for 10 to 50 passengers for flights up to 1,600 km. "Our goal is to be part of a disruptive force rather than the one being disrupted," said JetBlue.
真实标签: Boeing, JetBlue back Indian-origin man's aircraft startup
模型预测: Hybrid electric aircraft startup Zunum Aero, founded by Indian-origin entrepreneur Ashish Kumar, has received investments from aerospace companies Boeing and JetBlue.


# 增强训练

In [12]:
!python -u examples/pytorch/summarization/run_summarization.py \
--model_name_or_path models/pretrain/pegasus/checkpoint-46314 \
--do_train \
--do_eval \
--per_device_train_batch_size=2 \
--per_device_eval_batch_size=1 \
--save_strategy epoch \
--evaluation_strategy epoch \
--overwrite_output_dir \
--predict_with_generate \
--train_file './data/hp/summary/news_summary_cleaned_small_train.csv' \
--validation_file './data/hp/summary/news_summary_cleaned_small_test.csv' \
--text_column 'text' \
--summary_column 'headlines' \
--output_dir='./models/local_train/pegasus-hp' \
--num_train_epochs=1.0 \
--eval_steps=500 \
--save_total_limit=3 \
--source_prefix "summarize: " > train_pegasus_2.log

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
loading configuration file models/pretrain/pegasus/checkpoint-46314/config.json
Model config PegasusConfig {
  "_name_or_path": "google/pegasus-large",
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "PegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 0,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "enco

# 本地推理

In [34]:
import pandas as pd
df=pd.read_csv('./data/hp/summary/news_summary_cleaned_small_test.csv')
print('原文:',df.loc[0,'text'])
print('真实标签:',df.loc[0,'headlines'])
from transformers import pipeline
summarizer = pipeline("summarization", model="./models/local_train/pegasus-hp/checkpoint-500")
print('模型预测:',summarizer(df.loc[0,'text'], max_length=50)[0]['summary_text'])

原文: Hybrid electric aircraft startup Zunum Aero, founded by Indian-origin entrepreneur Ashish Kumar, has received investments from aerospace companies Boeing and JetBlue. The startup intends to make regional aircrafts with space for 10 to 50 passengers for flights up to 1,600 km. "Our goal is to be part of a disruptive force rather than the one being disrupted," said JetBlue.
真实标签: Boeing, JetBlue back Indian-origin man's aircraft startup
模型预测: Boeing, JetBlue back Indian-origin man's aircraft startup


# 模型部署

In [14]:
!cp -r ./models/local_train/pegasus-hp/checkpoint-500 ./endpoint/pegasus

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
!sh build_and push.sh pegasus-hp

In [28]:
#注意修改：847380964353.dkr.ecr.ap-northeast-1.amazonaws.com/pegasus-hp为自己对应的
%cd endpoint

!python create_endpoint.py \
--endpoint_ecr_image_path "847380964353.dkr.ecr.ap-northeast-1.amazonaws.com/pegasus-hp" \
--endpoint_name 'pegasus' \
--instance_type "ml.p3.2xlarge"

%cd ..

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
model_name:  pegasus
endpoint_ecr_image_path:  847380964353.dkr.ecr.ap-northeast-1.amazonaws.com/pegasus-hp
<<< Completed model endpoint deployment. pegasus


In [None]:
from boto3.session import Session
import json
df=pd.read_csv('./data/hp/summary/news_summary_cleaned_small_test.csv')
print('原文:',df.loc[0,'text'])
print('真实标签:',df.loc[0,'headlines'])
data={"data": df.loc[0,'text']}
session = Session()
    
runtime = session.client("runtime.sagemaker")
response = runtime.invoke_endpoint(
    EndpointName='pegasus',
    ContentType="application/json",
    Body=json.dumps(data),
)

result = json.loads(response["Body"].read())
print (result)