# 权限配置

In [None]:
import sagemaker
import os
# Create the SageMaker session and look up the execution role; both are
# reused by the upload, training and deployment cells below.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Echo the role, default bucket and region this notebook is bound to.
bucket = sess.default_bucket()
region = sess.boto_region_name
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")

# 数据准备

In [103]:
# Name of the dataset (not referenced by any other cell visible here).
dataset_name = 'ruanhua2'

# Local folder to upload, and the S3 key prefix it is uploaded under.
WORK_DIRECTORY = './data/'
s3_prefix = 'datasets/ruanhua'

# Upload through the session; the returned value is shown as the cell output
# and is used to build the training channels later on.
data_location = sess.upload_data(path=WORK_DIRECTORY, key_prefix=s3_prefix)
data_location

's3://sagemaker-us-west-2-847380964353/datasets/ruanhua'

# 超参数定义

In [109]:
from sagemaker.huggingface import HuggingFace


# Hyperparameters forwarded to the training script, grouped by purpose and
# merged in their original order.
data_args = {
    "reference_column": "ref",
    "hypothesis_column": "hyp",
    # The file paths use the 'train' / 'test' channel names passed to fit().
    "train_file": "/opt/ml/input/data/train/parasci_train.csv",
    "validation_file": "/opt/ml/input/data/test/parasci_val.csv",
    "output_dir": "/opt/ml/model",
}
model_args = {
    "do_train": True,
    "do_eval": True,
    "max_source_length": 128,
    "max_target_length": 128,
    "model_name_or_path": "t5-large",
}
optimization_args = {
    "learning_rate": 3e-4,
    "num_train_epochs": 50,
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 2,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
}
hyperparameters = {**data_args, **model_args, **optimization_args}

# Enable the SageMaker distributed data-parallel option for the job.
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

# Upper bound on the job's runtime: 24 hours, expressed in seconds.
MAX_RUN_SECONDS = 24 * 60 * 60

# Build the estimator that runs scripts/run_paraphrase.py as the entry point.
huggingface_estimator = HuggingFace(
    entry_point="run_paraphrase.py",
    source_dir="./scripts",
    role=role,
    instance_type="ml.p3dn.24xlarge",
    instance_count=1,
    max_run=MAX_RUN_SECONDS,
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
    hyperparameters=hyperparameters,
    distribution=distribution,
)

# 模型训练

In [None]:
# Map each input channel to its CSV under the uploaded S3 location; the
# channel names ('train' / 'test') match the directories used in the
# train_file / validation_file hyperparameters.
input_channels = {
    "train": f"{data_location}/parasci_train.csv",
    "test": f"{data_location}/parasci_val.csv",
}

# Launch the training job.
huggingface_estimator.fit(input_channels)

# 模型加载&部署

In [6]:
from sagemaker.huggingface.model import HuggingFaceModel

# S3 location of the trained model archive produced by an earlier training job.
MODEL_ARTIFACT_URI = "s3://sagemaker-us-west-2-847380964353/huggingface-pytorch-training-2022-03-30-02-45-50-596/output/model.tar.gz"

# Wrap the artifact in a Hugging Face model object, using the same
# framework versions as the training estimator.
# No `env` (e.g. HF_TASK) is passed; the line that set it was commented out.
huggingface_model = HuggingFaceModel(
    model_data=MODEL_ARTIFACT_URI,
    role=role,  # IAM role with permissions to create an endpoint
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
)

In [7]:
# Deploy the model to a single ml.m5.2xlarge instance; the returned
# predictor is used by the endpoint-invocation cells.
predictor = huggingface_model.deploy(
    instance_type="ml.m5.2xlarge",
    initial_instance_count=1,
)

---------!

# endpoint调用

In [113]:
# Single endpoint call with an explicit paraphrase-level prefix.
# The original cell read `i`, which is only defined by the loop in the next
# cell, so it raised NameError on a fresh kernel (Restart & Run All).
# The recorded output of this cell matches the "High ..." line of the loop
# output, so the high-level prefix is pinned here.
# NOTE(review): the recorded loop output spells the prefix "leval"; confirm
# which spelling the model was trained with.
prefix = 'High level paraphrase:'
out = predictor.predict({
    'inputs': prefix + "Part of why it’s so difficult to begin is that we are dreading the task--perhaps because we don’t know what we want to write about yet, the assignment grade is weighted heavily, we fear doing poorly, or the topic is boring.",
    "parameters": {"max_length": 256},
})
out

[{'generated_text': 'it may be that we are afraid to write about it , because the assignment grade is so heavy that we fear doing poorly , or the topic is boring , but we know that it will be hard to write about anyway .'}]

In [114]:
# Prefixes that select the paraphrase level; each one is prepended to the
# same source sentence before it is sent to the endpoint.
profix = [
    'Low level paraphrase:',
    'Medium level paraphrase:',
    'High level paraphrase:',
]
source_sentence = "Part of why it’s so difficult to begin is that we are dreading the task--perhaps because we don’t know what we want to write about yet, the assignment grade is weighted heavily, we fear doing poorly, or the topic is boring."

for i in profix:
    # Build the request payload for this prefix and query the endpoint.
    payload = {
        'inputs': i + source_sentence,
        "parameters": {"max_length": 256},
    }
    out = predictor.predict(payload)
    # Print the prefix followed by the first generated text.
    print(i + out[0]['generated_text'])


Low leval paraphrase:part of why it’s so difficult to begin is that we are dreading the task--perhaps because we don’t know what we want to write about yet, the assignment grade is weighted heavily , we fear doing poorly , or the topic is boring .
Medium leval paraphrase:part of the difficulty is that we are intimidated by the task itself , because we know what we want to write about , the grade is heavy, we fear doing poorly , or the topic is boring .
High leval paraphrase:it may be that we are afraid to write about it , because the assignment grade is so heavy that we fear doing poorly , or the topic is boring , but we know that it will be hard to write about anyway .


# 本地训练

In [None]:
!python scripts/run_paraphrase.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
    --train_file data/parasci_train.csv \
    --validation_file data/parasci_val.csv \
    --output_dir /tmp/tst-summarization \
    --overwrite_output_dir \
    --save_strategy 'epoch' \
    --reference_column 'ref' \
    --num_train_epochs 1 \
    --hypothesis_column 'hyp' \
    --max_source_length 128 \
    --output_dir models \
    --max_target_length 128 \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --predict_with_generate

In [7]:
!pip install -r scripts/requirements.txt

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting datasets>=1.1.3
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 KB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting py7zr
  Downloading py7zr-0.18.3-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 KB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.1/212.1 KB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyzstd>=0.14.4
  Downloading pyzstd-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl