# 权限配置

In [29]:
import sagemaker
import os
# SageMaker session and the execution role used by the cells below.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Report the role ARN, the session's default bucket and its region, one per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::064542430558:role/service-role/AmazonSageMaker-ExecutionRole-20200803T154438
sagemaker bucket: sagemaker-us-west-2-064542430558
sagemaker session region: us-west-2


# 数据准备

In [40]:
# Name of the dataset handled by this notebook.
dataset_name = "shulex"
# Local folder whose contents are uploaded.
WORK_DIRECTORY = "./data/"
# Key prefix the files are stored under in the session's default bucket.
s3_prefix = "lhr-data/shulex"

# Upload the local folder and keep the resulting S3 URI for the training channels.
data_location = sess.upload_data(path=WORK_DIRECTORY, key_prefix=s3_prefix)
data_location

's3://sagemaker-us-west-2-064542430558/lhr-data/shulex'

# 超参数定义

In [41]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the training script (entry point: run_paraphrase.py).
hyperparameters = {
    # CSV columns holding the reference text and the hypothesis text
    "reference_column": "ref",
    "hypothesis_column": "hyp",
    # Paths inside the training container; they match the 'train' and 'test'
    # channel names used when calling fit()
    "train_file": "/opt/ml/input/data/train/shulexv2_train.csv",
    "validation_file": "/opt/ml/input/data/test/shulexv2_dev.csv",
    "output_dir": "/opt/ml/model",
    # Run both training and evaluation
    "do_train": True,
    "do_eval": True,
    # Sequence length limits
    "max_source_length": 128,
    "max_target_length": 64,
    # Base checkpoint and optimisation settings
    "model_name_or_path": "t5-base",
    "learning_rate": 3e-4,
    "num_train_epochs": 20,
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 2,
    # Save and evaluate once per epoch, with save_total_limit set to 1
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
}

# SageMaker data-parallel configuration. It is defined here but NOT passed to the
# estimator below (the `distribution=` argument is left disabled).
# NOTE(review): presumably disabled because the job runs on a single ml.p3.2xlarge
# instance - confirm before enabling.
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

# Build the Hugging Face estimator that launches the training job.
huggingface_estimator = HuggingFace(
    entry_point="run_paraphrase.py",
    source_dir="./scripts",
    instance_type="ml.p3.2xlarge",  # alternative noted by the author: 'ml.p3dn.24xlarge'
    instance_count=1,
    role=role,
    max_run=24 * 60 * 60,  # 24 hours, expressed in seconds
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
    volume_size=128,
    hyperparameters=hyperparameters,
    # distribution=distribution  (disabled)
)

# 模型训练

In [None]:
# Start the training job. Each channel points at a single CSV below the uploaded
# S3 location; the channel names ('train', 'test') match the directories used in
# the train_file / validation_file hyperparameters.
huggingface_estimator.fit({
    "train": f"{data_location}/shulexv2_train.csv",
    "test": f"{data_location}/shulexv2_dev.csv",
})

2022-09-26 15:07:35 Starting - Starting the training job...
2022-09-26 15:08:05 Starting - Preparing the instances for trainingProfilerReport-1664204855: InProgress
.........
2022-09-26 15:09:21 Downloading - Downloading input data...
2022-09-26 15:10:01 Training - Downloading the training image.....................
2022-09-26 15:13:24 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-09-26 15:13:26,832 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-09-26 15:13:26,864 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-09-26 15:13:26,871 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-09-26 15:13:27,361 sagemaker-training-toolkit INFO     Installing dependencies from requirements.tx

# 模型加载&部署

In [43]:
from sagemaker.huggingface.model import HuggingFaceModel

# Wrap a trained model artifact (model.tar.gz on S3) in a Hugging Face model
# object so it can be hosted on an endpoint.
huggingface_model = HuggingFaceModel(
    # env={'HF_TASK': 'text-generation'},  (left disabled)
    # S3 path of the model archive produced by a SageMaker training job
    model_data="s3://sagemaker-us-west-2-064542430558/huggingface-pytorch-training-2022-09-26-15-07-35-034/output/model.tar.gz",
    # IAM role with permissions to create an endpoint
    role=role,
    # Framework versions; same values as the training estimator
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
)

In [44]:
# Host the model on one ml.g4dn.xlarge instance and keep the returned predictor.
predictor = huggingface_model.deploy(initial_instance_count=1, instance_type="ml.g4dn.xlarge")

-------!

# endpoint调用

In [36]:
from sagemaker.huggingface.model import HuggingFacePredictor
# Attach to an endpoint that already exists instead of deploying a new one.
# NOTE(review): this rebinds `predictor` to a hard-coded endpoint name whose
# timestamp (14-20-39) is earlier than the training job referenced in this
# notebook (15-07-35) - confirm this is the endpoint you intend to query.
ENDPOINT_NAME = 'huggingface-pytorch-inference-2022-09-26-14-20-39-688'
predictor = HuggingFacePredictor(endpoint_name=ENDPOINT_NAME)

In [37]:
import time
# Query the endpoint once per aspect prefix for the same review sentence and
# report the total elapsed time in seconds.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock meant for measuring intervals.
start = time.perf_counter()

# Earlier prefix set, kept for reference:
# profix=['usage location : ', 'application : ', 'device : ', 'identity : ']
profix = ['location of use : ', 'purchase intention : ', 'time of use : ', 'target consumer : ']

# The sentence is the same for every request, so build it once outside the loop.
review = "it is in my shower and together with the teak stool , makes it feel like i 'm at a spa whenever i shower ."

for prefix in profix:
    # One request per prefix; the model input is "<prefix><sentence>".
    out = predictor.predict({
        'inputs': [prefix + review],
        "parameters": {"max_length": 256},
    })
    print(out)

elapsed = time.perf_counter() - start
print(elapsed)


[{'generated_text': 'location of use is in my shower'}]
[{'generated_text': 'purchase intention is None'}]
[{'generated_text': 'time of use is None'}]
[{'generated_text': 'target consumer is None'}]
1.1650428771972656


In [19]:
import pandas as pd
# Load the dev split and display the 'ref' text of the row with index label 121.
data = pd.read_csv("./data/shulex_dev.csv")
data.loc[121, "ref"]

"application : it is in my shower and together with the teak stool , makes it feel like i 'm at a spa whenever i shower ."

# 本地训练

In [6]:
!python scripts/run_paraphrase.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
    --train_file data/shulex_train.csv \
    --validation_file data/shulex_dev.csv \
    --output_dir /tmp/tst-summarization \
    --overwrite_output_dir \
    --save_strategy 'epoch' \
    --reference_column 'ref' \
    --num_train_epochs 1 \
    --hypothesis_column 'hyp' \
    --max_source_length 128 \
    --output_dir models \
    --max_target_length 128 \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --predict_with_generate

You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with `--source_prefix 'summarize: ' `
09/07/2022 07:36:03 - INFO - __main__ -   Training/evaluation parameters Seq2SeqTrainingArguments(output_dir='models', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluation_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=4, per_device_eval_batch_size=4, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, logging_dir='runs/Sep07_07-36-03_ip-172-16-69-247.us-west-2.compute.internal', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=500, save_stra

In [26]:
!pip install -r scripts/requirements.txt

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers==4.6.1
  Using cached transformers-4.6.1-py3-none-any.whl (2.2 MB)
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting huggingface-hub==0.0.8
  Using cached huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting datasets>=1.1.3
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.3/362.3 KB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading datasets-2.3.1-py3-none-any.whl (362 kB)
[2K     [90m━━━━━━━━━