In [None]:
!pip install sagemaker --upgrade

In [None]:
import sagemaker

# IAM role of the current execution environment; passed to the estimator below.
role = sagemaker.get_execution_role()

# SageMaker session object for this notebook.
sess = sagemaker.Session()


In [None]:
from sagemaker.huggingface import HuggingFace

# MPI launcher settings: processes_per_host is 8
# (presumably one process per GPU -- confirm against the chosen instance type).
mpi_options = dict(enabled=True, processes_per_host=8)

# Parameters for the SageMaker model-parallel library.
# Note: "ddp" is intentionally not set in this configuration.
smp_parameters = {
    "microbatches": 2,
    "placement_strategy": "spread",
    "pipeline": "interleaved",
    "optimize": "speed",
    "partitions": 16,
}
smp_options = {"enabled": True, "parameters": smp_parameters}

# Combined distribution config handed to the estimator:
# model parallelism (smdistributed) launched through MPI.
distribution = {
    "smdistributed": {"modelparallel": smp_options},
    "mpi": mpi_options,
}

# Build the HuggingFace estimator. No hyperparameters are passed, so the
# entry-point script runs with whatever defaults it defines itself.
huggingface_estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./train",
    role=role,
    instance_type="ml.p3.16xlarge",
    instance_count=2,
    volume_size=50,
    transformers_version="4.6.1",
    pytorch_version="1.7.1",
    py_version="py36",
    distribution=distribution,
)


In [None]:
import time

# Timestamped (UTC) job name so each run gets a unique, sortable identifier.
# strftime must be qualified with the module: only `time` itself is imported,
# so a bare strftime(...) call raises NameError.
job_name = "t5-MP-small-{}".format(time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()))

# Pass the name explicitly; otherwise it is computed but never used and the
# training job gets an auto-generated name instead.
huggingface_estimator.fit(job_name=job_name)



In [None]:
# Summary of the finished run, read from the estimator one attribute at a time
# (each value is looked up immediately before it is printed).

# Training container image.
print("container image used for training job: \n{}\n".format(huggingface_estimator.image_uri))

# S3 location of the trained model artifact.
print("s3 uri where the trained model is located: \n{}\n".format(huggingface_estimator.model_data))

# Name of the most recent training job launched by this estimator.
print(
    "latest training job name for this estimator: \n{}\n".format(
        huggingface_estimator.latest_training_job.name
    )
)
