### Development Environment and Permissions

Based on https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb

In [1]:
import sagemaker.huggingface
from dotenv import load_dotenv
import os

# Read variables from a local .env file so the os.environ lookups in the
# following cells can see them.
load_dotenv()

True

### Permissions


In [2]:
import sagemaker
import boto3

# AWS credentials and region are taken from the process environment; a missing
# variable yields None here rather than raising.
ACCESS_KEY = os.getenv("AWS_SAGEMAKER_ACCESS_KEY")
SECRET_KEY = os.getenv("AWS_SAGEMAKER_SECRET_ACCESS_KEY")
REGION = os.getenv("AWS_BUCKET_REGION")

# The same key pair is used for both the IAM client and the boto3 session.
_aws_credentials = {
    "aws_access_key_id": ACCESS_KEY,
    "aws_secret_access_key": SECRET_KEY,
}

iam_client = boto3.client("iam", **_aws_credentials)
boto_session = boto3.Session(region_name=REGION, **_aws_credentials)

# ARN of the IAM role named 'sagemaker'; the estimator below receives it as `role`.
role = iam_client.get_role(RoleName="sagemaker")["Role"]["Arn"]

# SageMaker session bound to the explicit boto3 session created above.
sess = sagemaker.Session(boto_session=boto_session)


In [3]:
from sagemaker.huggingface import HuggingFace
from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString

# User id will be provided by request later
USER_ID = "testing-test"
USER_GENDER = "woman"

# Store training results
MODEL_BUCKET = "sagemaker-user-training"
model_output_path = f's3://{MODEL_BUCKET}/{USER_ID}/'
PRETRAINED_DEPTH_MODEL_PATH = f's3://{MODEL_BUCKET}/models/stable-diffusion-2-depth/'
PRETRAINED_VAE_MODEL_PATH = f's3://{MODEL_BUCKET}/models/sd-vae-ft-mse/'

# Store instance images as train images and class images as test images
IMAGES_BUCKET = "sagemaker-user-io"
instance_input_path = f's3://{IMAGES_BUCKET}/{USER_ID}/uploads'
# The class images have to depict the same class that `class_prompt` names, so
# the prefix is derived from USER_GENDER instead of being pinned to "man".
# NOTE(review): assumes a class_images/<gender> prefix exists in the bucket for
# every supported USER_GENDER value - confirm before running.
class_input_path = f's3://{IMAGES_BUCKET}/class_images/{USER_GENDER}'

# hyperparameters which are passed to the training job
hyperparameters = {
    #"pretrained_depth_model_path": PRETRAINED_DEPTH_MODEL_PATH, -> Does not work out of the box.
    #"pretrained_vae_path": PRETRAINED_VAE_MODEL_PATH,  -> Probably best to simply use Huggingface
    # Instance and class prompt share USER_GENDER; only the rare token differs.
    "instance_prompt": f"'Photo of qonvhs {USER_GENDER}'",
    "class_prompt": f"'Photo of {USER_GENDER}'",
    "with_prior_preservation": True,
    "prior_loss_weight": 1.0,
    "num_class_images": 100,
    "class_batch_size": 4,
    "seed": 1337,
    "train_text_encoder": True,
    "train_batch_size": 1,
    "max_train_steps": 2000,
    "learning_rate": 1e-6,
    "lr_scheduler": "constant",
    # NOTE(review): the upstream diffusers DreamBooth script spells this option
    # `lr_warmup_steps`; confirm that train.py accepts the singular key used here.
    "lr_warmup_step": 0,
    "gradient_accumulation_steps": 1,
    "mixed_precision": "fp16",
    "use_8bit_adam": True,
    "gradient_checkpointing": True,
}

# configuration for running training on smdistributed Data Parallel
#distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
# smdistributed only supported for instance types:('ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p4de.24xlarge', 'local_gpu')
#distribution = {
#    "mpi": {
#        "enabled": True
#    }
#}
# https://www.youtube.com/watch?v=OoNDw-Mcn0Q

# instance configurations
instance_type = 'ml.g4dn.4xlarge'
instance_count = 1
# Volume size is in GB the storage needed for training data -> Not a lot for us, default is enough
#volume_size=200

# create the Estimator
huggingface_estimator = HuggingFace(
    base_job_name=USER_ID,          # training job names are prefixed with the user id
    role=role,
    entry_point='train.py',
    source_dir="./train",
    instance_type=instance_type,
    instance_count=instance_count,
    transformers_version='4.17',
    pytorch_version='1.10',
    py_version='py38',
    hyperparameters=hyperparameters,
    sagemaker_session=sess, #! very important
    output_path=model_output_path,
    #distribution= distribution,
)

<sagemaker.huggingface.estimator.HuggingFace object at 0x174919db0>


In [17]:
# Map each input channel name to the S3 URI holding its images.
data = dict(
    train=instance_input_path,
    test=class_input_path,
)

# Launch the training job on those channels and block until it finishes.
huggingface_estimator.fit(inputs=data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: testing-test-2022-12-28-21-27-24-901


2022-12-28 21:27:25 Starting - Starting the training job...
2022-12-28 21:27:39 Starting - Preparing the instances for training......
2022-12-28 21:28:40 Downloading - Downloading input data...
2022-12-28 21:29:00 Training - Downloading the training image........................
2022-12-28 21:33:21 Training - Training image download completed. Training in progress........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-12-28 21:34:31,842 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-12-28 21:34:31,865 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-12-28 21:34:31,867 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-12-28 21:34:32,048 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/pyth

## Status of training job

In [None]:
# The original cell only referenced the bound method without calling it, so it
# never queried anything. Look up the job most recently started by the
# estimator and display its current status.
latest_job_name = huggingface_estimator.latest_training_job.name
sess.describe_training_job(latest_job_name)["TrainingJobStatus"]