### Development Environment and Permissions

Based on https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb

In [73]:
import sagemaker.huggingface
from dotenv import load_dotenv
import os

# Load the variables defined in the .env file into the process environment.
# As the last expression of the cell, the call's return value is displayed as the cell output.
load_dotenv()

True

### Permissions


In [78]:
import sagemaker
import boto3

# AWS credentials and region are read from the process environment.
# Each value is None when the corresponding variable is not set.
ACCESS_KEY = os.getenv("AWS_SAGEMAKER_ACCESS_KEY")
SECRET_KEY = os.getenv("AWS_SAGEMAKER_SECRET_ACCESS_KEY")
REGION = os.getenv("AWS_BUCKET_REGION")

# IAM client, used below to look up the role ARN.
iam_client = boto3.client(
    "iam",
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

# Region-bound boto3 session that backs the SageMaker session.
boto_session = boto3.Session(
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    region_name=REGION,
)

# ARN of the IAM role named "sagemaker".
role = iam_client.get_role(RoleName="sagemaker")["Role"]["Arn"]

# SageMaker session built on top of the boto3 session above.
sess = sagemaker.Session(boto_session=boto_session)


In [89]:
from sagemaker.huggingface import HuggingFace
from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString

# Per-user settings (meant to be supplied by the user; fixed values here)
USER_ID = "ztvuzveD7nZRRFYCgc9gvnagTwr2"
USER_GENDER = "woman"

# S3 locations for the trained model and its checkpoints
MODEL_BUCKET = "sagemaker-user-training"
model_output_path = f"s3://{MODEL_BUCKET}/{USER_ID}/"
model_checkpoint_path = model_output_path + "checkpoints"

# S3 locations of the training inputs
IMAGES_BUCKET = "sagemaker-user-io"
instance_input_path = f"s3://{IMAGES_BUCKET}/{USER_ID}/uploads"
class_input_path = f"s3://{IMAGES_BUCKET}/class_images/{USER_GENDER}"

# Training instance configuration
instance_type = "ml.g4dn.4xlarge"
instance_count = 1

# Spot-training time limits, in seconds; the maximum wait is the run limit plus the tolerance
max_run_sec = 7200
tolerance_sec = 7200
max_wait_sec = max_run_sec + tolerance_sec

# Settings for the inference step (depth-mask images are its input)
inference_parameters = {
    # Bucket and folder where the inference results are stored
    "s3_results_storage_bucket": IMAGES_BUCKET,
    "s3_results_storage_key": f"{USER_ID}/inference",
    # Bucket and folder holding the depth masks fed to the inference pipeline
    "s3_depthmap_images_storage_bucket": IMAGES_BUCKET,
    "s3_depthmap_images_storage_key": "depthmask_images",
    # The prompts keep their embedded single quotes as part of the value
    "prompt": f"'Professional portrait photo of qonvhs {USER_GENDER} in formal clothing, perfect eyes, studio lighting, HDR, UHD, 4K, 8k, 64K, highly detailed, portrait, 40mm lens, shallow depth of field, close up, studio lighting'",
    "negative_prompt": "'blender, closed eyes, jewlery, chain,ugly, different eye color, multiple hands, bad anatomy, bad proportions, unrealistic, full body, cropped, lowres, poorly drawn face, out of frame, poorly drawn hands, double, blurred, disfigured, deformed, repetitive, black and white'",
    "guidance_scale": 7.5,
    "infer_steps": 70,
    "n_inferences": 15,
    "num_images_per_prompt": 1,
}

# Hyperparameters passed to the training job: the training settings first,
# followed by every inference setting (same keys, same order, same values).
hyperparameters = {
    "instance_prompt": f"'Photo of qonvhs {USER_GENDER}'",
    "class_prompt": f"'Photo of {USER_GENDER}'",
    # Checkpoint directory; automatically synced with the S3 bucket, handled by SageMaker
    "model_checkpoint_dir": "/opt/ml/checkpoints",
    # Store a checkpoint every N epochs
    # NOTE(review): possibly counted in training steps rather than epochs; confirm against train.py
    "checkpointing_interval": 500,
    "with_prior_preservation": True,
    "prior_loss_weight": 1.0,
    "num_class_images": 100,
    "class_batch_size": 4,
    "seed": 1337,
    "train_text_encoder": True,
    "train_batch_size": 1,
    "max_train_steps": 3000,
    "learning_rate": 1e-6,
    "lr_scheduler": "constant",
    "lr_warmup_step": 0,
    "gradient_accumulation_steps": 1,
    "mixed_precision": "no",
    "use_8bit_adam": True,
    "gradient_checkpointing": True,
    **inference_parameters,
}

In [90]:

# Build the Hugging Face estimator for the training job
huggingface_estimator = HuggingFace(
    # Job identity, permissions and session
    base_job_name=USER_ID,
    role=role,
    sagemaker_session=sess,
    # Training code
    entry_point="train.py",
    source_dir="./train_spot",
    hyperparameters=hyperparameters,
    # Compute resources
    instance_type=instance_type,
    instance_count=instance_count,
    # Framework versions
    transformers_version="4.17",
    pytorch_version="1.10",
    py_version="py38",
    # Where the job output is written
    output_path=model_output_path,
    # Spot training settings
    use_spot_instances=True,  # whether to use spot instances or not
    max_wait=max_wait_sec,  # max time including spot start + training time
    max_run=max_run_sec,  # max expected training time
    checkpoint_s3_uri=model_checkpoint_path,  # S3 URI where checkpoints are uploaded during training
)

In [91]:
# Input channels for the job: each channel name maps to the S3 URI it is read from
# ('train' points at the user's uploads, 'test' at the class images)
data = dict(train=instance_input_path, test=class_input_path)

# Start the training job with the S3 inputs above and wait for it
huggingface_estimator.fit(inputs=data, wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: ztvuzveD7nZRRFYCgc9gvnagTwr2-2022-12-30-19-44-47-695


2022-12-30 19:44:48 Starting - Starting the training job...
2022-12-30 19:45:04 Starting - Preparing the instances for training.........
2022-12-30 19:46:42 Downloading - Downloading input data...........................
2022-12-30 19:50:59 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-12-30 19:51:39,564 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-12-30 19:51:39,588 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-12-30 19:51:39,591 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-12-30 19:51:39,738 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.8 -m pip install -r requirements.txt[0m
[34mCollecting git