In [1]:
from datetime import datetime
from os import environ
import sagemaker
from sagemaker.estimator import Estimator

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/idan/Library/Application Support/sagemaker/config.yaml


## Build and push image

Update the environment variables in the shell scripts to match your project.

In [2]:
# Data center to target; selects the S3 bucket and AWS region used below.
data_center = "us"

# Supported data centers: name -> (S3 bucket, AWS region).
_DATA_CENTER_SETTINGS = {
    "us": ("quicklizard", "us-east-1"),
    "eu": ("quicklizard-eu-central", "eu-central-1"),
}

# Fail fast with a clear message on an unsupported value instead of leaving
# `bucket` / `aws_region` undefined for the cells that follow.
if data_center not in _DATA_CENTER_SETTINGS:
    raise ValueError(
        f"Unsupported data_center {data_center!r}; "
        f"expected one of {sorted(_DATA_CENTER_SETTINGS)}"
    )
bucket, aws_region = _DATA_CENTER_SETTINGS[data_center]

# Docker image tag and project name (also passed to the build cell).
# img_tag = "qa-1.0"
img_tag = "0.1.0"
proj_name = "elasticity"

# Export the region through the environment for this kernel process and
# anything it spawns.
environ["AWS_REGION"] = aws_region
environ["AWS_DEFAULT_REGION"] = aws_region

# aws ecr create-repository --repository-name $PROJ_NAME --region $AWS_REGION

In [None]:
%%sh -s $img_tag $proj_name
# The magic line above passes the notebook's img_tag and proj_name to the
# shell as positional parameters: $1 = image tag, $2 = project name.
tag="$1"
proj="$2"
make TAG="$tag" PROJ="$proj"

# chmod 755 containerize_m1.sh
# ./containerize_m1.sh "$1"


Building docker image elasticity:0.1.0 for US data center
DOCKER_BUILDKIT=1 docker build --platform linux/amd64 --ssh default -t 444642203706.dkr.ecr.us-east-1.amazonaws.com/elasticity:0.1.0 .


#0 building with "desktop-linux" instance using docker driver

#1 [internal] load build definition from Dockerfile
#1 transferring dockerfile: 649B done
#1 DONE 0.0s

#2 [internal] load metadata for docker.io/library/python:3.11.6-slim-bookworm
#2 ...

#3 [auth] library/python:pull token for registry-1.docker.io
#3 DONE 0.0s

#2 [internal] load metadata for docker.io/library/python:3.11.6-slim-bookworm
#2 DONE 1.8s

#4 [internal] load .dockerignore
#4 transferring context: 300B done
#4 DONE 0.0s

#5 [internal] load build context
#5 transferring context: 143.96kB done
#5 DONE 0.0s

#6 [1/8] FROM docker.io/library/python:3.11.6-slim-bookworm@sha256:cc758519481092eb5a4a5ab0c1b303e288880d59afc601958d19e95b300bc86b
#6 resolve docker.io/library/python:3.11.6-slim-bookworm@sha256:cc758519481092eb5a4a5ab0c1b303e288880d59afc601958d19e95b300bc86b done
#6 sha256:cc758519481092eb5a4a5ab0c1b303e288880d59afc601958d19e95b300bc86b 1.65kB / 1.65kB done
#6 sha256:38a28170d13a276d42b7dd55cae54440b581d773


Pushing image elasticity:0.1.0 to AWS ECR (US)
Login Succeeded
docker push 444642203706.dkr.ecr.us-east-1.amazonaws.com/elasticity:0.1.0
The push refers to repository [444642203706.dkr.ecr.us-east-1.amazonaws.com/elasticity]
2189357b71a5: Preparing
901953c7c8ca: Preparing
43ccdcb05d44: Preparing
2f4fd8898491: Preparing
bf9c3c10bb96: Preparing
5aa7f19ab2a0: Preparing
1f2542200b9c: Preparing
d8815e8a268d: Preparing
8655910e6b5f: Preparing
355bb094feb8: Preparing
ed123c9f1a56: Preparing
92770f546e06: Preparing
1f2542200b9c: Waiting
d8815e8a268d: Waiting
8655910e6b5f: Waiting
355bb094feb8: Waiting
ed123c9f1a56: Waiting
92770f546e06: Waiting
5aa7f19ab2a0: Waiting
2189357b71a5: Pushed
2f4fd8898491: Pushed
bf9c3c10bb96: Pushed
5aa7f19ab2a0: Pushed
1f2542200b9c: Pushed
d8815e8a268d: Pushed
8655910e6b5f: Pushed
ed123c9f1a56: Pushed
355bb094feb8: Pushed
92770f546e06: Pushed
43ccdcb05d44: Pushed


## Prepare SageMaker params

In [5]:
# IAM role passed to the estimators below, read from the SGMKR_ROLE
# environment variable; this is None when the variable is not set.
role = environ.get("SGMKR_ROLE", None)
# role = sagemaker.get_execution_role()

# S3 prefix handed to the estimators as `output_path`.
s3_output_dir = "s3://{}/data_science/sagemaker/job_artifacts/{}".format(bucket, proj_name)

In [11]:
def get_sess_params(is_local: bool):
    """Create a SageMaker session and the settings that go with it.

    Args:
        is_local: When True, build a ``sagemaker.LocalSession`` and use the
            ``"local"`` instance type for both training and deployment;
            otherwise build a regular ``sagemaker.Session`` with ML instance
            types.

    Returns:
        A 5-tuple ``(sess, train_instance, deploy_instance, account_id,
        region)``. ``account_id`` comes from an STS ``get_caller_identity``
        call and ``region`` from the session's boto session.
    """
    if is_local:
        session_factory = sagemaker.LocalSession
        train_instance = deploy_instance = "local"
    else:
        session_factory = sagemaker.Session
        train_instance, deploy_instance = "ml.m5.4xlarge", "ml.m5.2xlarge"

    # Both session flavours use the notebook-level `bucket` as default bucket.
    sess = session_factory(default_bucket=bucket)
    boto_sess = sess.boto_session
    caller_identity = boto_sess.client('sts').get_caller_identity()
    return (
        sess,
        train_instance,
        deploy_instance,
        caller_identity['Account'],
        boto_sess.region_name,
    )

## Run SageMaker job in local mode

In [None]:
# Job id is the project name plus a minute-resolution timestamp.
run_timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M")
job_id = f"{proj_name}-{run_timestamp}"

print(f"Preparing to run local-{job_id}:{img_tag}")

sess, train_instance, deploy_instance, account_id, region = get_sess_params(is_local=True)

estimator_local = Estimator(
    image_uri=f"{account_id}.dkr.ecr.{region}.amazonaws.com/{proj_name}:{img_tag}",
    # The Estimator keyword is `sagemaker_session`; under any other name the
    # session built above (with its default bucket) is not applied.
    sagemaker_session=sess,
    role=role,
    instance_type=train_instance,
    instance_count=1,
    output_path=s3_output_dir,
    # Passed through to the training container as job arguments.
    hyperparameters={
        "data_center": data_center,
        "config": "config_qa",
    }
)

estimator_local.fit(job_name=f"local-{job_id}")

## Run on EC2

In [15]:
# Job id is the project name, a run label and a minute-resolution timestamp.
run_timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M")
job_id = f"{proj_name}-complete-run-test-m5x12-{run_timestamp}"

print(f"Preparing to run {job_id}:{img_tag}")

sess, train_instance, deploy_instance, account_id, region = get_sess_params(is_local=False)

# NOTE(review): the name `estimator_local` is kept although this estimator runs
# on a managed instance, in case other cells reference it — consider renaming.
estimator_local = Estimator(
    image_uri=f"{account_id}.dkr.ecr.{region}.amazonaws.com/{proj_name}:{img_tag}",
    # The Estimator keyword is `sagemaker_session`; under any other name the
    # session built above (with its default bucket) is not applied.
    sagemaker_session=sess,
    role=role,
    use_spot_instances=True,
    # Both limits are in seconds (21600 s = 6 h).
    max_run=21600,
    max_wait=21600,
    instance_type=train_instance,
    volume_size=12,
    instance_count=1,
    output_path=s3_output_dir,
    # Passed through to the training container as job arguments.
    hyperparameters={
        "data_center": data_center,
        "config": "config"
    }
)

estimator_local.fit(job_name=f"{job_id}")

Preparing to run forecast-complete-run-test-m5x12-2023-12-26T14-02:2.1.0
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/idan/Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/idan/Library/Application Support/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: forecast-complete-run-test-m5x12-2023-12-26T14-02


2023-12-26 13:02:11 Starting - Starting the training job...
2023-12-26 13:02:26 Starting - Preparing the instances for training......
2023-12-26 13:03:35 Downloading - Downloading the training image.........
2023-12-26 13:04:56 Training - Training image download completed. Training in progress.....[34mImporting plotly failed. Interactive plots will not work.[0m
[34m[2023-12-26 13:05:42] [INFO    ]: Setting up runtime environment... (env_setup.py/run_setup)[0m
[34mReading arguments from /opt/ml/input/config (inside container)[0m
[34mJob CLI args:[0m
[34mconfig: config (<class 'str'>)[0m
[34mdata_center: eu (<class 'str'>)[0m
[34mlocal: False (<class 'bool'>)[0m
[34m[2023-12-26 13:05:42] [INFO    ]: Running on the EU Data Center (env_setup.py/run_setup)[0m
[34m[2023-12-26 13:05:42] [INFO    ]: [- S3 -] Getting file quicklizard-eu-central/data_science/sagemaker/config_files/forecast_2/config.yaml from S3... (s3_utils.py/get_s3_bytes_file)[0m
[34m[2023-12-26 13:05:42] [I