In [3]:
import boto3

# Resolve the AWS account and region from the active credentials; both are
# needed to build the fully qualified ECR image URI used by the later cells.
boto_session = boto3.Session()
account_id = boto_session.client('sts').get_caller_identity().get('Account')
region = boto_session.region_name
if region is None:
    # Without a region the URI would silently contain the text "None" and
    # would not point at a real ECR registry, so fail early with a clear message.
    raise ValueError(
        'No default AWS region is configured; set AWS_DEFAULT_REGION or run '
        '`aws configure` before building the image URI.'
    )

ecr_repository = 'asdi-hackathon-sagemaker-processing-container'
tag = ':latest'  # keeps the leading colon so it can be appended to the repository name directly
processing_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, ecr_repository + tag)

In [1]:
# Create ECR repository and push docker image
# Dockerfile in this directory hence '.'
!docker build -t $ecr_repository .
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com
!aws ecr create-repository --repository-name $ecr_repository
!docker tag {ecr_repository + tag} $processing_repository_uri
!docker push $processing_repository_uri

Sending build context to Docker daemon  120.8kB
Step 1/5 : FROM python:3.7-slim-buster
 ---> 15cc9ba5a32a
Step 2/5 : RUN pip3 install pandas scikit-learn numpy multiprocess boto3
 ---> Running in 88cd808f339a
Collecting pandas
  Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 90.3 MB/s eta 0:00:00
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.8/24.8 MB 63.5 MB/s eta 0:00:00
Collecting numpy
  Downloading numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.7/15.7 MB 23.4 MB/s eta 0:00:00
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 115.1/115.1 KB 1.8 MB/s eta 0:00:00
Collecting boto3
  Downloading 

In [1]:
!docker logout

Removing login credentials for https://index.docker.io/v1/


In [4]:
from sagemaker import get_execution_role
from sagemaker.processing import ScriptProcessor

# Resolve the IAM role for the job. Inside SageMaker the execution role is
# discovered automatically; elsewhere, assign a role with SageMaker permissions:
# role = 'role_name_with_sagemaker_permissions'
role = get_execution_role()

# Processor that runs a Python script inside the image referenced by
# processing_repository_uri, on a single instance.
script_processor = ScriptProcessor(
    image_uri=processing_repository_uri,
    command=['python3'],
    role=role,
    instance_count=1,
    instance_type="ml.t3.xlarge",
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Launch the job. No inputs are declared because processing.py is self-sufficient
# in that regard; the only declared output is read from /opt/ml/processing/output.
final_df_output = ProcessingOutput(output_name="final_df", source="/opt/ml/processing/output")
script_processor.run(code='processing.py', outputs=[final_df_output])

# Describe the most recent job tracked by the processor and show the result.
latest_job = script_processor.jobs[-1]
processing_job_description = latest_job.describe()
print(processing_job_description)