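## Notebook to create the SageMaker pipeline

Defines the pipeline parameters, assembles the steps (processing, training, evaluation, status check, wait, and a conditional step that receives the deployment step), and upserts the resulting `wildfire-project-pipeline`.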

In [1]:
import sagemaker
import boto3
import os

from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline import PipelineDefinitionConfig
from sagemaker import image_uris
from steps.processor import get_processor_step
from steps.evaluator import get_evaluator_step
from steps.trainer import get_trainer_step
from steps.lambda_step import get_lambda_step
from steps.waiter import get_wait_step
from steps.deployment import get_deployment_step
from steps.condition import get_conditional_step

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
def get_parameters() -> dict:
    """Create the runtime-overridable parameters of the pipeline.

    Returns:
        dict: Maps a snake_case lookup key to its SageMaker workflow
        parameter object (``ParameterString`` / ``ParameterInteger`` /
        ``ParameterFloat``), each carrying a default value. Insertion order
        is common, processing, training, evaluation.
    """
    return {
        # - Common --------------------------------------
        # The seed is declared as a string parameter, not an integer.
        "random_seed": ParameterString(name="RandomSeed", default_value="1"),
        # - Processing ----------------------------------
        "process_instance_count": ParameterInteger(
            name="ProcessingInstanceCount", default_value=1
        ),
        "process_instance_type": ParameterString(
            name="ProcessingInstanceType", default_value="ml.m5.large"
        ),
        # - Trainer -------------------------------------
        "tracking_server_arn": ParameterString(
            name="TrackingServerArn",
            default_value="arn:aws:sagemaker:eu-central-1:567821811420:mlflow-tracking-server/wildfire-mj",
        ),
        "train_instance_count": ParameterInteger(
            name="TrainInstanceCount", default_value=1
        ),
        "train_instance_type": ParameterString(
            name="TrainInstanceType", default_value="ml.p3.2xlarge"
        ),
        "train_epochs_num": ParameterInteger(
            name="NumberOfEpochs", default_value=10
        ),
        "train_batch_size": ParameterInteger(name="BatchSize", default_value=32),
        "train_learning_rate": ParameterFloat(
            name="LearningRate", default_value=0.1
        ),
        # - Evaluator ----------------------------------
        "evaluator_instance_count": ParameterInteger(
            name="EvaluatingInstanceCount", default_value=1
        ),
        "evaluator_instance_type": ParameterString(
            name="EvaluatingInstanceType", default_value="ml.m5.large"
        ),
        # - Deployment ---------------------------------
        # (no deployment-specific parameters are defined)
    }

## Pipeline creation function

In [3]:
def get_pipeline(
    session: sagemaker.Session,
    parameters: dict,
    constants: dict,
    sklearn_image_uri: str,
):
    """Assemble the SageMaker pipeline from the project's step factories.

    Args:
        session: SageMaker session attached to the returned pipeline, so that
            later calls on it (e.g. ``upsert``) use this session instead of a
            default one.
        parameters: Pipeline parameter objects keyed as produced by
            ``get_parameters`` (e.g. ``"process_instance_count"``,
            ``"train_instance_type"``, ``"random_seed"``). Every value is
            registered on the pipeline.
        constants: Fixed settings; the keys ``"project"``, ``"bucket_name"``
            and ``"region"`` are read here.
        sklearn_image_uri: Image URI handed to the processing step and, as
            ``evaluation_image_uri``, to the status-check, wait, deployment
            and conditional step factories.

    Returns:
        Pipeline: Named ``"<project>-pipeline"`` with the steps processor,
        trainer, evaluator, status check, wait and conditional. The
        deployment step is not listed as a top-level step; it is only passed
        to ``get_conditional_step``.
    """
    pipeline_def_config = PipelineDefinitionConfig(use_custom_job_prefix=True)

    project = constants["project"]
    bucket_name = constants["bucket_name"]
    region = constants["region"]

    # - Processing ----------------------------------
    processor_step = get_processor_step(
        project=project,
        bucket_name=bucket_name,
        process_instance_count=parameters["process_instance_count"],
        process_instance_type=parameters["process_instance_type"],
        sklearn_image_uri=sklearn_image_uri,
        region=region,
        seed=parameters["random_seed"],
    )

    # - Trainer -------------------------------------
    trainer_step = get_trainer_step(
        project=project,
        bucket_name=bucket_name,
        tracking_server_arn=parameters["tracking_server_arn"],
        train_instance_count=parameters["train_instance_count"],
        train_instance_type=parameters["train_instance_type"],
        region=region,
        epochs_num=parameters["train_epochs_num"],
        batch_size=parameters["train_batch_size"],
        learning_rate=parameters["train_learning_rate"],
        seed=parameters["random_seed"],
    )

    # - Evaluator ----------------------------------
    # NOTE(review): the evaluation image URI hard-codes eu-central-1 rather
    # than using constants["region"] - confirm this is intended.
    evaluator_step = get_evaluator_step(
        project=project,
        bucket_name=bucket_name,
        evaluator_instance_count=parameters["evaluator_instance_count"],
        evaluator_instance_type=parameters["evaluator_instance_type"],
        evaluation_image_uri='763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-inference:2.3.0-gpu-py311-cu121-ubuntu20.04-ec2',
        training_step=trainer_step,
        result_prefix='evaluation/result',
        region=region,
    )

    # - Deployment ---------------------------------
    suffix = "Initial"

    # Keyword arguments shared verbatim by the four deployment-related
    # step factories below.
    deployment_common = {
        "project": project,
        "bucket_name": bucket_name,
        "process_instance_count_param": parameters["process_instance_count"],
        "process_instance_type_param": parameters["process_instance_type"],
        "evaluation_image_uri": sklearn_image_uri,
        "region": region,
    }

    check_status_step = get_lambda_step(
        **deployment_common,
        lambda_check_function_arn='arn:aws:lambda:eu-central-1:567821811420:function:LambdaWildfireCheckStatus',
        step_suffix=suffix,
    )

    wait_step = get_wait_step(
        **deployment_common,
        evaluator_step=evaluator_step,
    )

    deployment_step = get_deployment_step(
        **deployment_common,
        model_prefix='models/last',
        model_filename='model',
        evaluator_step=evaluator_step,
    )

    conditional_step = get_conditional_step(
        **deployment_common,
        model_path='',
        deployment_step=deployment_step,
        wait_step=wait_step,
        model_package_arn='arn:aws:sagemaker:eu-central-1:567821811420:model-package/first-fire-mlflow-ee0049/1',
        condition_step_suffix=suffix,
    )

    # - Explicit ordering ---------------------------
    # processor -> trainer; (evaluator, trainer) -> status check -> wait
    # -> conditional.
    trainer_step.add_depends_on([processor_step])
    check_status_step.add_depends_on([evaluator_step, trainer_step])
    wait_step.add_depends_on([check_status_step])
    conditional_step.add_depends_on([wait_step])

    return Pipeline(
        name=f"{project}-pipeline",
        parameters=list(parameters.values()),
        pipeline_definition_config=pipeline_def_config,
        steps=[
            processor_step,
            trainer_step,
            evaluator_step,
            check_status_step,
            wait_step,
            conditional_step,
        ],
        # Previously the `session` argument was silently ignored and the
        # pipeline fell back to a default session.
        sagemaker_session=session,
    )

In [4]:
# Fixed settings shared by all steps (as opposed to the runtime-overridable
# pipeline parameters created below).
constants = {
    "region": "eu-central-1",
    "project": "wildfire-project",
    "bucket_name": "wildfires",
    "sklearn_image_uri_version": "1.2-1",
}

parameters = get_parameters()

# SageMaker session pinned to the project's region.
boto_session = boto3.Session(region_name=constants["region"])
session = sagemaker.Session(boto_session)

# Resolve the scikit-learn container image for the configured region/version.
sklearn_image_uri = image_uris.retrieve(
    framework="sklearn",
    region=constants["region"],
    version=constants["sklearn_image_uri_version"],
)

pipeline = get_pipeline(
    session=session,
    parameters=parameters,
    constants=constants,
    sklearn_image_uri=sklearn_image_uri,
)

# Create or update the pipeline definition; kept as the last expression so
# the service response is displayed as the cell output.
execution_role_arn = sagemaker.get_execution_role()
pipeline.upsert(role_arn=execution_role_arn)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


Starting get_lambda_step
Starting conditional_step


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


{'PipelineArn': 'arn:aws:sagemaker:eu-central-1:567821811420:pipeline/wildfire-project-pipeline',
 'ResponseMetadata': {'RequestId': '532c2c89-e8a4-40ee-a4ad-ce63cb593a5c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '532c2c89-e8a4-40ee-a4ad-ce63cb593a5c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '96',
   'date': 'Fri, 28 Jun 2024 06:17:38 GMT'},
  'RetryAttempts': 0}}