In [364]:
# Imports

import sagemaker
from sagemaker import image_uris
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.pytorch import PyTorch
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.lambda_step import LambdaStep
from sagemaker.lambda_helper import Lambda
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.workflow.functions import Join
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.quality_check_step import DataQualityCheckConfig
from sagemaker.workflow.monitor_batch_transform_step import MonitorBatchTransformStep
from sagemaker.workflow.steps import CreateModelStep



In [365]:
# General Pipeline Information

# One SageMaker session and execution role are shared by every step below.
session = sagemaker.Session()
sagemaker_role = sagemaker.get_execution_role()

pipeline_name = "cs401FinalBirdclefPipeline"

# Take the region from the session instead of hard-coding "us-east-1": the
# container image URIs resolved with `region` must come from the same region
# the jobs are launched in, otherwise the images cannot be pulled.
region = session.boto_region_name

In [366]:
# Step 1: Pre-Processing. Turns .ogg files into spectrograms

# Processor running the script with python3 on a single ml.t3.medium instance,
# using the PyTorch 2.0.0 / py310 training-scope image for the pipeline region.
spectro_processor = ScriptProcessor(
    image_uri=image_uris.retrieve(
        framework="pytorch",
        region=region,
        version="2.0.0",
        py_version="py310",
        instance_type="ml.t3.medium",
        image_scope="training",
    ),
    command=["python3"],
    role=sagemaker_role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="audio-processing",
    sagemaker_session=session,
)

# No inputs or outputs are declared on this step, so any S3 reads/writes
# presumably happen inside process_audio.py itself -- TODO confirm.
step_spectro = ProcessingStep(
    name="CreateSpectrograms",
    code="pipeline/process_audio.py",
    processor=spectro_processor,
)


In [367]:
# Step 2: Make manifests. Manifests link every input file to a class / label so the model can train

# Processor running the manifest script with python3 on one ml.t3.medium instance.
manifest_processor = ScriptProcessor(
    image_uri=image_uris.retrieve("pytorch", region, "2.0.0", "py310", instance_type="ml.t3.medium", image_scope="inference"),
    command=["python3"],
    role=sagemaker_role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="build-manifests",
    sagemaker_session=session,
)

step_manifest = ProcessingStep(
    name="BuildManifests",
    processor=manifest_processor,
    code="pipeline/manifest_build.py",
    job_arguments=[
        # The spectrogram prefix is passed as an S3 URI (it is also mounted
        # locally through the SPECS input below).
        "--specs-s3-prefix", "s3://cs401finalpipelineprocessingdata/data/audio_specs/",
        # Despite the flag name, this is the in-container path where the
        # TRAINCSV input below is mounted.
        "--train-csv-s3-uri", "/opt/ml/processing/input/train/train.csv",
        "--output-dir", "/opt/ml/processing/output"
    ],
    inputs=[
        ProcessingInput(
            source="s3://cs401finalpipelineprocessingdata/data/audio_specs/",
            destination="/opt/ml/processing/input/specs",
            input_name="SPECS",
        ),
        ProcessingInput(
            source="s3://cs401finalpipelineinput/train.csv",
            destination="/opt/ml/processing/input/train",
            input_name="TRAINCSV",
        ),
    ],
    outputs=[
        # Referenced by name ("Manifests") from the training step.
        ProcessingOutput(
            output_name="Manifests",
            source="/opt/ml/processing/output",
        ),
    ],
    # The spectrogram step declares no outputs, so there is no property
    # reference that would order the two steps. Without an explicit
    # dependency the pipeline may run them concurrently and build manifests
    # from a missing or partially written spectrogram prefix.
    depends_on=[step_spectro.name],
)

In [368]:
# Step 3: Train the model. Train the EfficientNet model using the spectrograms and manifests

# Hyperparameters handed to the training script. The manifest paths sit under
# /opt/ml/input/data/manifests, i.e. the local directory of the "manifests"
# input channel declared on the training step below.
train_hyperparameters = {
    "epochs": 20,
    "batch-size": 32,
    "learning-rate": 0.001,
    "accumulation-steps": 4,
    "train-manifest": "/opt/ml/input/data/manifests/train_manifest.csv",
    "val-manifest": "/opt/ml/input/data/manifests/val_manifest.csv",
}

pytorch_estimator = PyTorch(
    role=sagemaker_role,
    source_dir=".",
    entry_point="pipeline/train_script_2.0.py",
    dependencies=["pipeline/model_dependencies/requirements.txt"],
    framework_version="2.6.0",
    py_version="py312",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    hyperparameters=train_hyperparameters,
)

# S3 location of the "Manifests" output of the manifest step; referencing it
# makes the training step run after that step.
manifests_s3_uri = step_manifest.properties.ProcessingOutputConfig.Outputs["Manifests"].S3Output.S3Uri

step_train = TrainingStep(
    name="TrainSpectrogramModel",
    estimator=pytorch_estimator,
    inputs={
        "specs": "s3://cs401finalpipelineprocessingdata",
        "manifests": manifests_s3_uri,
    },
)

In [369]:
# Step 4: Package the model. Bundle the trained model into a tar together with
# the model_dependencies folder (inference.py and requirements.txt)

packaging_processor = ScriptProcessor(
    image_uri=image_uris.retrieve(
        framework="pytorch",
        region=region,
        version="2.6.0",
        py_version="py312",
        instance_type="ml.t3.medium",
        image_scope="inference",
    ),
    command=["python3"],
    role=sagemaker_role,
    instance_count=1,
    instance_type="ml.t3.medium",
)

# Input 1: the model artifact produced by the training step.
trained_model_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)

# Input 2: the local model_dependencies folder.
model_code_input = ProcessingInput(
    source="pipeline/model_dependencies/",
    destination="/opt/ml/processing/input/dependencies",
    input_name="model_code",
)

step_package = ProcessingStep(
    name="PackageModel",
    processor=packaging_processor,
    code="pipeline/package_model.py",
    inputs=[trained_model_input, model_code_input],
    outputs=[
        # Referenced by name ("PackagedModel") when the model is created.
        ProcessingOutput(
            output_name="PackagedModel",
            source="/opt/ml/processing/output",
        ),
    ],
)

In [370]:
# Step 5: Create the PyTorch model

# Full S3 location of the packaged archive: the packaging step's
# "PackagedModel" output prefix joined with the archive file name.
s3_uri = Join(
    on="/",
    values=[
        step_package.properties.ProcessingOutputConfig.Outputs["PackagedModel"].S3Output.S3Uri,
        "model.tar.gz",
    ],
)

pytorch_model = PyTorchModel(
    role=sagemaker_role,
    model_data=s3_uri,
    framework_version="2.6.0",
    py_version="py312",
    sagemaker_session=session,
)

# Creates the SageMaker model resource; its ModelName property is what the
# deployment Lambda step receives.
create_model_input = sagemaker.inputs.CreateModelInput(instance_type="ml.m5.large")

step_create_model = CreateModelStep(
    name="CreateBirdCLEFModelResource",
    model=pytorch_model,
    inputs=create_model_input,
)

In [371]:
# Step 6: Register the model.

# Registers the packaged PyTorch model in the "BirdclefModelPackageGroup"
# package group. The package is registered with status "Approved", so no
# manual approval is needed afterwards.
step_register = RegisterModel(
    name="RegisterBirdclefModel",
    model_package_group_name="BirdclefModelPackageGroup",
    model=pytorch_model,
    approval_status="Approved",
    # JSON in, JSON out.
    content_types=["application/json"],
    response_types=["application/json"],
    # Instance types declared for real-time inference and batch transform.
    inference_instances=["ml.m5.large"],
    transform_instances=["ml.m5.large"],
)

In [372]:
# Step 7: Deploy the model so it can be used in production.

lambda_function_name = "sagemaker-deploy-model-lambda"

# Lambda function built from the local deployer script; the handler string is
# "<module>.<function>" within that script.
deploy_model_lambda_function = Lambda(
    function_name=lambda_function_name,
    script="pipeline/lambda_endpoint_deployer.py",
    handler="lambda_endpoint_deployer.lambda_handler",
    execution_role_arn=sagemaker_role,
)

# Payload handed to the Lambda. model_name is resolved at run time from the
# model-creation step, which also orders this step after it.
lambda_step_inputs = dict(
    endpoint_name="birdclef-endpoint",
    model_name=step_create_model.properties.ModelName,
    instance_type="ml.m5.large",
)

lambda_step = LambdaStep(
    name="DeployModelLambdaStep",
    inputs=lambda_step_inputs,
    lambda_func=deploy_model_lambda_function,
)


In [373]:
# Create the Pipeline

# Every step of the workflow, listed in the order the stages are described above.
pipeline_steps = [
    step_spectro,
    step_manifest,
    step_train,
    step_package,
    step_create_model,
    step_register,
    lambda_step,
]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[],
    steps=pipeline_steps,
    sagemaker_session=session,
)

# Create the pipeline definition, or update it if it already exists. Left as
# the last expression so the service response is shown as the cell output.
pipeline.upsert(role_arn=sagemaker_role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:084375543672:pipeline/cs401FinalBirdclefPipeline',
 'ResponseMetadata': {'RequestId': '0e6a74ac-5451-4644-b9f1-5005decf5047',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0e6a74ac-5451-4644-b9f1-5005decf5047',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '94',
   'date': 'Thu, 24 Apr 2025 18:02:50 GMT'},
  'RetryAttempts': 0}}

In [363]:
# Run the Pipeline

# Keep the execution handle instead of discarding it, so later cells can
# follow the run through this object.
pipeline_execution = pipeline.start()

# Still the cell's last expression, so the execution is displayed as before.
pipeline_execution

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:084375543672:pipeline/cs401FinalBirdclefPipeline/execution/k0l3rxytv2ci', sagemaker_session=<sagemaker.session.Session object at 0x7f78316a9a00>)