In [283]:
# IMPORTS

import sagemaker
from sagemaker import image_uris
from sagemaker.lambda_helper import Lambda
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.transformer import Transformer
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.functions import Join
from sagemaker.workflow.lambda_step import LambdaStep
from sagemaker.workflow.monitor_batch_transform_step import MonitorBatchTransformStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.quality_check_step import DataQualityCheckConfig
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import ProcessingStep, TrainingStep



In [273]:
# PIPELINE INFORMATION

# Session handed to the processors, the model and the pipeline defined below.
session = sagemaker.Session()
# Execution role reused for every job, the Lambda helper and the pipeline upsert.
sagemaker_role = sagemaker.get_execution_role()

pipeline_name = "cs401FinalBirdclefPipeline"
# Take the region from the session instead of hard-coding it, so the container
# image URIs resolved through image_uris.retrieve() always match the region
# the session (and therefore the pipeline) actually runs in.
region = session.boto_region_name

In [274]:
# PRE-PROCESSING STEP - WORKS

# Resolve the container image up front so the processor definition stays short.
spectro_image_uri = image_uris.retrieve(
    framework="pytorch",
    region=region,
    version="2.0.0",
    py_version="py310",
    instance_type="ml.t3.medium",
    image_scope="training",
)

# Processor that runs the script with an explicit python3 invocation.
spectro_processor = ScriptProcessor(
    image_uri=spectro_image_uri,
    command=["python3"],
    role=sagemaker_role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="audio-processing",
    sagemaker_session=session,
)

# Pipeline step wrapping audio_pipeline.py; no inputs or outputs are declared here.
step_spectro = ProcessingStep(
    name="CreateSpectrograms",
    code="audio_pipeline.py",
    processor=spectro_processor,
)


In [275]:
# MAKE MANIFEST STEP

# S3 prefix holding the spectrograms. It is passed to the script as an argument
# and also mounted into the container as the SPECS input.
specs_s3_prefix = "s3://cs401finalpipelineprocessingdata/data/audio_specs/"

manifest_processor = ScriptProcessor(
    image_uri=image_uris.retrieve("pytorch", region, "2.0.0", "py310", instance_type="ml.t3.medium", image_scope="inference"),
    command=["python3"],
    role=sagemaker_role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="build-manifests",
    sagemaker_session=session,
)

step_manifest = ProcessingStep(
    name="BuildManifests",
    processor=manifest_processor,
    code="manifest_build.py",
    job_arguments=[
        "--specs-s3-prefix", specs_s3_prefix,
        # Despite the flag name, this is the local path where TRAINCSV is mounted.
        "--train-csv-s3-uri", "/opt/ml/processing/input/train/train.csv",
        "--output-dir", "/opt/ml/processing/output",
    ],
    inputs=[
        ProcessingInput(
            source=specs_s3_prefix,
            destination="/opt/ml/processing/input/specs",
            input_name="SPECS",
        ),
        ProcessingInput(
            source="s3://cs401finalpipelineinput/train.csv",
            destination="/opt/ml/processing/input/train",
            input_name="TRAINCSV",
        ),
    ],
    outputs=[
        # Referenced downstream as Outputs["Manifests"].
        ProcessingOutput(
            output_name="Manifests",
            source="/opt/ml/processing/output",
        ),
    ],
    # The spectrogram prefix is referenced by a literal S3 URI, so the pipeline
    # cannot infer any ordering from step properties. Declare the dependency
    # explicitly so this step never starts before CreateSpectrograms finishes.
    # NOTE(review): presumably audio_pipeline.py writes the spectrograms under
    # specs_s3_prefix — confirm against that script.
    depends_on=[step_spectro],
)

In [276]:
# TRAIN STEP

# Hyperparameters forwarded to the training script. The two manifest paths sit
# under a directory named after the "manifests" input channel declared below.
train_hyperparameters = {
    "epochs": 1,
    "batch-size": 32,
    "learning-rate": 0.001,
    "accumulation-steps": 4,
    "train-manifest": "/opt/ml/input/data/manifests/train_manifest.csv",
    "val-manifest": "/opt/ml/input/data/manifests/val_manifest.csv",
}

pytorch_estimator = PyTorch(
    entry_point="train_script_2.0.py",
    source_dir=".",
    dependencies=["model_dependencies/requirements.txt"],
    role=sagemaker_role,
    framework_version="2.6.0",
    py_version="py312",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    hyperparameters=train_hyperparameters,
)

# Input channels: the spectrograms come from a fixed S3 prefix, while the
# manifests are taken from the BuildManifests step output (which also makes
# this step run after that one).
train_inputs = {
    "specs": "s3://cs401finalpipelineprocessingdata/data/audio_specs/",
    "manifests": step_manifest.properties.ProcessingOutputConfig.Outputs["Manifests"].S3Output.S3Uri,
}

step_train = TrainingStep(
    name="TrainSpectrogramModel",
    estimator=pytorch_estimator,
    inputs=train_inputs,
)

In [278]:
# PACKAGE MODEL STEP

# Container image for the packaging job (PyTorch 2.6.0 / py312 inference image).
packaging_image_uri = image_uris.retrieve(
    framework="pytorch",
    region=region,
    version="2.6.0",
    py_version="py312",
    instance_type="ml.t3.medium",
    image_scope="inference",
)

packaging_processor = ScriptProcessor(
    image_uri=packaging_image_uri,
    command=["python3"],
    role=sagemaker_role,
    instance_count=1,
    instance_type="ml.t3.medium",
)

# Trained artifacts produced by the training step.
trained_model_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)

# Local model_dependencies/ directory made available to the packaging script.
model_code_input = ProcessingInput(
    input_name="model_code",
    source="model_dependencies/",
    destination="/opt/ml/processing/input/dependencies",
)

# Whatever package_model.py writes to /opt/ml/processing/output is exposed as
# the "PackagedModel" output.
packaged_model_output = ProcessingOutput(
    output_name="PackagedModel",
    source="/opt/ml/processing/output",
)

step_package = ProcessingStep(
    name="PackageModel",
    code="package_model.py",
    processor=packaging_processor,
    inputs=[trained_model_input, model_code_input],
    outputs=[packaged_model_output],
)

In [279]:
# REGISTER MODEL STEP AND CREATE MODEL STEP

# Location of the packaged archive: "<PackagedModel output URI>/model.tar.gz",
# assembled with Join because the output URI is a pipeline property.
packaged_model_prefix = step_package.properties.ProcessingOutputConfig.Outputs["PackagedModel"].S3Output.S3Uri
s3_uri = Join(on="/", values=[packaged_model_prefix, "model.tar.gz"])

# Model object pointing at the packaged archive.
pytorch_model = PyTorchModel(
    model_data=s3_uri,
    framework_version="2.6.0",
    py_version="py312",
    role=sagemaker_role,
    sagemaker_session=session,
)

# Register the model in the package group with status "Approved", accepting
# and returning JSON on ml.m5.large for both inference and transform.
step_register = RegisterModel(
    name="RegisterBirdclefModel",
    model=pytorch_model,
    model_package_group_name="BirdclefModelPackageGroup",
    approval_status="Approved",
    content_types=["application/json"],
    response_types=["application/json"],
    inference_instances=["ml.m5.large"],
    transform_instances=["ml.m5.large"],
)


In [280]:
# DEPLOY MODEL STEP

lambda_function_name = "sagemaker-deploy-model-lambda"

# Lambda helper built from lambda_endpoint_deployer.py; the handler is the
# lambda_handler function in that script.
deploy_model_lambda_function = Lambda(
    function_name=lambda_function_name,
    script="lambda_endpoint_deployer.py",
    handler="lambda_endpoint_deployer.lambda_handler",
    execution_role_arn=sagemaker_role,
)

# Values handed to the Lambda step: target endpoint name, the packaged model
# location taken from pytorch_model, and the instance type.
lambda_step_inputs = {
    "endpoint_name": "birdclef-endpoint",
    "model_data": pytorch_model.model_data,
    "instance_type": "ml.m5.large",
}

lambda_step = LambdaStep(
    name="DeployModelLambdaStep",
    inputs=lambda_step_inputs,
    lambda_func=deploy_model_lambda_function,
)


In [None]:
# DATA MONITORING STEP (To detect usage / data drift)

# NOTE(review): this monitor object is not referenced by the monitoring step
# below; it is kept so that any other cell relying on the name keeps working.
monitor = DefaultModelMonitor(
    role=sagemaker_role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600
)

# Configure job to check for data quality
job_config = CheckJobConfig(
    role=sagemaker_role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600
)

# The manifests written by the BuildManifests step are both the dataset that is
# checked and the input of the batch transform. A TrainingStep exposes
# training-job properties only (there is no ProcessingInputConfig on it), so
# the URI is read from the processing step that produced the manifests.
manifests_s3_uri = step_manifest.properties.ProcessingOutputConfig.Outputs["Manifests"].S3Output.S3Uri

# Define data quality configuration
data_quality_config = DataQualityCheckConfig(
    baseline_dataset=manifests_s3_uri,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri="s3://cs401finalpipelineprocessingdata/monitoring/output"
)

# With a PipelineSession, transformer.transform() returns the step arguments
# MonitorBatchTransformStep expects instead of starting a transform job as
# soon as this cell runs.
pipeline_session = PipelineSession()

# NOTE(review): "birdclef-model" is not created by any step shown in this
# notebook — confirm the model exists before running this step.
transformer = Transformer(
    model_name="birdclef-model",
    instance_count=1,
    instance_type="ml.m5.large",
    output_path="s3://cs401finalpipelineprocessingdata/transform/output",
    sagemaker_session=pipeline_session
)

# Configure transform arguments
transform_args = transformer.transform(
    data=manifests_s3_uri,
    content_type="text/csv",
    split_type="Line"
)

# Create the monitoring step
monitor_step = MonitorBatchTransformStep(
    name="BirdclefModelMonitoring",
    transform_step_args=transform_args,
    monitor_configuration=data_quality_config,
    check_job_configuration=job_config,
    monitor_before_transform=True,  # Monitor data before processing
    fail_on_violation=False,        # Continue even if violations detected
)





In [281]:


# CREATE PIPELINE

# Steps 1-4 have each been confirmed to work.
pipeline_steps = [
    step_spectro,   # 1. works
    step_manifest,  # 2. works
    step_train,     # 3. works
    step_package,   # 4. works
    step_register,
    lambda_step,
]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[],
    steps=pipeline_steps,
    sagemaker_session=session,
)

# Create or update the pipeline definition; the response is the cell output.
pipeline.upsert(role_arn=sagemaker_role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:084375543672:pipeline/cs401FinalBirdclefPipeline',
 'ResponseMetadata': {'RequestId': 'b5a3c8dc-0e5a-4944-af1c-bb08531c7223',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b5a3c8dc-0e5a-4944-af1c-bb08531c7223',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '94',
   'date': 'Thu, 24 Apr 2025 01:13:12 GMT'},
  'RetryAttempts': 0}}

In [282]:
# Start an execution of the upserted pipeline; the returned execution handle
# is displayed as the cell output.
pipeline.start()

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:084375543672:pipeline/cs401FinalBirdclefPipeline/execution/fs8nsk72hqsu', sagemaker_session=<sagemaker.session.Session object at 0x7f78494ad010>)