In [105]:
# IMPORTS
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.model import ModelPackage
from sagemaker import get_execution_role, Session, image_uris
from sagemaker.workflow.functions import Join
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.parameters import ParameterString, ParameterInteger, ParameterFloat
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.model_step import ModelStep
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.monitor_batch_transform_step import MonitorBatchTransformStep

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import timm
import os
import boto3
from io import BytesIO
from io import StringIO
import warnings

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
warnings.filterwarnings('ignore')

In [106]:
# PIPELINE INFORMATION
# Shared SageMaker context reused by the step definitions in the cells below.

session = sagemaker.Session()

# Resolve the execution role once, against the session created above.
# `role` is passed to the processors and the estimator; `sagemaker_role` is the
# name used when the pipeline is upserted, so both names are kept as aliases.
role = get_execution_role(sagemaker_session=session)
sagemaker_role = role

pipeline_name = "cs401FinalBirdclefPipeline"

In [107]:
# PRE-PROCESSING STEP
# Runs audio_pipeline.py as a processing job over the clean audio stored in S3
# and exposes /opt/ml/processing/output/spectrograms as the "Spectrograms" output.

# Look the container image up in the session's own region instead of a
# hard-coded "us-east-1", so the notebook keeps working if it is run elsewhere.
spectro_image_uri = image_uris.retrieve(
    framework="pytorch",
    region=session.boto_region_name,
    version="2.0.0",
    py_version="py310",
    instance_type="ml.m5.large",
    image_scope="inference",
)

spectro_processor = ScriptProcessor(
    image_uri=spectro_image_uri,
    command=["python3"],
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    base_job_name="audio-processing",
    sagemaker_session=session,
)

step_spectro = ProcessingStep(
    name="CreateSpectrograms",
    processor=spectro_processor,
    code="audio_pipeline.py",
    inputs=[
        # Raw (cleaned) audio, mounted inside the container for the script to read.
        ProcessingInput(
            source="s3://cs401finalpipelineprocessingdata/data/clean_audio/",
            destination="/opt/ml/processing/input/audio",
        )
    ],
    outputs=[
        # Referenced by the manifest and training steps via
        # step_spectro.properties...Outputs["Spectrograms"].
        ProcessingOutput(
            output_name="Spectrograms",
            source="/opt/ml/processing/output/spectrograms",
        )
    ],
)

In [108]:
# MAKE MANIFEST STEP
# Runs manifest_build.py with the spectrograms from the previous step plus
# train.csv as inputs, and publishes a train manifest and a validation manifest.

# Look the container image up in the session's own region instead of a
# hard-coded "us-east-1", so the notebook keeps working if it is run elsewhere.
manifest_image_uri = image_uris.retrieve(
    framework="pytorch",
    region=session.boto_region_name,
    version="2.0.0",
    py_version="py310",
    instance_type="ml.m5.large",
    image_scope="inference",
)

manifest_processor = ScriptProcessor(
    image_uri=manifest_image_uri,
    command=["python3"],
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    base_job_name="build-manifests",
    sagemaker_session=session,
)

# S3 location of the spectrograms produced by the CreateSpectrograms step.
# Using the step property (rather than a literal URI) is what links the two steps.
spectrograms_s3_uri = (
    step_spectro.properties.ProcessingOutputConfig
    .Outputs["Spectrograms"]
    .S3Output.S3Uri
)

step_manifest = ProcessingStep(
    name="BuildManifests",
    processor=manifest_processor,
    code="manifest_build.py",
    inputs=[
        ProcessingInput(                                   # specs from step 1
            source=spectrograms_s3_uri,
            destination="/opt/ml/processing/input/specs",
            input_name="SPECS",
        ),
        ProcessingInput(                                   # train.csv
            source="s3://birdclef-2025/train.csv",
            destination="/opt/ml/processing/input/train",
            input_name="TRAINCSV",
        ),
    ],
    # NOTE(review): both `source` paths below end in ".csv". SageMaker documents a
    # processing output's local path as a directory whose contents get uploaded, so
    # confirm that manifest_build.py writes its files accordingly (script not shown here).
    outputs=[
        ProcessingOutput(
            output_name="TrainManifest",
            source="/opt/ml/processing/output/train_manifest.csv"
        ),
        ProcessingOutput(
            output_name="ValManifest",
            source="/opt/ml/processing/output/val_manifest.csv"
        ),
    ],
)

In [109]:
# TRAIN STEP
# Trains with train_script_2.0.py (uploaded from the current directory) on the
# spectrograms and manifests produced by the two processing steps.
pytorch_estimator = PyTorch(
    entry_point="train_script_2.0.py",
    source_dir=".",
    role=role,
    framework_version="2.0.0",
    py_version="py310",
    instance_type="ml.m5.large",
    instance_count=1,
    # Bind the estimator to the same session as the processors, so every step
    # of the pipeline is defined against one shared session.
    sagemaker_session=session,
    # Passed to the training script as command-line style hyperparameters.
    # NOTE(review): epochs=1 looks like a smoke-test value; confirm before a real run.
    hyperparameters={
        "epochs":           1,
        "batch-size":       32,
        "learning-rate":    0.001,
        "accumulation-steps": 4,
    },
)

# Output maps of the upstream processing steps; indexing them by output name
# yields the S3 URI each training channel should read from.
spectro_outputs = step_spectro.properties.ProcessingOutputConfig.Outputs
manifest_outputs = step_manifest.properties.ProcessingOutputConfig.Outputs

step_train = TrainingStep(
    name="TrainSpectrogramModel",
    estimator=pytorch_estimator,
    # Channel names are kept exactly as-is; the training script presumably looks
    # them up by name (script not shown here).
    inputs={
        "audio-specs": spectro_outputs["Spectrograms"].S3Output.S3Uri,
        "manifests":   manifest_outputs["TrainManifest"].S3Output.S3Uri,
        "validation":  manifest_outputs["ValManifest"].S3Output.S3Uri,
    },
)

In [110]:
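# # TRAIN STEP (DISABLED ALTERNATIVE) - runs train_script_2.0.py through a
# # PyTorchProcessor/ProcessingStep instead of a TrainingStep. Everything in this
# # cell is commented out and `pytorch_training_step` is not part of the pipeline.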
# pytorch_processor = PyTorchProcessor(
#     framework_version="1.13.1",
#     role=role,
#     py_version="py39",
#     instance_type="ml.m5.large",
#     instance_count=1,
#     base_job_name="birdclef-training-job",
#     volume_size_in_gb=100,
#     sagemaker_session=session,
#     env={"PYTHONUNBUFFERED": "1"}
# )


# s3_bucket = "cs401finalpipelineprocessingdata"
# train_manifest_uri = f"s3://{s3_bucket}/manifests/train_manifest.csv"
# val_manifest_uri = f"s3://{s3_bucket}/manifests/val_manifest.csv"
# output_uri = f"s3://{s3_bucket}/model-output/"

# pytorch_training_step = ProcessingStep(
#     name="BirdclefTraining",
#     processor=pytorch_processor,
#     code="train_script_2.0.py",
#     source_dir=".",
#     inputs=[
#         ProcessingInput(
#             source=f"s3://{s3_bucket}/data/audio_specs",
#             destination="/opt/ml/processing/data/audio_specs",
#             input_name="spectrograms"
#         ),
#         ProcessingInput(
#             source=train_manifest_uri,
#             destination="/opt/ml/processing/input/train",
#             input_name="train-data"
#         ),
#         ProcessingInput(
#             source=val_manifest_uri,
#             destination="/opt/ml/processing/input/validation",
#             input_name="val-data"
#         )
#     ],
#     outputs=[
#         ProcessingOutput(
#             output_name="model-output",
#             source="/opt/ml/processing/output",
#             destination=output_uri
#         )
#     ],
#     arguments=[
#         "--epochs", "1",
#         "--batch-size", "32",
#         "--learning-rate", "0.001",
#         "--accumulation-steps", "4"
#     ]
# )

In [111]:
# PACKAGE STEP
# Runs package_model.py on the model artifact produced by the training step and
# exposes /opt/ml/processing/output as the "RepackedModel" output.

# Look the container image up in the session's own region instead of a
# hard-coded "us-east-1", so the notebook keeps working if it is run elsewhere.
package_image_uri = image_uris.retrieve(
    framework="pytorch",
    region=session.boto_region_name,
    version="2.0.0",
    py_version="py310",
    instance_type="ml.m5.large",
    image_scope="inference",
)

package_processor = ScriptProcessor(
    image_uri=package_image_uri,
    command=["python3"],
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    base_job_name="package-model",
    sagemaker_session=session,
)

step_package = ProcessingStep(
    name="PackageModel",
    processor=package_processor,
    code="package_model.py",
    inputs=[
        # Model artifact location reported by the training step; referencing the
        # step property is what makes this step depend on TrainSpectrogramModel.
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/input/model",
            input_name="MODEL",
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="RepackedModel",
            source="/opt/ml/processing/output",
        )
    ],
)

In [112]:
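# # REGISTER MODEL STEP (DISABLED) - everything in this cell is commented out and
# # `register_step` is not part of the pipeline.
# # NOTE(review): before re-enabling, define `pipeline_session` (it is referenced
# # below but not created in the cells above), and check that framework_version
# # "2.6.0"/py312 here is intended given that training uses "2.0.0"/py310.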
# from sagemaker.pytorch import PyTorchModel
# model_package_group = "BirdclefSpectrogramGroup"

# model_data = Join(
#     on="/",
#     values=[
#         step_package.properties.ProcessingOutputConfig.Outputs["RepackedModel"].S3Output.S3Uri,
#         "model.tar.gz"
#     ],
# )

# # Create a model object that will be registered
# pytorch_model = PyTorchModel(
#     entry_point="inference.py",
#     model_data=model_data,
#     role=role,
#     framework_version="2.6.0",
#     py_version="py312",
#     sagemaker_session=pipeline_session,
#     code_location=f"s3://{session.default_bucket()}/model/code",  # disables repack
# )


# # Register model
# register_step = RegisterModel(
#     name="RegisterSpectrogramModel",
#     model=pytorch_model,
#     model_package_group_name=model_package_group,
#     content_types=["application/x-npy"],
#     response_types=["application/json"],
#     inference_instances=["ml.m5.large"],
#     transform_instances=["ml.m5.large"],
#     approval_status="Approved",
# )

In [113]:
# CREATE PIPELINE
# Assemble the four steps defined above into one pipeline and create/update its
# definition in SageMaker. The steps are linked to each other through the
# `.properties` references used in their inputs.
pipeline_steps = [step_spectro, step_manifest, step_train, step_package]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[],  # no runtime parameters are exposed
    steps=pipeline_steps,
    sagemaker_session=session,
)

# Last expression of the cell, so the service response is shown as the output.
pipeline.upsert(role_arn=sagemaker_role)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:084375543672:pipeline/cs401FinalBirdclefPipeline',
 'ResponseMetadata': {'RequestId': '5ea744d4-50ba-463e-8148-044e753aea4b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5ea744d4-50ba-463e-8148-044e753aea4b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '94',
   'date': 'Tue, 22 Apr 2025 20:42:09 GMT'},
  'RetryAttempts': 0}}

In [114]:
# START EXECUTION
# Keep the returned handle instead of discarding it, so the run can be inspected
# from later cells (e.g. execution.describe() or execution.list_steps()).
execution = pipeline.start()
execution

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:084375543672:pipeline/cs401FinalBirdclefPipeline/execution/k1dgzkuh5uk4', sagemaker_session=<sagemaker.session.Session object at 0x7f783184a5d0>)