# モジュールインポート

In [1]:
import os

import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.properties import PropertyFile
from sklearn.datasets import make_classification

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# 各種設定

## セッションとロールの設定

In [2]:
# Session, execution role, and the values derived from them.
# The plain Session is used for immediate calls in this notebook (the sample
# data upload); its region and default bucket are reused by later cells.
sagemaker_session = Session()
role = get_execution_role()
default_bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Separate PipelineSession, passed to the processors, the estimator and the
# Pipeline object defined further down.
pipeline_session = PipelineSession()

## パラメータの定義

In [3]:
# Pipeline parameters and their default values.
# Instance type used by the preprocessing and evaluation processors.
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge",
)
# Instance type used by the training estimator.
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)
# Registered on the pipeline, but no step visible in this notebook reads it.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

## S3パスの設定

In [4]:
# S3 locations for the pipeline, all under the session's default bucket:
# the input CSV object and the prefix that receives the step outputs.
pipeline_s3_root = f's3://{default_bucket}/lightgbm-pipeline'
input_data_uri = f'{pipeline_s3_root}/input/data.csv'
output_data_uri = f'{pipeline_s3_root}/output'

## サンプルデータの作成

In [5]:
# Create a synthetic binary-classification dataset. The seed is fixed, so the
# same data is generated on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)
# Feature columns keep their default integer labels; the label column is
# appended as 'target'.
df = pd.DataFrame(X).assign(target=y)

# Write the dataset to a local CSV, then upload it to the default bucket under
# the pipeline's input prefix. The returned S3 URI is the cell's output.
os.makedirs('data', exist_ok=True)
df.to_csv('data/data.csv', index=False)
sagemaker_session.upload_data(path='data/data.csv', bucket=default_bucket, key_prefix='lightgbm-pipeline/input')

's3://sagemaker-ap-northeast-1-<account-id>/lightgbm-pipeline/input/data.csv'

# パイプラインの作成

## Preprocess

In [29]:
# Preprocessing step.
# The processor runs the script with python3 on the scikit-learn 0.23-1 image
# resolved for the session's region, and is bound to the PipelineSession.
script_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve(
        framework='sklearn',
        region=region,
        version='0.23-1',
    ),
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-preprocessing',
    role=role,
    sagemaker_session=pipeline_session,
)

# Build the step through `step_args=processor.run(...)`: passing
# `processor=`/`code=`/`job_arguments=` directly to ProcessingStep is the
# deprecated construction path. Because the processor uses a PipelineSession,
# run() yields the step arguments rather than launching a job.
preprocessing_step_args = script_processor.run(
    code='preprocessing.py',
    # No ProcessingInput is mounted; the script receives the bucket and key as
    # command-line arguments instead (presumably it fetches the object
    # itself -- confirm in preprocessing.py).
    inputs=[],
    outputs=[
        ProcessingOutput(
            output_name='train_data',
            source='/opt/ml/processing/train',
            destination=f'{output_data_uri}/train',
        ),
        ProcessingOutput(
            output_name='validation_data',
            source='/opt/ml/processing/validation',
            destination=f'{output_data_uri}/validation',
        ),
    ],
    arguments=[
        "--input-bucket", default_bucket,
        "--input-key", "lightgbm-pipeline/input/data.csv",
    ],
)

processing_step = ProcessingStep(
    name='Preprocessing',
    step_args=preprocessing_step_args,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


## Train

In [30]:
# Training step.
# The SKLearn estimator runs `train.py` on the 0.23-1 framework image and is
# bound to the PipelineSession.
sklearn_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    framework_version='0.23-1',
    base_job_name='lightgbm-training',
    sagemaker_session=pipeline_session,
    hyperparameters={
        'n-trials': 50,  # number of Optuna trials
    },
)

# Build the step through `step_args=estimator.fit(...)`: passing `estimator=`
# and `inputs=` directly to TrainingStep is the deprecated construction path.
# Because the estimator uses a PipelineSession, fit() yields the step
# arguments rather than launching a training job.
# Both channels read the S3 outputs of the preprocessing step, which also
# makes this step depend on it.
preprocessing_outputs = processing_step.properties.ProcessingOutputConfig.Outputs
training_step_args = sklearn_estimator.fit(
    inputs={
        'train': sagemaker.inputs.TrainingInput(
            s3_data=preprocessing_outputs['train_data'].S3Output.S3Uri,
            content_type='text/csv',
        ),
        'validation': sagemaker.inputs.TrainingInput(
            s3_data=preprocessing_outputs['validation_data'].S3Output.S3Uri,
            content_type='text/csv',
        ),
    },
)

training_step = TrainingStep(
    name='Training',
    step_args=training_step_args,
)



## Evaluate

In [31]:
# Evaluation step.
# Reuses the preprocessing processor's image and runs `evaluate.py` with
# python3, bound to the PipelineSession.
evaluation_processor = ScriptProcessor(
    image_uri=script_processor.image_uri,
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-evaluation',
    role=role,
    sagemaker_session=pipeline_session,
)

# Property file describing `evaluation.json` inside the step's 'evaluation'
# output.
evaluation_report = PropertyFile(
    name='EvaluationReport',
    output_name='evaluation',
    path='evaluation.json',
)

# Build the step through `step_args=processor.run(...)`: passing
# `processor=`/`code=` directly to ProcessingStep is the deprecated
# construction path. Because the processor uses a PipelineSession, run()
# yields the step arguments rather than launching a job.
evaluation_step_args = evaluation_processor.run(
    code='evaluate.py',
    inputs=[
        # Model artifact produced by the training step.
        # NOTE(review): the SageMaker API reference states that a processing
        # input's LocalPath must begin with /opt/ml/processing/, which
        # '/opt/ml/model' does not. Confirm this runs as-is; if not, move it
        # to e.g. /opt/ml/processing/model and update evaluate.py to match.
        ProcessingInput(
            source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/model',
        ),
        # Validation split written by the preprocessing step.
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['validation_data'].S3Output.S3Uri,
            destination='/opt/ml/processing/validation',
        ),
    ],
    outputs=[
        # No explicit destination is given for this output.
        ProcessingOutput(
            output_name='evaluation',
            source='/opt/ml/processing/evaluation',
        ),
    ],
)

evaluation_step = ProcessingStep(
    name='Evaluation',
    step_args=evaluation_step_args,
    property_files=[evaluation_report],
)

## パイプラインの定義と実行

In [32]:
# Assemble the pipeline from the parameters and steps defined above.
# Only preprocessing, training and evaluation are included; no condition step
# is part of the pipeline.
pipeline_parameters = [
    processing_instance_type,
    training_instance_type,
    model_approval_status,
]
pipeline_steps = [processing_step, training_step, evaluation_step]

pipeline = Pipeline(
    name='LightGBM-Pipeline-ModelRegistration',
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=pipeline_session,
)

# Create or update the pipeline definition with the execution role, then
# start an execution and keep its handle.
pipeline.upsert(role_arn=role)
execution = pipeline.start()

