In [3]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.properties import PropertyFile
from sagemaker.model import Model
import os
import pandas as pd

# セッションとロールの設定
sagemaker_session = sagemaker.session.Session()
role = get_execution_role()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()

# パラメータの定義
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")

# データのS3パスの定義
input_data_uri = f's3://{default_bucket}/lightgbm-pipeline/input/data.csv'
output_data_uri = f's3://{default_bucket}/lightgbm-pipeline/output'

# サンプルデータの作成とS3へのアップロード
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=15, n_redundant=5, n_classes=2, random_state=42
)
df = pd.DataFrame(X)
df['target'] = y

# データをS3にアップロード
os.makedirs('data', exist_ok=True)
df.to_csv('data/data.csv', index=False)
sagemaker_session.upload_data(path='data/data.csv', bucket=default_bucket, key_prefix='lightgbm-pipeline/input')

# 前処理スクリプトの作成
processing_script = """
import argparse
import os
import pandas as pd
import boto3
import logging
from sklearn.model_selection import train_test_split
from io import StringIO

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logger.info("前処理を開始します。")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-bucket", type=str)
    parser.add_argument("--input-key", type=str)
    args = parser.parse_args()

    logger.info("S3からデータを読み込んでいます。")
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket=args.input_bucket, Key=args.input_key)
    df = pd.read_csv(StringIO(response['Body'].read().decode('utf-8')))

    logger.info(f"入力データの形状: {df.shape}")

    logger.info("データを訓練セットと検証セットに分割します。")
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    os.makedirs('/opt/ml/processing/train', exist_ok=True)
    os.makedirs('/opt/ml/processing/validation', exist_ok=True)

    logger.info("訓練データを保存します。")
    train_df.to_csv('/opt/ml/processing/train/train.csv', index=False)

    logger.info("検証データを保存します。")
    val_df.to_csv('/opt/ml/processing/validation/validation.csv', index=False)

    logger.info("前処理が完了しました。")
"""
with open('preprocessing.py', 'w') as f:
    f.write(processing_script)

# 前処理ステップの定義
script_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve(framework='sklearn', region=region, version='0.23-1'),
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-preprocessing',
    role=role,
    sagemaker_session=sagemaker_session
)

processing_step = ProcessingStep(
    name='Preprocessing',
    processor=script_processor,
    inputs=[],
    outputs=[
        ProcessingOutput(
            output_name='train_data',
            source='/opt/ml/processing/train',
            destination=f's3://{default_bucket}/lightgbm-pipeline/output/train'
        ),
        ProcessingOutput(
            output_name='validation_data',
            source='/opt/ml/processing/validation',
            destination=f's3://{default_bucket}/lightgbm-pipeline/output/validation'
        ),
    ],
    code='preprocessing.py',
    job_arguments=[
        "--input-bucket", default_bucket,
        "--input-key", "lightgbm-pipeline/input/data.csv"
    ]
)

# 学習スクリプトの作成
training_script = """
import argparse
import os
import subprocess
import sys

# 必要なパッケージをインストール
subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm", "optuna"])

import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import roc_auc_score

def objective(trial, X_train, y_train, X_val, y_val):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 10.0),
    }

    train_dataset = lgb.Dataset(X_train, label=y_train)
    val_dataset = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, train_dataset, valid_sets=[val_dataset], early_stopping_rounds=10, verbose_eval=False)
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    return auc

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--n-trials", type=int, default=20)
    args = parser.parse_args()

    print("訓練データを読み込んでいます。")
    train_df = pd.read_csv('/opt/ml/input/data/train/train.csv')

    print("検証データを読み込んでいます。")
    val_df = pd.read_csv('/opt/ml/input/data/validation/validation.csv')

    y_train = train_df.pop('target')
    X_train = train_df
    y_val = val_df.pop('target')
    X_val = val_df

    # Optunaによるハイパーパラメータ最適化
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=args.n_trials)

    print('Best trial:')
    trial = study.best_trial
    print('  AUC: {}'.format(trial.value))
    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # 最適なハイパーパラメータでモデルを再学習
    best_params = trial.params
    best_params['objective'] = 'binary'
    best_params['metric'] = 'auc'
    best_params['verbosity'] = -1
    best_params['boosting_type'] = 'gbdt'

    train_dataset = lgb.Dataset(X_train, label=y_train)
    val_dataset = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(best_params, train_dataset, valid_sets=[val_dataset], early_stopping_rounds=10)

    # モデルを保存
    model_dir = os.environ.get('SM_MODEL_DIR')
    model.save_model(os.path.join(model_dir, "model.txt"))
"""
with open('train.py', 'w') as f:
    f.write(training_script)

# 学習ステップの定義
sklearn_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    framework_version='0.23-1',
    base_job_name='lightgbm-training',
    sagemaker_session=sagemaker_session,
    hyperparameters={
        'n-trials': 50,  # Optunaの試行回数
    },
)

training_step = TrainingStep(
    name='Training',
    estimator=sklearn_estimator,
    inputs={
        'train': sagemaker.inputs.TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri,
            content_type='text/csv'
        ),
        'validation': sagemaker.inputs.TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs['validation_data'].S3Output.S3Uri,
            content_type='text/csv'
        ),
    },
)

# 評価スクリプトの作成
evaluation_script = """
import argparse
import os
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])

import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=str, default="/opt/ml/processing/evaluation")
    args = parser.parse_args()

    print("モデルを読み込んでいます。")
    model = lgb.Booster(model_file=os.path.join('/opt/ml/processing/model', 'model.txt'))

    print("検証データを読み込んでいます。")
    val_df = pd.read_csv('/opt/ml/processing/validation/validation.csv')
    y_val = val_df.pop('target')
    X_val = val_df

    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)

    report_dict = {
        "binary_classification_metrics": {
            "auc": {
                "value": auc
            }
        }
    }

    os.makedirs(args.output_dir, exist_ok=True)
    evaluation_path = os.path.join(args.output_dir, "evaluation.json")
    with open(evaluation_path, "w") as f:
        json.dump(report_dict, f)
"""
with open('evaluate.py', 'w') as f:
    f.write(evaluation_script)

# 評価ステップの定義
evaluation_processor = ScriptProcessor(
    image_uri=script_processor.image_uri,
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-evaluation',
    role=role,
    sagemaker_session=sagemaker_session
)

evaluation_report = PropertyFile(
    name='EvaluationReport',
    output_name='evaluation',
    path='evaluation.json',
)

evaluation_step = ProcessingStep(
    name='Evaluation',
    processor=evaluation_processor,
    inputs=[
        ProcessingInput(
            source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/model'
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['validation_data'].S3Output.S3Uri,
            destination='/opt/ml/processing/validation'
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name='evaluation',
            source='/opt/ml/processing/evaluation',
        ),
    ],
    code='evaluate.py',
    property_files=[evaluation_report],
)

# 推論スクリプトの作成
inference_script = """
import os
import sys
import numpy as np
import pandas as pd
import lightgbm as lgb
from io import StringIO

def model_fn(model_dir):
    print("モデルを読み込んでいます。")
    model = lgb.Booster(model_file=os.path.join(model_dir, 'model.txt'))
    return model

def input_fn(request_body, content_type):
    if content_type == 'text/csv':
        return pd.read_csv(StringIO(request_body), header=None)
    else:
        raise ValueError("Unsupported content type: {}".format(content_type))

def predict_fn(input_data, model):
    predictions = model.predict(input_data)
    return predictions

def output_fn(prediction, accept):
    if accept == 'text/csv':
        return ','.join(map(str, prediction.tolist()))
    else:
        raise ValueError("Unsupported accept type: {}".format(accept))
"""
with open('inference.py', 'w') as f:
    f.write(inference_script)

# モデルステップの定義（モデルの登録）
model = Model(
    image_uri=sklearn_estimator.training_image_uri(),
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    entry_point='inference.py',
    sagemaker_session=sagemaker_session,
)

model_step = ModelStep(
    name='RegisterModel',
    step_args=model.create(instance_type='ml.m5.large'),
)

# 条件ステップの定義
cond_gte = ConditionGreaterThanOrEqualTo(
    left=JsonGet(
        step_name=evaluation_step.name,
        property_file=evaluation_report,
        json_path="binary_classification_metrics.auc.value",
    ),
    right=0.8,
)

condition_step = ConditionStep(
    name='AUCConditionCheck',
    conditions=[cond_gte],
    if_steps=[model_step],
    else_steps=[],
)

# パイプラインの定義
pipeline = Pipeline(
    name='LightGBM-Pipeline-ModelRegistration',
    parameters=[
        processing_instance_type,
        training_instance_type,
        model_approval_status,
    ],
    steps=[processing_step, training_step, evaluation_step, condition_step],
    sagemaker_session=sagemaker_session,
)

# パイプラインの作成と実行
pipeline.upsert(role_arn=role)
execution = pipeline.start()


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


AttributeError: 'Properties' object has no attribute 'decode'

In [8]:
pip install --upgrade sagemaker
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.model_step import RegisterModel
from sagemaker.workflow.properties import PropertyFile
import os
import pandas as pd

# セッションとロールの設定
sagemaker_session = sagemaker.session.Session()
role = get_execution_role()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()

# パイプラインセッションの作成
pipeline_session = PipelineSession()

# パラメータの定義
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")

# データのS3パスの定義
input_data_uri = f's3://{default_bucket}/lightgbm-pipeline/input/data.csv'
output_data_uri = f's3://{default_bucket}/lightgbm-pipeline/output'

# サンプルデータの作成とS3へのアップロード
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=15, n_redundant=5, n_classes=2, random_state=42
)
df = pd.DataFrame(X)
df['target'] = y

# データをS3にアップロード
os.makedirs('data', exist_ok=True)
df.to_csv('data/data.csv', index=False)
sagemaker_session.upload_data(path='data/data.csv', bucket=default_bucket, key_prefix='lightgbm-pipeline/input')

# 前処理スクリプトの作成
processing_script = """
import argparse
import os
import pandas as pd
import boto3
import logging
from sklearn.model_selection import train_test_split
from io import StringIO

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logger.info("前処理を開始します。")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-bucket", type=str)
    parser.add_argument("--input-key", type=str)
    args = parser.parse_args()

    logger.info("S3からデータを読み込んでいます。")
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket=args.input_bucket, Key=args.input_key)
    df = pd.read_csv(StringIO(response['Body'].read().decode('utf-8')))

    logger.info(f"入力データの形状: {df.shape}")

    logger.info("データを訓練セットと検証セットに分割します。")
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    os.makedirs('/opt/ml/processing/train', exist_ok=True)
    os.makedirs('/opt/ml/processing/validation', exist_ok=True)

    logger.info("訓練データを保存します。")
    train_df.to_csv('/opt/ml/processing/train/train.csv', index=False)

    logger.info("検証データを保存します。")
    val_df.to_csv('/opt/ml/processing/validation/validation.csv', index=False)

    logger.info("前処理が完了しました。")
"""
with open('preprocessing.py', 'w') as f:
    f.write(processing_script)

# 前処理ステップの定義
script_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve(framework='sklearn', region=region, version='0.23-1'),
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-preprocessing',
    role=role,
    sagemaker_session=pipeline_session  # PipelineSessionを使用
)

processing_step = ProcessingStep(
    name='Preprocessing',
    processor=script_processor,
    inputs=[],
    outputs=[
        ProcessingOutput(
            output_name='train_data',
            source='/opt/ml/processing/train',
            destination=f's3://{default_bucket}/lightgbm-pipeline/output/train'
        ),
        ProcessingOutput(
            output_name='validation_data',
            source='/opt/ml/processing/validation',
            destination=f's3://{default_bucket}/lightgbm-pipeline/output/validation'
        ),
    ],
    code='preprocessing.py',
    job_arguments=[
        "--input-bucket", default_bucket,
        "--input-key", "lightgbm-pipeline/input/data.csv"
    ]
)

# 学習スクリプトの作成
training_script = """
import argparse
import os
import subprocess
import sys

# 必要なパッケージをインストール
subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm", "optuna"])

import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import roc_auc_score

def objective(trial, X_train, y_train, X_val, y_val):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 10.0),
    }

    train_dataset = lgb.Dataset(X_train, label=y_train)
    val_dataset = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, train_dataset, valid_sets=[val_dataset], early_stopping_rounds=10, verbose_eval=False)
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    return auc

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--n-trials", type=int, default=20)
    args = parser.parse_args()

    print("訓練データを読み込んでいます。")
    train_df = pd.read_csv('/opt/ml/input/data/train/train.csv')

    print("検証データを読み込んでいます。")
    val_df = pd.read_csv('/opt/ml/input/data/validation/validation.csv')

    y_train = train_df.pop('target')
    X_train = train_df
    y_val = val_df.pop('target')
    X_val = val_df

    # Optunaによるハイパーパラメータ最適化
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), n_trials=args.n_trials)

    print('Best trial:')
    trial = study.best_trial
    print('  AUC: {}'.format(trial.value))
    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # 最適なハイパーパラメータでモデルを再学習
    best_params = trial.params
    best_params['objective'] = 'binary'
    best_params['metric'] = 'auc'
    best_params['verbosity'] = -1
    best_params['boosting_type'] = 'gbdt'

    train_dataset = lgb.Dataset(X_train, label=y_train)
    val_dataset = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(best_params, train_dataset, valid_sets=[val_dataset], early_stopping_rounds=10)

    # モデルを保存
    model_dir = os.environ.get('SM_MODEL_DIR')
    model.save_model(os.path.join(model_dir, "model.txt"))
"""
with open('train.py', 'w') as f:
    f.write(training_script)

# 学習ステップの定義
sklearn_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    framework_version='0.23-1',
    base_job_name='lightgbm-training',
    sagemaker_session=pipeline_session,  # PipelineSessionを使用
    hyperparameters={
        'n-trials': 50,  # Optunaの試行回数
    },
)

training_step = TrainingStep(
    name='Training',
    estimator=sklearn_estimator,
    inputs={
        'train': sagemaker.inputs.TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri,
            content_type='text/csv'
        ),
        'validation': sagemaker.inputs.TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs['validation_data'].S3Output.S3Uri,
            content_type='text/csv'
        ),
    },
)

# 評価スクリプトの作成
evaluation_script = """
import argparse
import os
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])

import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=str, default="/opt/ml/processing/evaluation")
    args = parser.parse_args()

    print("モデルを読み込んでいます。")
    model = lgb.Booster(model_file=os.path.join('/opt/ml/processing/model', 'model.txt'))

    print("検証データを読み込んでいます。")
    val_df = pd.read_csv('/opt/ml/processing/validation/validation.csv')
    y_val = val_df.pop('target')
    X_val = val_df

    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)

    report_dict = {
        "binary_classification_metrics": {
            "auc": {
                "value": auc
            }
        }
    }

    os.makedirs(args.output_dir, exist_ok=True)
    evaluation_path = os.path.join(args.output_dir, "evaluation.json")
    with open(evaluation_path, "w") as f:
        json.dump(report_dict, f)
"""
with open('evaluate.py', 'w') as f:
    f.write(evaluation_script)

# 評価ステップの定義
evaluation_processor = ScriptProcessor(
    image_uri=script_processor.image_uri,
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-evaluation',
    role=role,
    sagemaker_session=pipeline_session  # PipelineSessionを使用
)

evaluation_report = PropertyFile(
    name='EvaluationReport',
    output_name='evaluation',
    path='evaluation.json',
)

evaluation_step = ProcessingStep(
    name='Evaluation',
    processor=evaluation_processor,
    inputs=[
        ProcessingInput(
            source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/model'
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['validation_data'].S3Output.S3Uri,
            destination='/opt/ml/processing/validation'
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name='evaluation',
            source='/opt/ml/processing/evaluation',
        ),
    ],
    code='evaluate.py',
    property_files=[evaluation_report],
)

# モデルパッケージグループ名の定義
model_package_group_name = 'LightGBMModelPackageGroup'

# モデル登録ステップの定義
register_model_step = RegisterModel(
    name='RegisterModel',
    estimator=sklearn_estimator,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=['text/csv'],
    response_types=['text/csv'],
    inference_instances=['ml.m5.large'],
    transform_instances=['ml.m5.large'],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics={
        "ModelQuality": {
            "Statistics": {
                "ContentType": "application/json",
                "S3Uri": evaluation_step.properties.ProcessingOutputConfig.Outputs["evaluation"].S3Output.S3Uri
            }
        }
    },
)

# 条件ステップの定義
cond_gte = ConditionGreaterThanOrEqualTo(
    left=JsonGet(
        step_name=evaluation_step.name,
        property_file=evaluation_report,
        json_path="binary_classification_metrics.auc.value",
    ),
    right=0.8,
)

condition_step = ConditionStep(
    name='AUCConditionCheck',
    conditions=[cond_gte],
    if_steps=[register_model_step],
    else_steps=[],
)

# パイプラインの定義
pipeline = Pipeline(
    name='LightGBM-Pipeline-ModelRegistration',
    parameters=[
        processing_instance_type,
        training_instance_type,
        model_approval_status,
    ],
    steps=[processing_step, training_step, evaluation_step, condition_step],
    sagemaker_session=pipeline_session,  # PipelineSessionを使用
)

# パイプラインの作成と実行
pipeline.upsert(role_arn=role)
execution = pipeline.start()


SyntaxError: invalid syntax (3792328313.py, line 1)

In [5]:
from sagemaker.workflow.model_step import ModelStep
from sagemaker.model import Model
from sagemaker.serverless import ServerlessInferenceConfig


In [6]:
pip install --upgrade sagemaker

Note: you may need to restart the kernel to use updated packages.


In [21]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.functions import JsonGet
# from sagemaker.workflow.model_step import RegisterModel
from sagemaker.workflow.properties import PropertyFile
import os
import pandas as pd

# セッションとロールの設定
sagemaker_session = sagemaker.session.Session()
role = get_execution_role()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()

# パイプラインセッションの作成
pipeline_session = PipelineSession()

# パラメータの定義
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")

# データのS3パスの定義
input_data_uri = f's3://{default_bucket}/lightgbm-pipeline/input/data.csv'
output_data_uri = f's3://{default_bucket}/lightgbm-pipeline/output'

# サンプルデータの作成とS3へのアップロード
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=15, n_redundant=5, n_classes=2, random_state=42
)
df = pd.DataFrame(X)
df['target'] = y

# データをS3にアップロード
os.makedirs('data', exist_ok=True)
df.to_csv('data/data.csv', index=False)
sagemaker_session.upload_data(path='data/data.csv', bucket=default_bucket, key_prefix='lightgbm-pipeline/input')

# 前処理スクリプトの作成
processing_script = """
import argparse
import os
import pandas as pd
import boto3
import logging
from sklearn.model_selection import train_test_split
from io import StringIO

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logger.info("前処理を開始します。")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-bucket", type=str)
    parser.add_argument("--input-key", type=str)
    args = parser.parse_args()

    logger.info("S3からデータを読み込んでいます。")
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket=args.input_bucket, Key=args.input_key)
    df = pd.read_csv(StringIO(response['Body'].read().decode('utf-8')))

    logger.info(f"入力データの形状: {df.shape}")

    logger.info("データを訓練セットと検証セットに分割します。")
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    os.makedirs('/opt/ml/processing/train', exist_ok=True)
    os.makedirs('/opt/ml/processing/validation', exist_ok=True)

    logger.info("訓練データを保存します。")
    train_df.to_csv('/opt/ml/processing/train/train.csv', index=False)

    logger.info("検証データを保存します。")
    val_df.to_csv('/opt/ml/processing/validation/validation.csv', index=False)

    logger.info("前処理が完了しました。")
"""
with open('preprocessing.py', 'w') as f:
    f.write(processing_script)

# 前処理ステップの定義
script_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve(framework='sklearn', region=region, version='0.23-1'),
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-preprocessing',
    role=role,
    sagemaker_session=pipeline_session  # PipelineSessionを使用
)

processing_step = ProcessingStep(
    name='Preprocessing',
    processor=script_processor,
    inputs=[],
    outputs=[
        ProcessingOutput(
            output_name='train_data',
            source='/opt/ml/processing/train',
            destination=f's3://{default_bucket}/lightgbm-pipeline/output/train'
        ),
        ProcessingOutput(
            output_name='validation_data',
            source='/opt/ml/processing/validation',
            destination=f's3://{default_bucket}/lightgbm-pipeline/output/validation'
        ),
    ],
    code='preprocessing.py',
    job_arguments=[
        "--input-bucket", default_bucket,
        "--input-key", "lightgbm-pipeline/input/data.csv"
    ]
)

# 学習スクリプトの作成
training_script = """
import argparse
import os
import subprocess
import sys

# 必要なパッケージをインストール
subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm", "optuna"])

import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import roc_auc_score

def objective(trial, X_train, y_train, X_val, y_val):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 10.0),
    }

    train_dataset = lgb.Dataset(X_train, label=y_train)
    val_dataset = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, train_dataset, valid_sets=[val_dataset],)
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    return auc

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--n-trials", type=int, default=20)
    args = parser.parse_args()

    print("訓練データを読み込んでいます。")
    train_df = pd.read_csv('/opt/ml/input/data/train/train.csv')

    print("検証データを読み込んでいます。")
    val_df = pd.read_csv('/opt/ml/input/data/validation/validation.csv')

    y_train = train_df.pop('target')
    X_train = train_df
    y_val = val_df.pop('target')
    X_val = val_df

    # Optunaによるハイパーパラメータ最適化
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val), 
    n_trials=args.n_trials)

    print('Best trial:')
    trial = study.best_trial
    print('  AUC: {}'.format(trial.value))
    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # 最適なハイパーパラメータでモデルを再学習
    best_params = trial.params
    best_params['objective'] = 'binary'
    best_params['metric'] = 'auc'
    best_params['verbosity'] = -1
    best_params['boosting_type'] = 'gbdt'

    train_dataset = lgb.Dataset(X_train, label=y_train)
    val_dataset = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(best_params, train_dataset, valid_sets=[val_dataset], )

    # モデルを保存
    model_dir = os.environ.get('SM_MODEL_DIR')
    model.save_model(os.path.join(model_dir, "model.txt"))
"""
with open('train.py', 'w') as f:
    f.write(training_script)

# 学習ステップの定義
sklearn_estimator = SKLearn(
    entry_point='train.py',
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    framework_version='0.23-1',
    base_job_name='lightgbm-training',
    sagemaker_session=pipeline_session,  # PipelineSessionを使用
    hyperparameters={
        'n-trials': 50,  # Optunaの試行回数
    },
)

training_step = TrainingStep(
    name='Training',
    estimator=sklearn_estimator,
    inputs={
        'train': sagemaker.inputs.TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri,
            content_type='text/csv'
        ),
        'validation': sagemaker.inputs.TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs['validation_data'].S3Output.S3Uri,
            content_type='text/csv'
        ),
    },
)

# 評価スクリプトの作成
evaluation_script = """
import argparse
import os
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm"])

import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=str, default="/opt/ml/processing/evaluation")
    args = parser.parse_args()

    print("モデルを読み込んでいます。")
    model = lgb.Booster(model_file=os.path.join('/opt/ml/processing/model', 'model.txt'))

    print("検証データを読み込んでいます。")
    val_df = pd.read_csv('/opt/ml/processing/validation/validation.csv')
    y_val = val_df.pop('target')
    X_val = val_df

    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)

    report_dict = {
        "binary_classification_metrics": {
            "auc": {
                "value": auc
            }
        }
    }

    os.makedirs(args.output_dir, exist_ok=True)
    evaluation_path = os.path.join(args.output_dir, "evaluation.json")
    with open(evaluation_path, "w") as f:
        json.dump(report_dict, f)
"""
with open('evaluate.py', 'w') as f:
    f.write(evaluation_script)

# 評価ステップの定義
evaluation_processor = ScriptProcessor(
    image_uri=script_processor.image_uri,
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='lightgbm-evaluation',
    role=role,
    sagemaker_session=pipeline_session  # PipelineSessionを使用
)

evaluation_report = PropertyFile(
    name='EvaluationReport',
    output_name='evaluation',
    path='evaluation.json',
)

evaluation_step = ProcessingStep(
    name='Evaluation',
    processor=evaluation_processor,
    inputs=[
        ProcessingInput(
            source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/model'
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['validation_data'].S3Output.S3Uri,
            destination='/opt/ml/processing/validation'
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name='evaluation',
            source='/opt/ml/processing/evaluation',
        ),
    ],
    code='evaluate.py',
    property_files=[evaluation_report],
)

# パイプラインの定義
pipeline = Pipeline(
    name='LightGBM-Pipeline-ModelRegistration',
    parameters=[
        processing_instance_type,
        training_instance_type,
        model_approval_status,
    ],
    steps=[processing_step, 
           training_step, 
           evaluation_step, 
          ],
    sagemaker_session=pipeline_session,  # PipelineSessionを使用
)

# パイプラインの作成と実行
pipeline.upsert(role_arn=role)
execution = pipeline.start()


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


In [25]:
from sagemaker.model import Model

# 学習ステップで生成されたモデルアーティファクトを取得
model_artifact_uri = training_step.properties.ModelArtifacts.S3ModelArtifacts

# SageMaker Modelオブジェクトの作成
model = Model(
    image_uri=sklearn_estimator.training_image_uri(),  # トレーニングに使用したイメージURI
    model_data=model_artifact_uri,  # トレーニング済みモデルのS3パス
    role=role,
    sagemaker_session=sagemaker_session
)

# リアルタイム推論エンドポイントのデプロイ
predictor = model.deploy(
    initial_instance_count=1,      # エンドポイントで使用するインスタンス数
    instance_type="ml.m5.large",   # エンドポイントのインスタンスタイプ
)



AttributeError: 'Properties' object has no attribute 'decode'