# Install and Import Dependencies

In [12]:
import os
import subprocess
import sys

# Install/upgrade the notebook's dependencies.
#
# pip is invoked as `<kernel python> -m pip` rather than through a bare `pip`
# shell command so the packages land in the interpreter this kernel is
# actually running, and `check_call` raises as soon as any install fails
# instead of silently discarding the exit status.
PIP_INSTALL_ARGS = [
    ["--upgrade", "-q", "pip"],
    ["--upgrade", "-q", "sagemaker"],
    ["--upgrade", "-q", "boto3"],
    ["-q", "mlflow"],
    ["-q", "sagemaker-mlflow"],
]

for pip_args in PIP_INSTALL_ARGS:
    subprocess.check_call([sys.executable, "-m", "pip", "install", *pip_args])

0

In [13]:
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import IntegerParameter, HyperparameterTuner

# SageMaker Settings

In [14]:
# One boto3 session pinned to the target region; every AWS client below is
# derived from it so they all agree on region and credentials.
boto_session = boto3.Session(region_name='ap-southeast-2')

# SageMaker session that uses the boto session above and "mlflow-artifacts-dir"
# as its default bucket.
sagemaker_session = Session(default_bucket="mlflow-artifacts-dir", boto_session=boto_session)

# IAM role returned by the SageMaker SDK for the current execution environment.
role = get_execution_role()

# Build the S3 client from `boto_session` (not the module-level default
# session) so it targets the same region as the SageMaker session.
s3_client = boto_session.client("s3")

# Define Estimator

In [23]:
# Fixed hyperparameter values handed to the estimator; the tuner defined
# later in the notebook searches ranges over these same three names.
base_hyperparameters = dict(
    n_estimators=200,
    max_depth=7,
    max_features=10,
)

# scikit-learn estimator that runs `train.py` from `./script/` on a single
# ml.m5.xlarge instance.
# NOTE(review): `model_dir` is forwarded as given — confirm the SKLearn
# estimator actually consumes this keyword.
estimator = SKLearn(
    entry_point='train.py',
    source_dir='./script/',
    framework_version='1.2-1',
    py_version='py3',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    model_dir='/opt/ml/model/',
    hyperparameters=base_hyperparameters,
    sagemaker_session=sagemaker_session,
)

# Define Tuner

In [26]:
# Integer search space for the three hyperparameters being tuned.
search_space = {
    'n_estimators': IntegerParameter(100, 300),
    'max_depth': IntegerParameter(3, 9),
    'max_features': IntegerParameter(3, 10),
}

# Definition of the objective metric: its name plus the regex whose capture
# group holds the value.
# NOTE(review): presumably train.py logs a "Test Accuracy: <number>" line —
# verify against the training script.
objective_metric_definitions = [
    {"Name": "test accuracy", "Regex": "Test Accuracy: ([0-9\\.]+)"},
]

# Tuner that maximizes "test accuracy" over `search_space`, with at most
# 30 jobs in total and 3 running in parallel.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name='test accuracy',
    objective_type='Maximize',
    hyperparameter_ranges=search_space,
    metric_definitions=objective_metric_definitions,
    max_jobs=30,
    max_parallel_jobs=3,
)

In [27]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

def upload_data(sagemaker_session, local_data_dir='./data/', key_prefix='iris'):
    """Split the Iris dataset, write the splits as CSV files, and upload them to S3.

    The dataset is split 80/20 with a fixed ``random_state`` of 42, then
    written to ``local_data_dir`` as ``train_input.csv``, ``test_input.csv``,
    ``train_target.csv`` and ``test_target.csv`` (no index column).

    Args:
        sagemaker_session: Session object providing ``upload_data`` and
            ``default_bucket``.
        local_data_dir: Local directory for the CSV files. It is created if
            it does not exist yet.
        key_prefix: S3 key prefix passed to ``upload_data``. Defaults to
            ``'iris'``.

    Returns:
        Whatever ``sagemaker_session.upload_data`` returns for the uploaded
        directory (the notebook shows an ``s3://<bucket>/<prefix>`` URI).

    Note:
        The whole ``local_data_dir`` is handed to ``upload_data``, so any
        other files already in that directory are presumably uploaded too.
    """
    # `DataFrame.to_csv` does not create missing parent directories; without
    # this the function fails on a fresh checkout with no ./data/ folder.
    os.makedirs(local_data_dir, exist_ok=True)

    data = load_iris()
    input_data = pd.DataFrame(data.data, columns=data.feature_names)
    target_data = pd.Series(data.target)

    # train_test_split already returns DataFrames/Series for pandas inputs,
    # so no re-wrapping of the splits is needed.
    train_input, test_input, train_target, test_target = train_test_split(
        input_data, target_data,
        test_size=0.2,
        random_state=42
    )

    # File name -> split, written in the same order as before.
    splits = {
        'train_input.csv': train_input,
        'test_input.csv': test_input,
        'train_target.csv': train_target,
        'test_target.csv': test_target,
    }
    for file_name, split in splits.items():
        split.to_csv(os.path.join(local_data_dir, file_name), index=False)

    loc = sagemaker_session.upload_data(
        path=local_data_dir,
        bucket=sagemaker_session.default_bucket(),
        key_prefix=key_prefix,
    )

    return loc

In [28]:
# Write the Iris splits locally and upload them; `loc` is the location
# returned by `upload_data`.
loc = upload_data(sagemaker_session)

# Display the uploaded location as the cell output.
loc

's3://mlflow-artifacts-dir/iris'

In [29]:
# Both input channels point at the same uploaded location.
channels = dict.fromkeys(("training", "testing"), loc)

In [None]:
# Start the hyperparameter tuning job, passing the "training"/"testing"
# channel mapping as its inputs.
tuner.fit(inputs=channels)

INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-scikit-lea-250824-1514


....