In [2]:
import sagemaker
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

In [20]:
# Role and session setup
# This session is later handed to the Pipeline and supplies the region for the
# XGBoost image lookup.
sagemaker_session = sagemaker.Session()
# Resolve the execution role through that same session instead of letting
# get_execution_role() construct a second, implicit Session.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

In [21]:
# Step 1: Data Preprocessing Step
# Scikit-learn processor used by the preprocessing step: a single ml.m5.large
# instance running under the notebook's execution role.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=1,
    instance_type="ml.m5.large",
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [22]:
# Pipeline step that runs preprocessing.py with the scikit-learn processor.
# The raw CSV is mounted at /opt/ml/processing/input, and whatever the script
# leaves under /opt/ml/processing/output/{train,val,test} is uploaded to the
# matching S3 prefix (presumably the script writes those three directories —
# verify against preprocessing.py).
#
# Every output carries an explicit `output_name` so that later steps can look
# it up by a stable key, e.g.
#   preprocessing_step.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri
# rather than depending on a name the SDK generates for unnamed outputs.
preprocessing_step = ProcessingStep(
    name="PreprocessData",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(
            source="s3://drybean-csv/data/drybean.csv",
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
            destination="s3://drybean-csv/data/processed/train",
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/output/val",
            destination="s3://drybean-csv/data/processed/val",
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/output/test",
            destination="s3://drybean-csv/data/processed/test",
        ),
    ],
    code="preprocessing.py",  # Path to the preprocessing script
)

In [23]:
# Step 2: Training Step
# Look up the XGBoost 1.2-1 container image for the region the session is bound to.
xgb_region = sagemaker_session.boto_session.region_name
xgb_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=xgb_region,
    version="1.2-1",
)

# Hyperparameters passed to the XGBoost container: a multi:softmax objective
# over 7 classes, trained for 100 rounds.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "objective": "multi:softmax",
    "num_class": 7,
    "num_round": 100,
}

# Generic estimator wrapping that image; model artifacts are written under
# the model-output prefix of the drybean-csv bucket.
xgb_estimator = Estimator(
    image_uri=xgb_image_uri,
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    output_path="s3://drybean-csv/model-output",
    hyperparameters=xgb_hyperparameters,
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [24]:
# Training step: fits the XGBoost estimator using the CSV data under the
# processed/train and processed/val prefixes as the "train" and "validation"
# channels.
training_step = TrainingStep(
    name="TrainModel",
    estimator=xgb_estimator,
    inputs={
        "train": TrainingInput(
            s3_data="s3://drybean-csv/data/processed/train",
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data="s3://drybean-csv/data/processed/val",
            content_type="text/csv",
        ),
    },
    # The inputs are fixed S3 URIs (the same prefixes the PreprocessData step
    # uploads to), so the pipeline cannot infer an ordering from them. Declare
    # it explicitly so training never starts before preprocessing has finished
    # when both steps are registered in the same pipeline.
    depends_on=[preprocessing_step.name],
)

In [26]:
from sagemaker.workflow.pipeline import Pipeline  # also imported in the first cell; redundant but harmless

# Define the pipeline
# NOTE(review): only the preprocessing step is registered here; `training_step`
# is defined earlier in the notebook but is not part of this pipeline —
# confirm whether that is intentional.
pipeline = Pipeline(
    name='MyPipeline001',
    steps=[preprocessing_step],
    sagemaker_session=sagemaker_session
)

# Create the pipeline, or update its definition if 'MyPipeline001' already
# exists, so this cell can be re-run instead of failing on the second create.
pipeline.upsert(role_arn=role)

# Start an execution and keep the handle so its progress can be inspected
# afterwards (e.g. execution.describe(), execution.list_steps()).
execution = pipeline.start()
execution






_PipelineExecution(arn='arn:aws:sagemaker:us-east-2:767398042415:pipeline/MyPipeline001/execution/clblxb0sezm5', sagemaker_session=<sagemaker.session.Session object at 0x7ff6e396b3d0>)