In [29]:
import sklearn
import pandas as pd
import boto3
import os
import numpy as np
from sagemaker import get_execution_role
import sagemaker
import json
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import (
    ParameterInteger, 
    ParameterFloat, 
    ParameterString, 
    ParameterBoolean
)

from sagemaker.processing import (
    ProcessingInput, 
    ProcessingOutput, 
    ScriptProcessor
)
from sagemaker.workflow.pipeline import Pipeline

from sagemaker.workflow.steps import (
    ProcessingStep, 
    TrainingStep, 
    CreateModelStep
)
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.parameters import (
    ParameterInteger, 
    ParameterFloat, 
    ParameterString, 
    ParameterBoolean
)
from sagemaker.workflow.clarify_check_step import (
    ModelBiasCheckConfig, 
    ClarifyCheckStep, 
    ModelExplainabilityCheckConfig
)
from sagemaker import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.conditions import (
    ConditionGreaterThan,
    ConditionGreaterThanOrEqualTo
)

from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import (
    Join,
    JsonGet
)

from sagemaker.model_metrics import (
    MetricsSource, 
    ModelMetrics, 
    FileSource
)

from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.workflow.steps import TrainingStep

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

## Set constants

In [30]:
# #create S3 bucket 
# !aws s3 mb s3://sagemaker-eu-central-1-d50
# bucket_name='sagemaker-eu-central-1-d50'
# bucket_prefix='solar1/linerreg'

In [31]:
# Get some variables you need to interact with SageMaker service
# Get some variables you need to interact with SageMaker service
boto_session = boto3.Session()
region = boto_session.region_name
bucket_name = sagemaker.Session().default_bucket()
bucket_prefix = "solar2/gblinear"  
sm_session = sagemaker.Session()
sm_client = boto_session.client("sagemaker")
sm_role = sagemaker.get_execution_role()

initialized = True

print(sm_role)

arn:aws:iam::531485126105:role/service-role/AmazonSageMaker-ExecutionRole-20230614T171444


In [32]:
bucket_name

'sagemaker-us-east-1-531485126105'

In [33]:
# Store some variables to keep the value between the notebooks
%store bucket_name
%store bucket_prefix
%store sm_role
%store region
%store initialized

Stored 'bucket_name' (str)
Stored 'bucket_prefix' (str)
Stored 'sm_role' (str)
Stored 'region' (str)
Stored 'initialized' (bool)


In [34]:
# domain_id = None
# NOTEBOOK_METADATA_FILE = "/opt/ml/metadata/resource-metadata.json"

# if os.path.exists(NOTEBOOK_METADATA_FILE):
#     with open(NOTEBOOK_METADATA_FILE, "r") as f:
#         data = json.load(f)
#         print(json.dumps(data, indent=4))
# else:
#     print("There is no metadata file.")

In [35]:
# Set names of pipeline objects
project = "solar2"

pipeline_name = f"{project}-pipeline"
pipeline_model_name = f"{project}-model-reg"
model_package_group_name = f"{project}-model-group"
endpoint_config_name = f"{project}-endpoint-config"
endpoint_name = f"{project}-endpoint"

# Set instance types and counts
process_instance_type = "ml.c5.xlarge"
train_instance_count = 1
train_instance_type = "ml.m5.xlarge"

# Set S3 urls for processed data
train_s3_url = f"s3://{bucket_name}/{bucket_prefix}/train"
validation_s3_url = f"s3://{bucket_name}/{bucket_prefix}/validation"
test_s3_url = f"s3://{bucket_name}/{bucket_prefix}/test"
baseline_s3_url = f"s3://{bucket_name}/{bucket_prefix}/baseline"

evaluation_s3_url = f"s3://{bucket_name}/{bucket_prefix}/evaluation"
prediction_baseline_s3_url = f"s3://{bucket_name}/{bucket_prefix}/prediction_baseline"

output_s3_url = f"s3://{bucket_name}/{bucket_prefix}/output"

In [36]:
bucket_prefix

'solar2/gblinear'

In [37]:
# store the variable
%store train_s3_url
%store validation_s3_url
%store test_s3_url
%store baseline_s3_url
%store model_package_group_name
%store evaluation_s3_url
%store prediction_baseline_s3_url
%store output_s3_url

Stored 'train_s3_url' (str)
Stored 'validation_s3_url' (str)
Stored 'test_s3_url' (str)
Stored 'baseline_s3_url' (str)
Stored 'model_package_group_name' (str)
Stored 'evaluation_s3_url' (str)
Stored 'prediction_baseline_s3_url' (str)
Stored 'output_s3_url' (str)


In [38]:
print(f"Train S3 url: {train_s3_url}")
print(f"Validation S3 url: {validation_s3_url}")
print(f"Test S3 url: {test_s3_url}")
print(f"Data baseline S3 url: {baseline_s3_url}")
print(f"Evaluation metrics S3 url: {evaluation_s3_url}")
print(f"Model prediction baseline S3 url: {prediction_baseline_s3_url}")
# print(f"target_col: {target_col}")

Train S3 url: s3://sagemaker-us-east-1-531485126105/solar2/gblinear/train
Validation S3 url: s3://sagemaker-us-east-1-531485126105/solar2/gblinear/validation
Test S3 url: s3://sagemaker-us-east-1-531485126105/solar2/gblinear/test
Data baseline S3 url: s3://sagemaker-us-east-1-531485126105/solar2/gblinear/baseline
Evaluation metrics S3 url: s3://sagemaker-us-east-1-531485126105/solar2/gblinear/evaluation
Model prediction baseline S3 url: s3://sagemaker-us-east-1-531485126105/solar2/gblinear/prediction_baseline


In [39]:
# Set instance types and counts
process_instance_type = "ml.c5.xlarge"
train_instance_count = 1
train_instance_type = "ml.m5.xlarge"

In [40]:
store

Stored variables and their in-db values:
athena_table_name                      -> 'sagemaker_workshop_e2e_churn_1686747619'
baseline_s3_url                        -> 's3://sagemaker-us-east-1-531485126105/solar2/gbli
bucket                                 -> 'sagemaker-studio-us-east-1-531485126105'
bucket_name                            -> 'sagemaker-us-east-1-531485126105'
bucket_prefix                          -> 'solar2/gblinear'
churn_feature_group_name               -> 'sagemaker-workshop-e2e-churn'
docker_image_name                      -> '683313688378.dkr.ecr.us-east-1.amazonaws.com/sage
domain_id                              -> 'd-1qvmpqvqiuve'
evaluation_s3_url                      -> 's3://sagemaker-us-east-1-531485126105/solar2/gbli
experiment_name                        -> 'Solar-Power-experiment-25-10-00-09'
framework_version                      -> '1.3-1'
initialized                            -> True
input_s3_url                           -> 's3://sagemaker-us-east-1

### See the data

In [41]:
#read data
solar_power = pd.read_csv("data/combined_plant.csv")
solar_power.head()

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,DAILY_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15 00:00:00,1BY6WEcLGh8j5v7,0.0,0.0,25.184316,22.857507,0.0
1,2020-05-15 00:00:00,1IF53ai7Xc0U56Y,0.0,0.0,25.184316,22.857507,0.0
2,2020-05-15 00:00:00,3PZuoBAID5Wc2HD,0.0,0.0,25.184316,22.857507,0.0
3,2020-05-15 00:00:00,7JYdWkrLSPkdwr4,0.0,0.0,25.184316,22.857507,0.0
4,2020-05-15 00:00:00,McdE0feGgRqW7Ca,0.0,0.0,25.184316,22.857507,0.0


In [42]:
solar_power.columns

Index(['DATE_TIME', 'SOURCE_KEY', 'DC_POWER', 'DAILY_YIELD',
       'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION'],
      dtype='object')

## Read data and upload it 

In [43]:
# #Upload data to S3 bucket
# s3 = boto3.client('s3')

# # Upload Plant_1_Generation_Data.csv
# s3.upload_file("data/combined_plant.csv", bucket_name, "input/solar_power.csv")

In [44]:
# If input_s3_url is not defined, upload the dataset to S3 and store the path
input_s3_url = sagemaker.Session().upload_data(
    # path="data/bank-additional/bank-additional-full.csv",
    path="data/combined_plant.csv",
    bucket=bucket_name,
        key_prefix=f"{bucket_prefix}/input"
)
print(f"Upload the dataset to {input_s3_url}")

%store input_s3_url

Upload the dataset to s3://sagemaker-us-east-1-531485126105/solar2/gblinear/input/combined_plant.csv
Stored 'input_s3_url' (str)


In [45]:
# check is it in s3

data_input_file = pd.read_csv(input_s3_url)
data_input_file.head()

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,DAILY_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15 00:00:00,1BY6WEcLGh8j5v7,0.0,0.0,25.184316,22.857507,0.0
1,2020-05-15 00:00:00,1IF53ai7Xc0U56Y,0.0,0.0,25.184316,22.857507,0.0
2,2020-05-15 00:00:00,3PZuoBAID5Wc2HD,0.0,0.0,25.184316,22.857507,0.0
3,2020-05-15 00:00:00,7JYdWkrLSPkdwr4,0.0,0.0,25.184316,22.857507,0.0
4,2020-05-15 00:00:00,McdE0feGgRqW7Ca,0.0,0.0,25.184316,22.857507,0.0


In [46]:
#check if data in bucket_name

s3 = boto3.client('s3')
prefix =bucket_prefix

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

if 'Contents' in response:
    print(f"Files in {bucket_name} with prefix '{prefix}':")
    for obj in response['Contents']:
        file_name = obj['Key']
        print(file_name)
else:
    print(f"No files found in {bucket_name} with prefix '{prefix}'")

Files in sagemaker-us-east-1-531485126105 with prefix 'solar2/gblinear':
solar2/gblinear/baseline/baseline.csv
solar2/gblinear/input/combined_plant.csv
solar2/gblinear/output/pipelines-rbgrweb2clxo-solar2-pipeline-trai-nrli042x34/debug-output/training_job_end.ts
solar2/gblinear/output/pipelines-rbgrweb2clxo-solar2-pipeline-trai-nrli042x34/profiler-output/framework/training_job_end.ts
solar2/gblinear/output/pipelines-rbgrweb2clxo-solar2-pipeline-trai-nrli042x34/profiler-output/system/incremental/2023062808/1687942200.algo-1.json
solar2/gblinear/output/pipelines-rbgrweb2clxo-solar2-pipeline-trai-nrli042x34/profiler-output/system/incremental/2023062808/1687942260.algo-1.json
solar2/gblinear/output/pipelines-rbgrweb2clxo-solar2-pipeline-trai-nrli042x34/profiler-output/system/incremental/2023062808/1687942320.algo-1.json
solar2/gblinear/output/pipelines-rbgrweb2clxo-solar2-pipeline-trai-nrli042x34/profiler-output/system/training_job_end.ts
solar2/gblinear/test/test_x.csv
solar2/gblinear/tes

## Create pipeline
### Setup pipeline parameters

In [47]:
# Set processing instance type
process_instance_type_param = ParameterString(
    name="ProcessingInstanceType",
    default_value=process_instance_type,
)

# Set training instance type
train_instance_type_param = ParameterString(
    name="TrainingInstanceType",
    default_value=train_instance_type,
)

# Set training instance count
train_instance_count_param = ParameterInteger(
    name="TrainingInstanceCount",
    default_value=train_instance_count
)

# Set model approval param
model_approval_status_param = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval"
)

# Minimal threshold for model performance on the test dataset
test_score_threshold_param = ParameterFloat(
    name="TestScoreThreshold", 
    default_value=0.5
)

# Set S3 url for input dataset
input_s3_url_param = ParameterString(
    name="InputDataUrl",
    default_value=input_s3_url,
)


In [48]:
input_s3_url_param

ParameterString(name='InputDataUrl', parameter_type=<ParameterTypeEnum.STRING: 'String'>, default_value='s3://sagemaker-us-east-1-531485126105/solar2/gblinear/input/combined_plant.csv')

### Build the pipeline steps


In [49]:
session = PipelineSession()

#### Processing step

In [50]:
%%writefile preprocessing.py

import pandas as pd
import numpy as np
import argparse
import os

def _parse_args():
    
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='combined_plant.csv')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    
    return parser.parse_known_args()


if __name__=="__main__":
    # Process arguments
    args, _ = _parse_args()
    
    target_col = 'DC_POWER'
    
    # Load data
    df_model_data = pd.read_csv(os.path.join(args.filepath, args.filename), sep=",")
    print(df_model_data.columns)
    print(df_model_data.head(5))

    df_model_data.drop(['SOURCE_KEY', 'DATE_TIME'], axis=1)
    
     # Shuffle and splitting dataset
    train_data, validation_data, test_data = np.split(
        df_model_data.sample(frac=1, random_state=1729),
        [int(0.7 * len(df_model_data)), int(0.9 * len(df_model_data))],
    )

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    # Save datasets locally
    try:
        train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=False)
        print("Train data saved successfully.")
    except Exception as e:
        print("Error saving train data:", str(e))
    try:
        validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=False)
        print("Validation data saved successfully.")
    except Exception as e:
        print("Error saving validation data:", str(e))

    try:
        test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=False)
        print("Test target data saved successfully.")
    except Exception as e:
        print("Error saving test target data:", str(e))

    try:
        test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=False)
        print("Test input data saved successfully.")
    except Exception as e:
        print("Error saving test input data:", str(e))

    try:
        df_model_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=False, header=False)
        print("Baseline data saved successfully.")
    except Exception as e:
        print("Error saving baseline data:", str(e))

    
    print("## Processing complete. Exiting.")


Overwriting preprocessing.py


In [51]:
# Create SKLearnProcessor
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=sm_role,
    instance_type=process_instance_type_param.default_value,
    instance_count=1,
    base_job_name=f"{pipeline_name}/preprocess",
    sagemaker_session=session,
)

    
processing_inputs=[
    ProcessingInput(source=input_s3_url_param, destination="/opt/ml/processing/input")
]

processing_outputs=[
    ProcessingOutput(output_name="train_data", source="/opt/ml/processing/output/train", 
                     destination=train_s3_url),
    ProcessingOutput(output_name="validation_data", source="/opt/ml/processing/output/validation",
                     destination=validation_s3_url),
    ProcessingOutput(output_name="test_data", source="/opt/ml/processing/output/test",
                     destination=test_s3_url),
    ProcessingOutput(output_name="baseline_data", source="/opt/ml/processing/output/baseline", 
                     destination=baseline_s3_url),
]

processor_args = sklearn_processor.run(
    inputs=processing_inputs,
    outputs=processing_outputs,
    code='preprocessing.py',
    # arguments = ['arg1', 'arg2'],
)
    
# Define processing step
step_process = ProcessingStep(
    name=f"{pipeline_name}-preprocess-data",
    step_args=processor_args,
)



#### Training step

In [52]:
LinearRegression_image_uri = sagemaker.image_uris.retrieve(framework='linear-learner',region=region)

In [53]:
# Instantiate a Linear Learner estimator object
estimator = Estimator(
    image_uri=LinearRegression_image_uri,
    role=get_execution_role(),
    instance_type=train_instance_type_param,
    instance_count=train_instance_count_param,
    output_path=output_s3_url,
    sagemaker_session=session,
    base_job_name=f"{pipeline_name}/train",
)

# # Define algorithm hyperparameters
# estimator.set_hyperparameters(
#     booster="gblinear", 
#     eta=0.9,
#     base_score=0.5, 
#     objective="reg:squarederror",
#     eval_metric="rmse", # evaluation metrics for validation data
#     random_state=567, 
#  )

estimator.set_hyperparameters(
    booster="gblinear",
    # lambda=0.1,
    alpha=0.01,
    updater="shotgun",
    feature_selector="cyclic",
    top_k=0,
    eta=0.9,
    base_score=0.5,
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=567,
)


# Define training inputs
training_inputs = {
    "train": TrainingInput(
        s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri,
        content_type="text/csv",
    ),
    "validation": TrainingInput(
        s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation_data"].S3Output.S3Uri,
        content_type="text/csv",
    ),
}

# Fit the estimator to the training data
training_args = estimator.fit(training_inputs)

# Define the training step
step_train = TrainingStep(
    name=f"{pipeline_name}-train",
    step_args=training_args,
)


#### Evaluation step
Create a model evaluation script to check if the model performance meets the specified threshold. 

In [54]:
%%writefile evaluation.py

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import json
import os
import pathlib
import pickle as pkl
import tarfile
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime as dt

if __name__ == "__main__":   
    
    # All paths are local for the processing container
    model_path = "/opt/ml/processing/model/model.tar.gz"
    test_x_path = "/opt/ml/processing/test/test_x.csv"
    test_y_path = "/opt/ml/processing/test/test_y.csv"
    output_dir = "/opt/ml/processing/evaluation"
    output_prediction_path = "/opt/ml/processing/output/"
        
    # Read model tar file
    with tarfile.open(model_path, "r:gz") as t:
        t.extractall(path=".")
    
    # Load model
    model = xgb.Booster()
    model.load_model("gblinear-model")
    
    # Read test data
    X_test = xgb.DMatrix(pd.read_csv(test_x_path, header=None).values)
    y_test = pd.read_csv(test_y_path, header=None).to_numpy()

    # Run predictions
    test_features_numeric = X_test.drop(['DATE_TIME', 'SOURCE_KEY'], axis=1)
    predictions = model.predict(test_features_numeric)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test,predictions))
    

    
    report_dict = {
        "regression_metrics": {
            "rmse": {
                "value": rmse,
       
            },
        },
    }

  # Save evaluation report
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    with open(f"{output_dir}/evaluation.json", "w") as f:
        f.write(json.dumps(report_dict))
    
    # Save prediction baseline file - we need it later for the model quality monitoring
    pd.DataFrame({"prediction":np.array(np.round(probability), dtype=int),
                  "probability":probability,
                  "label":y_test.squeeze()}
                ).to_csv(os.path.join(output_prediction_path, 'prediction_baseline/prediction_baseline.csv'), index=False, header=True)

Overwriting evaluation.py


Create a processor to run the evaluation script and construct the evaluation step:

In [55]:
script_processor = ScriptProcessor(
    image_uri=LinearRegression_image_uri,
    role=sm_role,
    command=["python3"],
    instance_type=process_instance_type_param,
    instance_count=1,
    base_job_name=f"{pipeline_name}/evaluate",
    sagemaker_session=session,
)

eval_inputs=[
    ProcessingInput(source=step_train.properties.ModelArtifacts.S3ModelArtifacts, 
                    destination="/opt/ml/processing/model"),
    ProcessingInput(source=step_process.properties.ProcessingOutputConfig.Outputs["test_data"].S3Output.S3Uri, 
                    destination="/opt/ml/processing/test"),
]

eval_outputs=[
    ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation", 
                     destination=evaluation_s3_url),
    ProcessingOutput(output_name="prediction_baseline_data", source="/opt/ml/processing/output/prediction_baseline", 
                     destination=prediction_baseline_s3_url),
]

eval_args = script_processor.run(
    inputs=eval_inputs,
    outputs=eval_outputs,
    code="evaluation.py",
)
    
evaluation_report = PropertyFile(
    name="ModelEvaluationReport", output_name="evaluation", path="evaluation.json"
)

step_eval = ProcessingStep(
    name=f"{pipeline_name}-evaluate-model",
    step_args=eval_args,
    property_files=[evaluation_report]
)

#### Register step
The register step creates a SageMaker model and registers a new version of a model in the SageMaker Model Registry within a [model package group](https://docs.aws.amazon.com/sagemaker/latest/dg/model-registry-model-group.html).

In [56]:
model = Model(
    image_uri=LinearRegression_image_uri,        
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    name=f"gblinear-model",
    sagemaker_session=session,
    role=sm_role,
)

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
        ),
        content_type="application/json",
    )
)

register_args = model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
    transform_instances=["ml.m5.xlarge", "ml.m5.large"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status_param,
    model_metrics=model_metrics,
)

step_register = ModelStep(
    name=f"{pipeline_name}-register",
    step_args=register_args
)


#### Fail step
Add a Pipelines [FailStep](https://sagemaker.readthedocs.io/en/stable/workflows/pipelines/sagemaker.workflow.pipelines.html#sagemaker.workflow.fail_step.FailStep) to stop the pipeline execution if the model performance metric doesn't meet the specified threshold. 

In [57]:
step_fail = FailStep(
    name=f"{pipeline_name}-fail",
    error_message=Join(on=" ", values=["Execution failed due to RMSE >", test_score_threshold_param]),
)

#### Condition step
The condition step checks the model performance score and conditionally creates a model and registers it in the model registry, or stops and fails the pipeline execution.

In [58]:
cond_lte = ConditionGreaterThan(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="regression_metrics.rmse.value",
    ),
    right=test_score_threshold_param,
)

step_cond = ConditionStep(
    name=f"{pipeline_name}-check-test-score",
    conditions=[cond_lte],
    if_steps=[step_register],
    else_steps=[step_fail],
    )


#### Construct the pipeline 


In [59]:
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        process_instance_type_param,
        train_instance_type_param,
        train_instance_count_param,
        model_approval_status_param,
        test_score_threshold_param,
        input_s3_url_param,
    ],
    steps=[step_process, step_train, step_eval, step_cond],
    sagemaker_session=session,
)

In [60]:
 %pip install sagemaker==2.132.0

[0mNote: you may need to restart the kernel to use updated packages.


In [61]:
# Create a new or update existing Pipeline
pipeline.upsert(role_arn=sm_role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:531485126105:pipeline/solar2-pipeline',
 'ResponseMetadata': {'RequestId': 'c00e591d-f325-4e90-bc91-2fc639abd5c6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c00e591d-f325-4e90-bc91-2fc639abd5c6',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '83',
   'date': 'Wed, 28 Jun 2023 09:11:39 GMT'},
  'RetryAttempts': 0}}

In [62]:
pipeline_definition = json.loads(pipeline.describe()['PipelineDefinition'])
pipeline_definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.c5.xlarge'},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'TrainingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'TestScoreThreshold', 'Type': 'Float', 'DefaultValue': 0.5},
  {'Name': 'InputDataUrl',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-531485126105/solar2/gblinear/input/combined_plant.csv'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'solar2-pipeline-preprocess-data',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.c5.xlarge',
      'InstanceCount': 1,
      'VolumeSizeInGB

## Execute the pipeline
The following code starts an execution of the pipeline with the specified parameters.

In [63]:
execution = pipeline.start(
    parameters=dict(
        ProcessingInstanceType=process_instance_type,
        TrainingInstanceType=train_instance_type,
        TrainingInstanceCount=train_instance_count,
        ModelApprovalStatus="PendingManualApproval",
        TestScoreThreshold=0.75,
        InputDataUrl=input_s3_url
    )
)


In [64]:
# Un-comment this call if you want the notebook to wait until the pipeline's execution finished
execution.wait()
execution.list_steps()

WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"

In [None]:
execution.list_steps()