In [2]:
import sklearn
import pandas as pd
import boto3
import os
import numpy as np
from sagemaker import get_execution_role
import sagemaker
import json
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import (
    ParameterInteger, 
    ParameterFloat, 
    ParameterString, 
    ParameterBoolean
)

from sagemaker.processing import (
    ProcessingInput, 
    ProcessingOutput, 
    ScriptProcessor
)
from sagemaker.workflow.pipeline import Pipeline

from sagemaker.workflow.steps import (
    ProcessingStep, 
    TrainingStep, 
    CreateModelStep
)
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.parameters import (
    ParameterInteger, 
    ParameterFloat, 
    ParameterString, 
    ParameterBoolean
)
from sagemaker.workflow.clarify_check_step import (
    ModelBiasCheckConfig, 
    ClarifyCheckStep, 
    ModelExplainabilityCheckConfig
)
from sagemaker import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.conditions import (
    ConditionGreaterThan,
    ConditionGreaterThanOrEqualTo
)

from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import (
    Join,
    JsonGet
)

from sagemaker.model_metrics import (
    MetricsSource, 
    ModelMetrics, 
    FileSource
)

from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.workflow.steps import TrainingStep

from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

## Set constants

In [3]:
# Sessions, clients and identifiers needed to interact with the SageMaker service
boto_session = boto3.Session()
region = boto_session.region_name
sm_client = boto_session.client("sagemaker")

# A single SageMaker session is created and reused for the default-bucket lookup
# instead of building a second, throwaway Session just to resolve the bucket.
sm_session = sagemaker.Session()
bucket_name = sm_session.default_bucket()
bucket_prefix = "Group-project/simple-linear"
sm_role = sagemaker.get_execution_role()

initialized = True

print(sm_role)

arn:aws:iam::567821811420:role/service-role/AmazonSageMaker-ExecutionRole-20230619T084765


In [4]:
# # Store some variables to keep the value between the notebooks
# %store bucket_name
# %store bucket_prefix
# %store sm_role
# %store region
# %store initialized

In [5]:
# Location of the notebook's resource metadata file; it may be absent, in which
# case `domain_id` stays None.
NOTEBOOK_METADATA_FILE = "/opt/ml/metadata/resource-metadata.json"
domain_id = None

if os.path.exists(NOTEBOOK_METADATA_FILE):
    with open(NOTEBOOK_METADATA_FILE, "r") as f:
        data = json.load(f)
    # Keep the domain id from the metadata instead of leaving the variable unset
    domain_id = data.get("DomainId")
    print(json.dumps(data, indent=4))
else:
    print("There is no metadata file.")

{
    "AppType": "KernelGateway",
    "DomainId": "d-ivd5gnez0yil",
    "UserProfileName": "marmoorh9-gmail-com-d48",
    "ResourceArn": "arn:aws:sagemaker:eu-central-1:567821811420:app/d-ivd5gnez0yil/marmoorh9-gmail-com-d48/KernelGateway/datascience-1-0-ml-t3-medium-ad10824985c0f8abb6eaad94debc",
    "ResourceName": "datascience-1-0-ml-t3-medium-ad10824985c0f8abb6eaad94debc",
    "AppImageVersion": ""
}


In [6]:
# ---- Resource names: everything is derived from the project name ----
project = "solar1"

pipeline_name = project + "-pipeline"
pipeline_model_name = project + "-model-reg"
model_package_group_name = project + "-model-group"
endpoint_config_name = project + "-endpoint-config"
endpoint_name = project + "-endpoint"

# ---- Compute resources for the processing and training jobs ----
process_instance_type = "ml.c5.xlarge"
train_instance_type = "ml.m5.xlarge"
train_instance_count = 1

# ---- S3 locations: all of them live under s3://<bucket_name>/<bucket_prefix>/ ----
_s3_root = f"s3://{bucket_name}/{bucket_prefix}"

# Processed datasets
train_s3_url = f"{_s3_root}/train"
validation_s3_url = f"{_s3_root}/validation"
test_s3_url = f"{_s3_root}/test"
baseline_s3_url = f"{_s3_root}/baseline"

# Evaluation artifacts
evaluation_s3_url = f"{_s3_root}/evaluation"
prediction_baseline_s3_url = f"{_s3_root}/prediction_baseline"

# Training output
output_s3_url = f"{_s3_root}/output"

In [7]:
#store the variable
# %store train_s3_url
# %store validation_s3_url
# %store test_s3_url
# %store baseline_s3_url
# %store model_package_group_name
# %store evaluation_s3_url
# %store prediction_baseline_s3_url
# %store output_s3_url

In [8]:
# Instance settings for the training and processing jobs.
# NOTE(review): these three values repeat the assignments already made in the
# "Set names of pipeline objects" cell above - consider keeping one definition.
train_instance_type = "ml.m5.xlarge"
train_instance_count = 1
process_instance_type = "ml.c5.xlarge"

## Read data and upload it to S3

In [9]:
# Load the raw generation and weather-sensor CSV files of both plants into
# pandas DataFrames. All four files are read from the same local "data" folder.
df_gen1 = pd.read_csv("data/Plant_1_Generation_Data.csv")
df_gen2 = pd.read_csv("data/Plant_2_Generation_Data.csv")

df_weather1 = pd.read_csv("data/Plant_1_Weather_Sensor_Data.csv")
# Path normalised (was "data//..."), matching how the same file is referenced elsewhere
df_weather2 = pd.read_csv("data/Plant_2_Weather_Sensor_Data.csv")

In [10]:
# Preview the first rows of the Plant 1 generation data
df_gen1.head()

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,15-05-2020 00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0
1,15-05-2020 00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0
2,15-05-2020 00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0
3,15-05-2020 00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7602960.0
4,15-05-2020 00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.0,7158964.0


In [11]:
# Run this cell to import or install the Data Wrangler widget to show automatic visualization and generate code to fix data quality issues
try:
    import sagemaker_datawrangler
except ImportError:
    !pip install --upgrade sagemaker-datawrangler
    import sagemaker_datawrangler

# Display Pandas DataFrame to view the widget: df, display(df), df.sample()... 

In [12]:
# Show which S3 bucket is currently being used
bucket_name

'sagemaker-eu-central-1-567821811420'

In [13]:
# Create an S3 bucket
# NOTE(review): the bucket created here is not referenced by any code visible in
# this notebook (the `bucket_name` override below is commented out), and the
# command reports a failure when the bucket already exists - confirm whether
# this step is still needed.
!aws s3 mb s3://sagemaker-eu-central-1-d48
# bucket_name='sagemaker-eu-central-1-d48'
# NOTE(review): this replaces the `bucket_prefix` set in the constants cell, but
# the train/validation/test/baseline/evaluation/output S3 URLs were already built
# from the previous value, so uploaded inputs and pipeline outputs end up under
# different prefixes - confirm this is intended.
bucket_prefix='solar1/linerreg'

make_bucket failed: s3://sagemaker-eu-central-1-d48 An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [14]:
# Push the four raw CSV files from the local "data" folder to the "input/"
# folder at the root of the bucket, one upload per file.
s3 = boto3.client('s3')

for csv_name in (
    "Plant_1_Generation_Data.csv",
    "Plant_2_Generation_Data.csv",
    "Plant_1_Weather_Sensor_Data.csv",
    "Plant_2_Weather_Sensor_Data.csv",
):
    s3.upload_file(f"data/{csv_name}", bucket_name, f"input/{csv_name}")

print("Upload complete.")


Upload complete.


In [15]:
# Upload the raw datasets to "<bucket_prefix>/input" and remember their S3 URLs,
# unless the URLs are already defined in this kernel session.
try:
    # Probe the four names that are actually assigned below and consumed by the
    # pipeline parameters; a NameError on any of them triggers the upload.
    input_s3_url_gen1, input_s3_url_gen2, input_s3_url_weather1, input_s3_url_weather2
except NameError:
    # Reuse the notebook's SageMaker session instead of creating one per file
    input_s3_url_gen1 = sm_session.upload_data(
        path="data/Plant_1_Generation_Data.csv",
        bucket=bucket_name,
        key_prefix=f"{bucket_prefix}/input"
    )
    input_s3_url_gen2 = sm_session.upload_data(
        path="data/Plant_2_Generation_Data.csv",
        bucket=bucket_name,
        key_prefix=f"{bucket_prefix}/input"
    )
    input_s3_url_weather1 = sm_session.upload_data(
        path="data/Plant_1_Weather_Sensor_Data.csv",
        bucket=bucket_name,
        key_prefix=f"{bucket_prefix}/input"
    )
    input_s3_url_weather2 = sm_session.upload_data(
        path="data/Plant_2_Weather_Sensor_Data.csv",
        bucket=bucket_name,
        key_prefix=f"{bucket_prefix}/input"
    )
    print("Upload complete.")

    # %store input_s3_url_gen1
    # %store input_s3_url_gen2
    # %store input_s3_url_weather1
    # %store input_s3_url_weather2


Upload complete.


## Create pipeline
### Setup pipeline parameters

In [16]:
# --- Compute resources ---------------------------------------------------------
# Instance type used by the processing jobs
process_instance_type_param = ParameterString(name="ProcessingInstanceType", default_value=process_instance_type)

# Instance type and instance count used by the training job
train_instance_type_param = ParameterString(name="TrainingInstanceType", default_value=train_instance_type)
train_instance_count_param = ParameterInteger(name="TrainingInstanceCount", default_value=train_instance_count)

# --- Model governance ----------------------------------------------------------
# Approval status assigned to the model version when it is registered
model_approval_status_param = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")

# Threshold the test-set metric is compared against in the condition step
test_score_threshold_param = ParameterFloat(name="TestScoreThreshold", default_value=0.5)

# --- Input data ----------------------------------------------------------------
# S3 URLs of the raw generation data (plants 1 and 2)
input_s3_url_param_g1 = ParameterString(name="InputDataUrgen1", default_value=input_s3_url_gen1)
input_s3_url_param_g2 = ParameterString(name="InputDataUrgen2", default_value=input_s3_url_gen2)

# S3 URLs of the raw weather-sensor data (plants 1 and 2)
input_s3_url_param_w1 = ParameterString(name="InputDataUrweather1", default_value=input_s3_url_weather1)
input_s3_url_param_w2 = ParameterString(name="InputDataUrweather2", default_value=input_s3_url_weather2)

### Build the pipeline steps


In [17]:
# Session object handed to the processors, estimator, model and pipeline defined below
session = PipelineSession()

#### Processing step

In [18]:
%%writefile preprocessing.py

import pandas as pd
import numpy as np
import argparse
import os

def _parse_args():
    
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default, this is an S3 path under the default bucket.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='input')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    
    return parser.parse_known_args()


if __name__=="__main__":
    # Process arguments
    args, _ = _parse_args()
    
    target_col = "DC_POWER"
    
    # Load data
    df_gen1 = pd.read_csv(os.path.join(args.filepath, 'Plant_1_Generation_Data.csv'))
    df_gen2 = pd.read_csv(os.path.join(args.filepath, 'Plant_2_Generation_Data.csv'))
    df_weather1 = pd.read_csv(os.path.join(args.filepath, 'Plant_1_Weather_Sensor_Data.csv'))
    df_weather2 = pd.read_csv(os.path.join(args.filepath, 'Plant_2_Weather_Sensor_Data.csv'))
    
    # Adjust datetime format
    df_gen1['DATE_TIME'] = pd.to_datetime(df_gen1['DATE_TIME'], format='%d-%m-%Y %H:%M')
    df_weather1['DATE_TIME'] = pd.to_datetime(df_weather1['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
    df_gen2['DATE_TIME'] = pd.to_datetime(df_gen2['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')  # Updated format
    df_weather2['DATE_TIME'] = pd.to_datetime(df_weather2['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

    # Drop unnecessary columns and merge dataframes
    df_plant1 = pd.merge(
        df_gen1.drop(columns=['PLANT_ID','AC_POWER','TOTAL_YIELD']),
        df_weather1.drop(columns=['PLANT_ID', 'SOURCE_KEY']),
        on='DATE_TIME'
    )

    df_plant2 = pd.merge(
        df_gen2.drop(columns=['PLANT_ID','AC_POWER','TOTAL_YIELD']),
        df_weather2.drop(columns=['PLANT_ID', 'SOURCE_KEY']),
        on='DATE_TIME'
    )

    combined_plant = pd.concat([df_plant1, df_plant2])
    combined_plant.drop(['SOURCE_KEY', 'DATE_TIME'], axis=1)
    
    
    # Shuffle and split the dataset
    train_data, validation_data, test_data = np.split(
        combined_plant.sample(frac=1, random_state=1729),
        [int(0.7 * len(combined_plant)), int(0.9 * len(combined_plant))],
    )

    print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")
    
    # Save datasets locally
    train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=False)
    validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=False)
    test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=False)
    test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=False)


    # Save the baseline dataset for model monitoring
    combined_plant.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=False, header=False)
    
    print("## Processing complete. Exiting.")


Overwriting preprocessing.py


In [19]:
# List the objects stored under the given key prefix of the bucket
prefix = 'input'  # Key prefix used to filter the listing
s3 = boto3.client('s3')

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

if 'Contents' not in response:
    print(f"No files found in {bucket_name} with prefix '{prefix}'")
else:
    print(f"Files in {bucket_name} with prefix '{prefix}':")
    for s3_object in response['Contents']:
        print(s3_object['Key'])

Files in sagemaker-eu-central-1-567821811420 with prefix 'input':
input/Plant_1_Generation_Data.csv
input/Plant_1_Weather_Sensor_Data.csv
input/Plant_2_Generation_Data.csv
input/Plant_2_Weather_Sensor_Data.csv


In [20]:
# Processor that runs preprocessing.py in the scikit-learn container.
# The instance type is the pipeline parameter itself (not its default value),
# so a "ProcessingInstanceType" override given at execution time takes effect
# for this step as well.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=sm_role,
    instance_type=process_instance_type_param,
    instance_count=1,
    base_job_name=f"{pipeline_name}/preprocess",
    sagemaker_session=session,
)


# Processing inputs: the four raw CSV files, each one driven by its own pipeline parameter.
# NOTE(review): all four inputs share the same local destination folder - confirm
# that the service accepts identical local paths for several inputs.
processing_inputs = [
    ProcessingInput(source=input_s3_url_param_g1, destination="/opt/ml/processing/input"),
    ProcessingInput(source=input_s3_url_param_g2, destination="/opt/ml/processing/input"),
    ProcessingInput(source=input_s3_url_param_w1, destination="/opt/ml/processing/input"),
    ProcessingInput(source=input_s3_url_param_w2, destination="/opt/ml/processing/input"),
]

# Processing outputs: one local folder per dataset, uploaded to its S3 URL
processing_outputs = [
    ProcessingOutput(output_name="train_data", source="/opt/ml/processing/output/train", destination=train_s3_url),
    ProcessingOutput(output_name="validation_data", source="/opt/ml/processing/output/validation", destination=validation_s3_url),
    ProcessingOutput(output_name="test_data", source="/opt/ml/processing/output/test", destination=test_s3_url),
    ProcessingOutput(output_name="baseline_data", source="/opt/ml/processing/output/baseline", destination=baseline_s3_url),
]

# Because the processor uses a PipelineSession, run() does not start a job here;
# it returns the arguments that the ProcessingStep needs.
processor_args = sklearn_processor.run(
    inputs=processing_inputs,
    outputs=processing_outputs,
    code='preprocessing.py',
    # arguments = ['arg1', 'arg2'],
)

# Define processing step
step_process = ProcessingStep(
    name=f"{pipeline_name}-preprocess-data",
    step_args=processor_args,
)




#### Training step

In [21]:
# Look up the container image URI of the 'linear-learner' framework for the current region
LinearRegression_image_uri = sagemaker.image_uris.retrieve(framework='linear-learner',region=region)

In [22]:
# Instantiate a Linear Learner estimator object.
# The role is the one already resolved in the constants cell, as used by the
# other steps of this pipeline.
estimator = Estimator(
    image_uri=LinearRegression_image_uri,
    role=sm_role,
    instance_type=train_instance_type_param,
    instance_count=train_instance_count_param,
    output_path=output_s3_url,
    sagemaker_session=session,
    base_job_name=f"{pipeline_name}/train",
)

# Define algorithm hyperparameters.
# Linear Learner requires `predictor_type`; "regressor" selects regression.
# scikit-learn constructor arguments (fit_intercept, n_jobs, copy_X,
# n_features_in_) are not hyperparameters of this algorithm and must not be passed.
estimator.set_hyperparameters(predictor_type="regressor")

# Define training inputs: both channels read the CSV outputs of the processing step
training_inputs = {
    "train": TrainingInput(
        s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri,
        content_type="text/csv",
    ),
    "validation": TrainingInput(
        s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation_data"].S3Output.S3Uri,
        content_type="text/csv",
    ),
}

# Because the estimator uses a PipelineSession, fit() does not launch a training
# job here; it returns the arguments that the TrainingStep needs.
training_args = estimator.fit(training_inputs)

# Define the training step
step_train = TrainingStep(
    name=f"{pipeline_name}-train",
    step_args=training_args,
)


#### Evaluation step
Create a model evaluation script to check if the model performance meets the specified threshold. 

In [23]:
%%writefile evaluation.py

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

if __name__ == "__main__":   
    
    # All paths are local for the processing container
    model_path = "/opt/ml/processing/model/model.tar.gz"
    test_x_path = "/opt/ml/processing/test/test_x.csv"
    test_y_path = "/opt/ml/processing/test/test_y.csv"
    output_dir = "/opt/ml/processing/evaluation"
    output_prediction_path = "/opt/ml/processing/output/"
        
    # Read model tar file
    with tarfile.open(model_path, "r:gz") as t:
        t.extractall(path=".")
    
    # Load model
    loaded_model = joblib.load("linear_regression_model.pkl")
    
    # Read test data
    X_test = pd.read_csv(test_x_path, header=None).values
    y_test = pd.read_csv(test_y_path, header=None).to_numpy()

    # Run predictions
    test_features_numeric = X_test.drop(['DATE_TIME', 'SOURCE_KEY'], axis=1)
    predictions = model.predict(test_features_numeric)

    # Calculate RMSE
    test_label = pd.DataFrame(y_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    
    # Calculate R2 score
    r2 = r2_score(y_test, predictions)
    
    report_dict = {
        "regression_metrics": {
            "rmse": {
                "value": rmse,
       
            },
        },
    }

    # Save evaluation report
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    with open(f"{output_dir}/evaluation.json", "w") as f:
        f.write(json.dumps(report_dict))
    
    # Save prediction baseline file - we need it later for the model quality monitoring
    pd.DataFrame({"prediction": predictions,
                  "label": y_test.squeeze()}
                ).to_csv(os.path.join(output_prediction_path, 'prediction_baseline/prediction_baseline.csv'), index=False, header=True)


Overwriting evaluation.py


Create a processor to run the evaluation script and construct the evaluation step:

In [24]:
# Property file through which other steps read metrics from evaluation.json
# (located in the "evaluation" output of this step)
evaluation_report = PropertyFile(name="ModelEvaluationReport", output_name="evaluation", path="evaluation.json")

# Processor that executes evaluation.py with python3.
# NOTE(review): the script runs inside the training algorithm's image - confirm
# that this image provides python3 and the libraries evaluation.py relies on.
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=LinearRegression_image_uri,
    role=sm_role,
    instance_count=1,
    instance_type=process_instance_type_param,
    sagemaker_session=session,
    base_job_name=f"{pipeline_name}/evaluate",
)

# Inputs: the trained model artifacts and the test split of the processing step
_model_input = ProcessingInput(
    source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)
_test_input = ProcessingInput(
    source=step_process.properties.ProcessingOutputConfig.Outputs["test_data"].S3Output.S3Uri,
    destination="/opt/ml/processing/test",
)
eval_inputs = [_model_input, _test_input]

# Outputs: the evaluation report and the prediction baseline
_report_output = ProcessingOutput(
    output_name="evaluation",
    source="/opt/ml/processing/evaluation",
    destination=evaluation_s3_url,
)
_baseline_output = ProcessingOutput(
    output_name="prediction_baseline_data",
    source="/opt/ml/processing/output/prediction_baseline",
    destination=prediction_baseline_s3_url,
)
eval_outputs = [_report_output, _baseline_output]

# Step arguments for the evaluation job
eval_args = script_processor.run(code="evaluation.py", inputs=eval_inputs, outputs=eval_outputs)

step_eval = ProcessingStep(
    name=f"{pipeline_name}-evaluate-model",
    step_args=eval_args,
    property_files=[evaluation_report],
)

#### Register step
The register step creates a SageMaker model and registers a new version of a model in the SageMaker Model Registry within a [model package group](https://docs.aws.amazon.com/sagemaker/latest/dg/model-registry-model-group.html).

In [25]:
# Model object wrapping the artifacts produced by the training step
model = Model(
    name="Solar-energy-regression-model",
    image_uri=LinearRegression_image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role=sm_role,
    sagemaker_session=session,
)

# Attach the evaluation report (first output of the evaluation step) as model statistics
_evaluation_output_uri = step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=f"{_evaluation_output_uri}/evaluation.json",
        content_type="application/json",
    )
)

# Arguments for registering a new model version in the model package group
register_args = model.register(
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status_param,
    model_metrics=model_metrics,
    # TODO: update with the content/response types appropriate for regression
    content_types=["application/json"],
    response_types=["application/json"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
    transform_instances=["ml.m5.xlarge", "ml.m5.large"],
)

step_register = ModelStep(name=f"{pipeline_name}-register", step_args=register_args)


#### Fail step
Add a Pipelines [FailStep](https://sagemaker.readthedocs.io/en/stable/workflows/pipelines/sagemaker.workflow.pipelines.html#sagemaker.workflow.fail_step.FailStep) to stop the pipeline execution if the model performance metric doesn't meet the specified threshold. 

In [26]:
# Step that fails the execution with a message quoting the configured threshold
_fail_message = Join(on=" ", values=["Execution failed due to RMSE >", test_score_threshold_param])
step_fail = FailStep(name=f"{pipeline_name}-fail", error_message=_fail_message)

#### Condition step
The condition step checks the model performance score and conditionally creates a model and registers it in the model registry, or stops and fails the pipeline execution.

In [28]:
# RMSE is an error metric, so the model is accepted only when RMSE <= threshold
# (the fail step reports "RMSE > threshold"). The check is written as
# "threshold >= RMSE" because ConditionGreaterThanOrEqualTo is the comparison
# class this notebook already imports.
cond_lte = ConditionGreaterThanOrEqualTo(
    left=test_score_threshold_param,
    right=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="regression_metrics.rmse.value",
    ),
)

# Register the model when the condition holds, otherwise fail the execution
step_cond = ConditionStep(
    name=f"{pipeline_name}-check-test-score",
    conditions=[cond_lte],
    if_steps=[step_register],
    else_steps=[step_fail],
    )

#### Construct the pipeline 


In [29]:
# Parameters that can be overridden for each pipeline execution
_pipeline_parameters = [
    process_instance_type_param,
    train_instance_type_param,
    train_instance_count_param,
    model_approval_status_param,
    test_score_threshold_param,
    input_s3_url_param_g1,
    input_s3_url_param_g2,
    input_s3_url_param_w1,
    input_s3_url_param_w2,
]

# Steps handed to the pipeline; register/fail are reached through the condition step
_pipeline_steps = [step_process, step_train, step_eval, step_cond]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=_pipeline_parameters,
    steps=_pipeline_steps,
    sagemaker_session=session,
)

In [30]:
 # NOTE(review): this pinned install runs after the pipeline objects above were
 # built with the SDK version already loaded in the kernel, and pip notes that a
 # kernel restart may be needed - consider moving it to the top of the notebook.
 %pip install sagemaker==2.132.0

[0mNote: you may need to restart the kernel to use updated packages.


In [34]:
# Create the pipeline, or update it if it already exists, using the notebook's
# execution role as the pipeline role
pipeline.upsert(role_arn=sm_role)

{'PipelineArn': 'arn:aws:sagemaker:eu-central-1:567821811420:pipeline/solar1-pipeline',
 'ResponseMetadata': {'RequestId': '8e92a036-576e-4a87-bcbe-80d2c069f4d5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '8e92a036-576e-4a87-bcbe-80d2c069f4d5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '86',
   'date': 'Mon, 26 Jun 2023 05:20:09 GMT'},
  'RetryAttempts': 0}}

In [35]:
# Fetch the pipeline description and parse its JSON definition for inspection
pipeline_definition = json.loads(pipeline.describe()['PipelineDefinition'])
pipeline_definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.c5.xlarge'},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'TrainingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'TestScoreThreshold', 'Type': 'Float', 'DefaultValue': 0.5},
  {'Name': 'InputDataUrgen1',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-eu-central-1-567821811420/solar1/linerreg/input/Plant_1_Generation_Data.csv'},
  {'Name': 'InputDataUrgen2',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-eu-central-1-567821811420/solar1/linerreg/input/Plant_2_Generation_Data.csv'},
  {'Name': 'InputDataUrweather1',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-eu-central-1-567821811420/solar1/linerreg/input/Plant_1_Weather_Sensor_Data.csv'},
  {'Nam

## Execute the pipeline
The following code starts an execution of the pipeline with the specified parameters.

In [36]:
# Values passed for the pipeline parameters of this particular execution
_execution_parameters = {
    "ProcessingInstanceType": process_instance_type,
    "TrainingInstanceType": train_instance_type,
    "TrainingInstanceCount": train_instance_count,
    "ModelApprovalStatus": "PendingManualApproval",
    "TestScoreThreshold": 0.75,
    "InputDataUrgen1": input_s3_url_gen1,
    "InputDataUrgen2": input_s3_url_gen2,
    "InputDataUrweather1": input_s3_url_weather1,
    "InputDataUrweather2": input_s3_url_weather2,
}

execution = pipeline.start(parameters=_execution_parameters)


In [37]:
# Block until the pipeline execution has finished, then list its steps.
# Comment out the wait() call if the notebook should not block here.
# NOTE(review): wait() raises when the execution ends in a failed state, in
# which case list_steps() on the following line is not reached.
execution.wait()
execution.list_steps()

WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"

In [None]:
# Show the steps of the execution and their current status
execution.list_steps()