In [1]:
from datetime import datetime, timedelta, timezone
import json
import os
import re
import boto3
from time import sleep
from threading import Thread

import pandas as pd

from sagemaker import get_execution_role, session, Session, image_uris
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.processing import ProcessingJob
from sagemaker.serializers import CSVSerializer

from sagemaker.model import Model
from sagemaker.model_monitor import DataCaptureConfig

from sagemaker.model_monitor import ModelQualityMonitor, ModelBiasMonitor
from sagemaker.model_monitor import EndpointInput
from sagemaker.model_monitor.dataset_format import DatasetFormat

session = Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
bucket = session.default_bucket()
print(bucket)

sagemaker-us-east-1-122442915072


In [3]:
import os
import boto3
import sagemaker

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name

bucket = sess.default_bucket()
prefix = "xgboost-final"

### Deploy Model

In [4]:
endpoint_name = f"xgb-Final-project-{datetime.utcnow():%Y-%m-%d-%H%M}"
s3_capture_upload_path = f"s3://{bucket}/{prefix}/capture"

In [5]:
data_capture_config = DataCaptureConfig(
    enable_capture=True, sampling_percentage=100, destination_s3_uri=s3_capture_upload_path
)

model = Model(image_uri="683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1", model_data="s3://sagemaker-us-east-1-122442915072/xgboost-final/output/xgb-2024-06-23-01-57-14/xgb-2024-06-23-01-57-14/output/model.tar.gz", role=role, sagemaker_session=session)


In [6]:
model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    data_capture_config=data_capture_config,
)

-------!

In [7]:
from sagemaker.predictor import Predictor

predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=session, serializer=CSVSerializer()
)

### Monitoring

In [8]:
from sagemaker.clarify import (
    BiasConfig,
    DataConfig,
    ModelConfig,
    ModelPredictedLabelConfig,
    SHAPConfig,
)

In [9]:
model_bias_baselining_job_result_uri = f"s3://{bucket}/{prefix}/model_bias"

In [10]:
validation_file = "s3://" + bucket + "/" + prefix + "/validation/validation_data.csv"
train_file = "s3://" + bucket + "/" + prefix + "/train/train_data.csv"
batch_file = "s3://" + bucket + "/" + prefix + "/batch/batch_data.csv"

In [11]:
from tqdm import tqdm

limit = 600  
i = 0
with open("validation_data.csv", "r") as f:
    for row in tqdm(f):
        if(i == 0):
            i+=1
            continue
        (label, input_cols) = row.split(",", 1)
        probability = float(predictor.predict(input_cols))
        i += 1
        if i > limit:
            break
        sleep(0.1)
print("Done!")

600it [01:09,  8.68it/s]

Done!





In [12]:
model_bias_monitor = ModelBiasMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
    sagemaker_session=session,
)

model_bias_data_config = DataConfig(
    s3_data_input_path= validation_file,
    s3_output_path=model_bias_baselining_job_result_uri,
    label = "price"
)

In [13]:
model_bias_config = BiasConfig(
    label_values_or_threshold=[16890.124046434496], #The mean price
    facet_name="year",
    facet_values_or_threshold = [2010]
)

In [14]:
model_config = ModelConfig(
    endpoint_name =endpoint_name,
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

In [15]:
import pandas as pd

In [16]:
baseline_job_bias = model_bias_monitor.suggest_baseline(
    model_config=model_config,
    data_config=model_bias_data_config,
    bias_config=model_bias_config,
)
baseline_job_bias.wait(logs=True)

INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2024-06-23-06-03-14-521


[34mINFO:sagemaker-clarify-processing:Starting SageMaker Clarify Processing job[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis result path: /opt/ml/processing/output[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is algo-1.[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is the leader.[0m
[34mINFO:analyzer.data_loading.data_loader_util:Number of hosts in the cluster is 1.[0m
[34mINFO:sagemaker-clarify-processing:Running Python / Pandas based analyzer.[0m
[34mINFO:analyzer.data_loading.data_loader_factory:Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
  df = df.append(df_tmp, ignore_index=True)[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34mINFO:sagemaker-clarify-processing:Loading dataset...[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34mINFO:sagemaker-clarify-processing:Lo

### Generate Fake Ground Truth

The method to do this will be using a normal distubation using the mean and standard deviation of the price. 

In [17]:
ground_truth_upload_path = "s3://" + bucket + "/" + prefix + "groundtruth"

In [18]:
import numpy as np
df = pd.read_csv(validation_file)
price = df.price
price_mean = price.mean()
price_std = price.std()
def ground_truth_with_id(inference_id):
    GroundTruthDistrubation = np.random.normal(price_mean, price_std, 1)[0]
    return {
        "groundTruthData": {
            "data": GroundTruthDistrubation,
            "encoding": "CSV",
        },
        "eventMetadata": {
            "eventId": str(inference_id),
        },
        "eventVersion": "0",
    }

def upload_ground_truth(records, upload_time):
    fake_records = [json.dumps(r) for r in records]
    data_to_upload = "\n".join(fake_records)
    target_s3_uri = f"{ground_truth_upload_path}/{upload_time:%Y/%m/%d/%H/%M%S}.jsonl"
    print(f"Uploading {len(fake_records)} records to", target_s3_uri)
    S3Uploader.upload_string_as_file_body(data_to_upload, target_s3_uri)


severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [19]:
NUM_GROUND_TRUTH_RECORDS = len(df)  

def generate_fake_ground_truth_forever():
    j = 0
    while 1:
        fake_records = [ground_truth_with_id(i) for i in range(NUM_GROUND_TRUTH_RECORDS)]
        upload_ground_truth(fake_records, datetime.utcnow())
        j = (j + 1) % 5
        sleep(60 * 60)  



gt_thread = Thread(target=generate_fake_ground_truth_forever)
gt_thread.start()

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Uploading 10657 records to s3://sagemaker-us-east-1-122442915072/xgboost-finalgroundtruth/2024/06/23/06/0841.jsonl


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Uploading 10657 records to s3://sagemaker-us-east-1-122442915072/xgboost-finalgroundtruth/2024/06/23/07/0842.jsonl


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Uploading 10657 records to s3://sagemaker-us-east-1-122442915072/xgboost-finalgroundtruth/2024/06/23/09/0843.jsonl


In [20]:
from sagemaker.model_monitor import (
    BiasAnalysisConfig,
    CronExpressionGenerator,
    DataCaptureConfig,
    EndpointInput,
    ExplainabilityAnalysisConfig,
    ModelBiasMonitor,
    ModelExplainabilityMonitor,
)

model_bias_analysis_config = BiasAnalysisConfig(
    model_bias_config,
    label="price",
)

In [21]:
model_bias_monitor.create_monitoring_schedule(
    analysis_config=model_bias_analysis_config,
    output_s3_uri="s3://" + bucket + "/" + prefix + "monitor",
    endpoint_input=EndpointInput(
        endpoint_name=endpoint_name,
        destination="/opt/ml/processing/input/endpoint",
    ),
    ground_truth_input=ground_truth_upload_path,
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)

INFO:sagemaker.model_monitor.clarify_model_monitoring:Uploading analysis config to {s3_uri}.
INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: monitoring-schedule-2024-06-23-06-08-41-063


In [22]:
model_bias_monitor.describe_schedule()

{'MonitoringScheduleArn': 'arn:aws:sagemaker:us-east-1:122442915072:monitoring-schedule/monitoring-schedule-2024-06-23-06-08-41-063',
 'MonitoringScheduleName': 'monitoring-schedule-2024-06-23-06-08-41-063',
 'MonitoringScheduleStatus': 'Pending',
 'MonitoringType': 'ModelBias',
 'CreationTime': datetime.datetime(2024, 6, 23, 6, 8, 41, 763000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 6, 23, 6, 8, 41, 784000, tzinfo=tzlocal()),
 'MonitoringScheduleConfig': {'ScheduleConfig': {'ScheduleExpression': 'cron(0 * ? * * *)'},
  'MonitoringJobDefinitionName': 'model-bias-job-definition-2024-06-23-06-08-41-063',
  'MonitoringType': 'ModelBias'},
 'EndpointName': 'xgb-Final-project-2024-06-23-0558',
 'ResponseMetadata': {'RequestId': '046852d7-9a67-418f-94f5-9a9eb0cc3b38',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '046852d7-9a67-418f-94f5-9a9eb0cc3b38',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '588',
   'date': 'Sun, 23 Jun 

In [None]:
executions = model_bias_monitor.list_executions()
while True:
    execution = model_bias_monitor.describe_schedule().get(
        "LastMonitoringExecutionSummary"
    )
    if execution:
        break
    print(".", end="", flush=True)
    sleep(60)

In [None]:
latest_execution = model_bias_monitor.list_executions()[-1]
report_uri = latest_execution.describe()["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
report_uri

### CI/CD code 

In [None]:
## To Create CodeCommit Repository

In [None]:
!aws codecommit create-repository --repository-name CarPricePrediction


In [None]:
## To Create CodeBuild Project

In [None]:
!aws codebuild create-project --name CarPricePredictionBuild \
  --source type=CODECOMMIT,location=CarPricePrediction \
  --artifacts type=NO_ARTIFACTS \
  --environment type=LINUX_CONTAINER,computeType=BUILD_GENERAL1_SMALL,image=aws/codebuild/standard:4.0 \
  --service-role arn:aws:iam::364723530364:role/CodeBuildServiceRole


In [None]:
## To Create CodePipeline

In [None]:
!aws codepipeline create-pipeline --pipeline file://pipeline.json


In [None]:
## Create Lambda Function for Deployment:

In [None]:
import boto3

def lambda_handler(event, context):
    client = boto3.client('sagemaker')
    endpoint_name = 'xgb-Final-project'
    
    response = client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName='CarPricePredictionEndpointConfig'
    )
    
    return response


In [None]:
!aws cloudformation deploy --codepipeline.yml --stack-name MySageMakerCICDPipeline --capabilities CAPABILITY_IAM
