In [1]:
%pip install sagemaker --upgrade --quiet 

Note: you may need to restart the kernel to use updated packages.


In [2]:
#%pip install -q  xgboost==1.3.1 pandas==1.0.5

In [3]:
import pandas as pd
import boto3
import sagemaker
import json
import joblib
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Setting SageMaker variables
sess = sagemaker.Session()
write_bucket = sess.default_bucket()
write_prefix = "fraud-detect-demo"

region = sess.boto_region_name
s3_client = boto3.client("s3", region_name=region)

sagemaker_role = sagemaker.get_execution_role()
sagemaker_client = boto3.client("sagemaker")
read_bucket = "sagemaker-sample-files"
read_prefix = "datasets/tabular/synthetic_automobile_claims" 


# Setting S3 location for read and write operations
train_data_key = f"{read_prefix}/train.csv"
test_data_key = f"{read_prefix}/test.csv"
validation_data_key = f"{read_prefix}/validation.csv"
model_key = f"{write_prefix}/model"
output_key = f"{write_prefix}/output"


train_data_uri = f"s3://{read_bucket}/{train_data_key}"
test_data_uri = f"s3://{read_bucket}/{test_data_key}"
validation_data_uri = f"s3://{read_bucket}/{validation_data_key}"
model_uri = f"s3://{write_bucket}/{model_key}"
output_uri = f"s3://{write_bucket}/{output_key}"
estimator_output_uri = f"s3://{write_bucket}/{write_prefix}/training_jobs"
bias_report_output_uri = f"s3://{write_bucket}/{write_prefix}/clarify-output/bias"
explainability_report_output_uri = f"s3://{write_bucket}/{write_prefix}/clarify-output/explainability"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:
sagemaker_role

'arn:aws:iam::425766660658:role/Comprehend-Immersion-day-SageMakerIamRole-qTWftlkNGEjg'

In [5]:
print(write_bucket)

sagemaker-us-east-1-425766660658


In [6]:
print(train_data_key)
print(test_data_key)
print(validation_data_key)
print(model_key)
print(output_key)

datasets/tabular/synthetic_automobile_claims/train.csv
datasets/tabular/synthetic_automobile_claims/test.csv
datasets/tabular/synthetic_automobile_claims/validation.csv
fraud-detect-demo/model
fraud-detect-demo/output


In [7]:
print(train_data_uri)
print(test_data_uri)
print(validation_data_uri)
print(model_uri)
print(output_uri)
print(estimator_output_uri)
print(bias_report_output_uri)
print(explainability_report_output_uri)

s3://sagemaker-sample-files/datasets/tabular/synthetic_automobile_claims/train.csv
s3://sagemaker-sample-files/datasets/tabular/synthetic_automobile_claims/test.csv
s3://sagemaker-sample-files/datasets/tabular/synthetic_automobile_claims/validation.csv
s3://sagemaker-us-east-1-425766660658/fraud-detect-demo/model
s3://sagemaker-us-east-1-425766660658/fraud-detect-demo/output
s3://sagemaker-us-east-1-425766660658/fraud-detect-demo/training_jobs
s3://sagemaker-us-east-1-425766660658/fraud-detect-demo/clarify-output/bias
s3://sagemaker-us-east-1-425766660658/fraud-detect-demo/clarify-output/explainability


In [8]:
tuning_job_name_prefix = "xgbtune" 
training_job_name_prefix = "xgbtrain"

xgb_model_name = "fraud-detect-xgb-model"
endpoint_name_prefix = "xgb-fraud-model-dev"
train_instance_count = 1
train_instance_type = "ml.m4.xlarge"
predictor_instance_count = 1
predictor_instance_type = "ml.m4.xlarge"
clarify_instance_count = 1
clarify_instance_type = "ml.m4.xlarge"

In [9]:
%%writefile xgboost_train.py

import argparse
import os
import joblib
import json
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyperparameters and algorithm parameters are described here
    parser.add_argument("--num_round", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=3)
    parser.add_argument("--eta", type=float, default=0.2)
    parser.add_argument("--subsample", type=float, default=0.9)
    parser.add_argument("--colsample_bytree", type=float, default=0.8)
    parser.add_argument("--objective", type=str, default="binary:logistic")
    parser.add_argument("--eval_metric", type=str, default="auc")
    parser.add_argument("--nfold", type=int, default=3)
    parser.add_argument("--early_stopping_rounds", type=int, default=3)
    

    # SageMaker specific arguments. Defaults are set in the environment variables
    # Location of input training data
    parser.add_argument("--train_data_dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    # Location of input validation data
    parser.add_argument("--validation_data_dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
    # Location where trained model will be stored. Default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    # Location where model artifacts will be stored. Default set by SageMaker, /opt/ml/output/data
    parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))
    
    args = parser.parse_args()

    data_train = pd.read_csv(f"{args.train_data_dir}/train.csv")
    train = data_train.drop("fraud", axis=1)
    label_train = pd.DataFrame(data_train["fraud"])
    dtrain = xgb.DMatrix(train, label=label_train)
    
    
    data_validation = pd.read_csv(f"{args.validation_data_dir}/validation.csv")
    validation = data_validation.drop("fraud", axis=1)
    label_validation = pd.DataFrame(data_validation["fraud"])
    dvalidation = xgb.DMatrix(validation, label=label_validation)

    params = {"max_depth": args.max_depth,
              "eta": args.eta,
              "objective": args.objective,
              "subsample" : args.subsample,
              "colsample_bytree":args.colsample_bytree
             }
    
    num_boost_round = args.num_round
    nfold = args.nfold
    early_stopping_rounds = args.early_stopping_rounds
    
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        nfold=nfold,
        early_stopping_rounds=early_stopping_rounds,
        metrics=["auc"],
        seed=42,
    )
    
    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=len(cv_results))
    
    train_pred = model.predict(dtrain)
    validation_pred = model.predict(dvalidation)
    
    train_auc = roc_auc_score(label_train, train_pred)
    validation_auc = roc_auc_score(label_validation, validation_pred)
    
    print(f"[0]#011train-auc:{train_auc:.2f}")
    print(f"[0]#011validation-auc:{validation_auc:.2f}")

    metrics_data = {"hyperparameters" : params,
                    "binary_classification_metrics": {"validation:auc": {"value": validation_auc},
                                                      "train:auc": {"value": train_auc}
                                                     }
                   }
              
    # Save the evaluation metrics to the location specified by output_data_dir
    metrics_location = args.output_data_dir + "/metrics.json"
    
    # Save the model to the location specified by model_dir
    model_location = args.model_dir + "/xgboost-model"

    with open(metrics_location, "w") as f:
        json.dump(metrics_data, f)

    with open(model_location, "wb") as f:
        joblib.dump(model, f)

Overwriting xgboost_train.py


In [10]:
# Setting ranges of hyperparameters to be tuned
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "subsample": ContinuousParameter(0.7, 0.95),
    "colsample_bytree": ContinuousParameter(0.7, 0.95),
    "max_depth": IntegerParameter(1, 5)
}

In [11]:
# SageMaker estimator

# Set static hyperparameters that will not be tuned
static_hyperparams = {  
                        "eval_metric" : "auc",
                        "objective": "binary:logistic",
                        "num_round": "5"
                      }

xgb_estimator = XGBoost(
                        entry_point="xgboost_train.py",
                        output_path=estimator_output_uri,
                        code_location=estimator_output_uri,
                        hyperparameters=static_hyperparams,
                        role=sagemaker_role,
                        instance_count=train_instance_count,
                        instance_type=train_instance_type,
                        framework_version="1.3-1",
                        base_job_name=training_job_name_prefix
                    )

In [15]:
train_instance_type

'ml.m4.xlarge'

In [12]:
objective_metric_name = "validation:auc"

# Setting up tuner object
tuner_config_dict = {
                     "estimator" : xgb_estimator,
                     "max_jobs" : 1,
                     "max_parallel_jobs" : 2,
                     "objective_metric_name" : objective_metric_name,
                     "hyperparameter_ranges" : hyperparameter_ranges,
                     "base_tuning_job_name" : tuning_job_name_prefix,
                     "strategy" : "Random"
                    }
tuner = HyperparameterTuner(**tuner_config_dict)

In [17]:
# Setting the input channels for tuning job
s3_input_train = TrainingInput(
    s3_data="s3://{}/{}".format(read_bucket, train_data_key), 
    content_type="csv", 
    s3_data_type="S3Prefix"
)


s3_input_validation = (TrainingInput(
    s3_data="s3://{}/{}".format(read_bucket, validation_data_key), 
    content_type="csv", 
    s3_data_type="S3Prefix")
)

tuner.fit(
    inputs={"train": s3_input_train, "validation": s3_input_validation}, 
    include_cls_metadata=False)

tuner.wait()

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.................................!
!


In [20]:
# Summary of tuning results ordered in descending order of performance
df_tuner = sagemaker.HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.job_name).dataframe()
df_tuner = df_tuner[df_tuner["FinalObjectiveValue"]>-float('inf')].sort_values("FinalObjectiveValue", ascending=False)
df_tuner

Unnamed: 0,colsample_bytree,eta,max_depth,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.887207,0.387559,3.0,0.773799,xgbtune-240506-0320-001-495f2150,Completed,0.7,2024-05-06 03:21:12+00:00,2024-05-06 03:23:07+00:00,115.0


In [21]:
sagemaker_role

'arn:aws:iam::425766660658:role/Comprehend-Immersion-day-SageMakerIamRole-qTWftlkNGEjg'

In [None]:
best_train_job_name = tuner.best_training_job()

model_path = estimator_output_uri + '/' + best_train_job_name + '/output/model.tar.gz'
training_image = retrieve(
    framework="xgboost", 
    region=region, 
    version="1.3-1"
)

create_model_config = {
    "model_data":model_path,
    "role": sagemaker_role,
    "image_uri":training_image,
    "name":endpoint_name_prefix,
    "predictor_cls":sagemaker.predictor.Predictor
}

# Create a SageMaker model
model = sagemaker.model.Model(**create_model_config)

# Deploy the best model and get access to a SageMaker Predictor
predictor = model.deploy(initial_instance_count=predictor_instance_count, 
                         instance_type=predictor_instance_type,
                         serializer=CSVSerializer(),
                         deserializer=CSVDeserializer())
print(f"\nModel deployed at endpoint : {model.endpoint_name}")

In [29]:
best_train_job_name

'xgbtune-240506-0320-001-495f2150'

In [24]:
# Sample test data
test_df = pd.read_csv(test_data_uri)
payload = test_df.drop(["fraud"], axis=1).iloc[0].to_list()
print(f"Model predicted score : {float(predictor.predict(payload)[0][0]):.3f}, True label : {test_df['fraud'].iloc[0]}")

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Model predicted score : 0.068, True label : 0


In [25]:
test_df

Unnamed: 0,fraud,num_vehicles_involved,num_injuries,num_witnesses,police_report_available,injury_claim,vehicle_claim,total_claim_amount,incident_month,incident_day,...,authorities_contacted_ambulance,policy_state_ca,policy_state_az,policy_state_nv,policy_state_id,policy_state_wa,policy_state_or,customer_gender_other,customer_gender_male,customer_gender_female
0,0,2,0,0,1,58400,14247.766867,72647.766867,7,29,...,0,1,0,0,0,0,0,0,0,1
1,0,2,0,2,0,11500,10675.671347,22175.671347,11,28,...,0,1,0,0,0,0,0,0,0,1
2,0,2,0,0,1,18500,10202.266354,28702.266354,1,6,...,0,1,0,0,0,0,0,0,1,0
3,0,1,0,0,0,16300,9338.348066,25638.348066,9,1,...,0,1,0,0,0,0,0,0,1,0
4,0,2,1,0,0,14700,28145.924994,42845.924994,10,23,...,1,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,0,2,3,0,1,15800,18233.436944,34033.436944,7,11,...,0,1,0,0,0,0,0,0,0,1
519,0,3,0,2,1,19500,9085.293835,28585.293835,1,7,...,0,1,0,0,0,0,0,0,1,0
520,0,2,0,3,0,13000,5878.370270,18878.370270,11,5,...,0,1,0,0,0,0,0,0,1,0
521,0,3,1,2,1,3400,21762.452040,25162.452040,10,1,...,0,1,0,0,0,0,0,0,0,1


In [27]:
payload = test_df.drop(["fraud"], axis=1).iloc[100].to_list()
print(f"Model predicted score : {float(predictor.predict(payload)[0][0]):.3f}, True label : {test_df['fraud'].iloc[0]}")

Model predicted score : 0.068, True label : 0


## Invoke Model from Endpoint using the Runtime Sagemaker

* In case you want to use this code to invoke from an Amazon Lambda or EC2 instance.

In [34]:
ENDPOINT_NAME = 'xgb-fraud-model-dev-2024-05-06-03-30-32-457'
runtime= boto3.client('runtime.sagemaker')

event = {
    "data": "2.0,0.0,0.0,0.0,22900.0,17573.98739734717,40473.98739734717,11.0,26.0,1.0,15.0,32.0,59.0,0.0,2.0,750.0,3000.0,3.0,3.0,2013.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0"
}

print("Received event: " + json.dumps(event, indent=2))

data = json.loads(json.dumps(event))
payload = data['data']
print(payload)



Received event: {
  "data": "2.0,0.0,0.0,0.0,22900.0,17573.98739734717,40473.98739734717,11.0,26.0,1.0,15.0,32.0,59.0,0.0,2.0,750.0,3000.0,3.0,3.0,2013.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0"
}
2.0,0.0,0.0,0.0,22900.0,17573.98739734717,40473.98739734717,11.0,26.0,1.0,15.0,32.0,59.0,0.0,2.0,750.0,3000.0,3.0,3.0,2013.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [45]:
response = runtime.invoke_endpoint(
    EndpointName=ENDPOINT_NAME,
    ContentType='text/csv',
    Body=payload
)
print(response)

{'ResponseMetadata': {'RequestId': '317542ac-496a-4941-b7e3-3110068fd1b5', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '317542ac-496a-4941-b7e3-3110068fd1b5', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Mon, 06 May 2024 04:02:47 GMT', 'content-type': 'text/csv; charset=utf-8', 'content-length': '20', 'connection': 'keep-alive'}, 'RetryAttempts': 0}, 'ContentType': 'text/csv; charset=utf-8', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7efe94671000>}


In [46]:
result = json.loads(response['Body'].read().decode())
print(result)

0.06792672723531723


## Delete Endpoint

In [28]:
predictor.delete_endpoint()

In [47]:
endpoint_name_prefix

'xgb-fraud-model-dev'