In [81]:
import pandas as pd
import numpy as np
import boto3

import sagemaker
from sagemaker import get_execution_role
from sagemaker.s3 import S3Uploader

from sklearn.datasets import load_boston
import sklearn.model_selection

In [82]:
# Resolve the SageMaker session, the notebook's execution role and the
# AWS region that every later cell relies on.
session = sagemaker.Session()
role = get_execution_role()
region = boto3.Session().region_name

# S3 bucket URI and key prefix for this use case.
bucket = "s3://my-sage-code-samples"
prefix = 'use-case-1'

# Echo the resolved settings, one per line.
print("Region: " + region, "bucket: " + bucket, sep="\n")

Region: us-east-1
bucket: s3://my-sage-code-samples


In [83]:
# Training job settings: the XGBoost container image for the current region
# and the instance fleet the job runs on.
# NOTE(review): 'latest' appears to resolve to a legacy XGBoost image rather
# than the newest release -- confirm against the SageMaker docs before pinning.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='latest',
)
instance_type = 'ml.m4.xlarge'
instance_count = 1

In [84]:
# Load the Boston housing data into a feature frame and a target frame.
# NOTE(review): load_boston is deprecated in scikit-learn 1.0 and removed in
# 1.2; this cell needs a different data source on newer scikit-learn versions.
dataset = load_boston()
X_bos_pd = pd.DataFrame(dataset.data, columns=dataset.feature_names)
Y_bos_pd = pd.DataFrame(dataset.target)

# Split the rows into roughly 2/3 training and 1/3 testing. The fixed
# random_state makes the split (and therefore the trained model) reproducible
# across Restart & Run All.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X_bos_pd, Y_bos_pd, test_size=0.33, random_state=42
)

# Write header-less CSVs: the training file carries the target in the first
# column followed by the features; the test file carries features only.
train_csv = '../data/train.csv'
test_csv = '../data/test.csv'
pd.concat([Y_train, X_train], axis=1).to_csv(train_csv, header=False, index=False)
X_test.to_csv(test_csv, header=False, index=False)

# Build the upload destination from the configured bucket and prefix instead
# of repeating the literal URI.
s3_data_uri = "/".join([bucket, prefix, "data"])
S3Uploader.upload(train_csv, s3_data_uri)
S3Uploader.upload(test_csv, s3_data_uri)

's3://my-sage-code-samples/use-case-1/data/test.csv'

In [85]:
# S3 locations for the job: where the model artifacts are written and where
# the training CSV is read from.
output_dir = "s3://my-sage-code-samples/use-case-1/output"
train_channel = "s3://my-sage-code-samples/use-case-1/data/train.csv"
input_mode = 'File'

# Wrap the training CSV location as a training input with its content type.
s3_input_train = sagemaker.inputs.TrainingInput(
    content_type='csv',
    s3_data=train_channel,
)

In [86]:
# Create the estimator that runs the XGBoost container as a training job,
# using the role, instance settings and output location configured earlier.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    # Reuse the session created in the setup cell rather than constructing a
    # second, otherwise identical one.
    sagemaker_session=session,
    instance_count=instance_count,
    instance_type=instance_type,
    input_mode=input_mode,
    output_path=output_dir,
)

In [87]:
# XGBoost hyperparameters for the regression job, collected in one mapping
# so they can be inspected before being handed to the estimator.
# NOTE(review): early_stopping_rounds normally relies on a validation channel,
# but only a 'train' channel is passed to fit -- confirm it has any effect.
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "objective": 'reg:linear',
    "early_stopping_rounds": 10,
    "num_round": 200,
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [88]:
# Launch the training job, supplying the CSV input under the "train" channel.
estimator.fit(inputs={"train": s3_input_train})

2022-03-03 21:48:00 Starting - Starting the training job...
2022-03-03 21:48:04 Starting - Launching requested ML instancesProfilerReport-1646344080: InProgress
.........
2022-03-03 21:49:42 Starting - Preparing the instances for training.........
2022-03-03 21:51:22 Downloading - Downloading input data
2022-03-03 21:51:22 Training - Downloading the training image...
2022-03-03 21:51:54 Uploading - Uploading generated training model
2022-03-03 21:51:54 Completed - Training job completed
[34mArguments: train[0m
[34m[2022-03-03:21:51:45:INFO] Running standalone xgboost training.[0m
[34m[2022-03-03:21:51:45:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-03-03:21:51:45:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8346.32mb[0m
[34m[2022-03-03:21:51:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[21:51:45] S3DistributionType set as FullyReplicated[0m
[34m[21:51:45] 339x13 matrix with 4407 entries lo

In [78]:
# Run a batch transform over the uploaded test features with the trained model.
test_location = "s3://my-sage-code-samples/use-case-1/data/test.csv"

# The transformer writes its results under the same output location as training.
xgb_transformer = estimator.transformer(
    role=role,
    output_path=output_dir,
    instance_type='ml.m4.xlarge',
    instance_count=1,
)

# The input is CSV, split on line boundaries.
xgb_transformer.transform(
    test_location,
    split_type='Line',
    content_type='text/csv',
)

...............................[34mArguments: serve[0m
[34m[2022-03-03 21:38:03 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-03-03 21:38:03 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-03-03 21:38:03 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-03-03 21:38:03 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2022-03-03 21:38:03 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2022-03-03 21:38:03 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-03:21:38:03:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-03:21:38:03:INFO] Model loaded successfully for worker : 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-03:21:38:03:INFO] Model loaded successfully for worker : 24[0m
[34m[2022-03-03 21:38:03 +0000] [25] [INFO] Booting worker with pid: 25[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03

In [80]:
# Block until the batch transform job started above has finished.
xgb_transformer.wait()

[34mArguments: serve[0m
[34m[2022-03-03 21:38:03 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-03-03 21:38:03 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35mArguments: serve[0m
[35m[2022-03-03 21:38:03 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2022-03-03 21:38:03 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-03-03 21:38:03 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-03-03 21:38:03 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2022-03-03 21:38:03 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2022-03-03 21:38:03 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-03:21:38:03:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-03:21:38:03:INFO] Model loaded successfully for worker : 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-03:21:38:03:INFO] Model loaded successfu