In [1]:
import pandas as pd

# Load the housing data from the local CSV file into a DataFrame.
housing_csv_path = "housing.csv"
dataset = pd.read_csv(housing_csv_path)

In [2]:
# Sanity check: report (rows, columns), then preview the first five rows.
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))
dataset.head(5)

(506, 13)


Unnamed: 0,crim,zn,indus,chas,nox,age,rm,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,5.33,36.2


In [3]:
# Reorder the columns so the target 'medv' comes first and every other column
# keeps its original relative order. The frame is later written as header-less
# CSV, and SageMaker's built-in XGBoost reads the first CSV column as the label.
feature_columns = [name for name in dataset.columns if name != "medv"]
dataset = dataset[["medv", *feature_columns]]

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. Pinning random_state fixes the
# shuffle, so the same rows land in each split on every Restart & Run All
# (without it the split, and everything derived from it, changes per run).
training_dataset, validation_dataset = train_test_split(
    dataset, test_size=0.1, random_state=42
)

print(training_dataset.shape)
print(validation_dataset.shape)

(455, 13)
(51, 13)


In [5]:
# Write both splits as bare CSV: no index column and no header row, so each
# file contains only the data values.
csv_exports = {
    "training_dataset.csv": training_dataset,
    "validation_dataset.csv": validation_dataset,
}
for csv_name, split_frame in csv_exports.items():
    split_frame.to_csv(csv_name, index=False, header=False)

In [6]:
import sagemaker

print(sagemaker.__version__)

# One session object provides the default bucket and performs both uploads.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Common S3 key prefix for everything this notebook stores.
prefix = "boston-housing"
training_data_path = sess.upload_data(
    path="training_dataset.csv", key_prefix=f"{prefix}/input/training"
)
validation_data_path = sess.upload_data(
    path="validation_dataset.csv", key_prefix=f"{prefix}/input/validation"
)

# upload_data returns the S3 URI of each uploaded object; show both.
print(training_data_path, validation_data_path, sep="\n")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
2.203.0
s3://sagemaker-us-east-1-043612393875/boston-housing/input/training/training_dataset.csv
s3://sagemaker-us-east-1-043612393875/boston-housing/input/validation/validation_dataset.csv


In [8]:
from sagemaker.estimator import Estimator
import boto3
from sagemaker import image_uris

# Resolve the XGBoost 1.7-1 container image for the current AWS region.
region = boto3.Session().region_name
container = image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")

print(container)


# Execution role handed to the estimator.
role = sagemaker.get_execution_role()

# One ml.m5.large training instance; output goes under s3://<bucket>/<prefix>/output.
ll_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/{prefix}/output",
)

# XGBoost hyperparameters, all given as strings: a squared-error regression
# objective trained for 50 boosting rounds.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="50",
)

ll_estimator.set_hyperparameters(**hyperparameters)

683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1


In [9]:
# Both channels point at the header-less CSV files uploaded earlier, so declare
# that MIME type once and reuse it. (This variable was previously set to
# "libsvm" but never read, contradicting the "text/csv" the channels really use.)
content_type = "text/csv"

training_data_channel = sagemaker.TrainingInput(
    s3_data=training_data_path, content_type=content_type
)
validation_data_channel = sagemaker.TrainingInput(
    s3_data=validation_data_path, content_type=content_type
)

# Channel name -> input definition, in the shape passed to the estimator's fit().
ll_data = {"train": training_data_channel, "validation": validation_data_channel}

In [10]:
# Launch the training job on the "train" and "validation" channels; the call
# blocks and streams the job log until training finishes.
ll_estimator.fit(inputs=ll_data)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-07-21-39-03-222


2024-01-07 21:39:03 Starting - Starting the training job...
2024-01-07 21:39:17 Starting - Preparing the instances for training......
2024-01-07 21:40:27 Downloading - Downloading input data......
2024-01-07 21:41:12 Downloading - Downloading the training image......
2024-01-07 21:42:28 Training - Training image download completed. Training in progress.
2024-01-07 21:42:28 Uploading - Uploading generated training model[34m[2024-01-07 21:42:21.172 ip-10-0-211-240.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-01-07 21:42:21.194 ip-10-0-211-240.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-01-07:21:42:21:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-01-07:21:42:21:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-01-07:21:42:21:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024

In [11]:
%%bash -s "$ll_estimator.output_path"
# List every object stored under the estimator's S3 output path, which arrives
# as the first positional argument. The argument is quoted so a path containing
# spaces or glob characters is passed to the CLI as a single word.
aws s3 ls --recursive "$1"

2024-01-07 21:42:22          0 boston-housing/output/sagemaker-xgboost-2024-01-07-21-39-03-222/debug-output/claim.smd
2024-01-07 21:42:23       6276 boston-housing/output/sagemaker-xgboost-2024-01-07-21-39-03-222/debug-output/collections/000000000/worker_0_collections.json
2024-01-07 21:42:22        218 boston-housing/output/sagemaker-xgboost-2024-01-07-21-39-03-222/debug-output/events/000000000000/000000000000_worker_0.tfevents
2024-01-07 21:42:22        224 boston-housing/output/sagemaker-xgboost-2024-01-07-21-39-03-222/debug-output/events/000000000010/000000000010_worker_0.tfevents
2024-01-07 21:42:22        224 boston-housing/output/sagemaker-xgboost-2024-01-07-21-39-03-222/debug-output/events/000000000020/000000000020_worker_0.tfevents
2024-01-07 21:42:23        224 boston-housing/output/sagemaker-xgboost-2024-01-07-21-39-03-222/debug-output/events/000000000030/000000000030_worker_0.tfevents
2024-01-07 21:42:22        224 boston-housing/output/sagemaker-xgboost-2024-01-07-21-39-03

In [12]:
from time import strftime, gmtime

# Suffix the endpoint name with the current UTC day-hour-minute-second so that
# repeated deployments get distinct names.
timestamp = strftime("%d-%H-%M-%S", gmtime())

endpoint_name = f"xgboost-demo-{timestamp}"
print(endpoint_name)

xgboost-learner-demo-07-21-43-18


In [13]:
# Host the trained model on a single ml.t2.medium instance behind a real-time
# endpoint with the name generated above; deploy() returns a predictor for it.
ll_predictor = ll_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
    endpoint_name=endpoint_name,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-01-07-21-43-18-335
INFO:sagemaker:Creating endpoint-config with name xgboost-learner-demo-07-21-43-18
INFO:sagemaker:Creating endpoint with name xgboost-learner-demo-07-21-43-18


--------!

In [14]:
# One CSV record of the 12 feature values (no 'medv' label); these are the
# feature values of the first row shown in the dataset preview.
test_sample = ",".join(
    [
        "0.00632",
        "18.00",
        "2.310",
        "0",
        "0.5380",
        "6.5750",
        "65.20",
        "4.0900",
        "1",
        "296.0",
        "15.30",
        "4.98",
    ]
)

In [15]:
# Send requests as CSV and parse CSV responses: the serializer encodes the
# payload, and the deserializer turns the reply into a list of rows.
csv_serializer = sagemaker.serializers.CSVSerializer()
csv_deserializer = sagemaker.deserializers.CSVDeserializer()
ll_predictor.serializer = csv_serializer
ll_predictor.deserializer = csv_deserializer

# Score the single sample record.
response = ll_predictor.predict(test_sample)
print(response)

[['24.39341926574707']]


In [16]:
# Two CSV feature records scored in one request; the response carries one
# prediction row per record.
first_record = "0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98"
second_record = "0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,9.14"
test_samples = [first_record, second_record]

response = ll_predictor.predict(test_samples)
print(response, ll_predictor.endpoint_name, sep="\n")
print(ll_predictor.endpoint_name)

[['24.39341926574707'], ['22.241615295410156']]
xgboost-learner-demo-07-21-43-18


In [17]:
# Call the endpoint through the low-level boto3 runtime client rather than the
# SageMaker SDK predictor. "sagemaker-runtime" is the documented service name;
# "runtime.sagemaker" is a legacy alias for the same client.
runtime = boto3.Session().client(service_name="sagemaker-runtime")

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name, ContentType="text/csv", Body=test_sample
)

# The response body is a stream; read() returns the raw bytes of the reply.
print(response["Body"].read())

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


b'24.39341926574707\n'


In [18]:
# Tear down the endpoint together with its endpoint configuration so the
# hosting instance stops running.
ll_predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-learner-demo-07-21-43-18
INFO:sagemaker:Deleting endpoint with name: xgboost-learner-demo-07-21-43-18
