In [2]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

sm_boto3 = boto3.client("sagemaker")

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

print("Using bucket " + bucket)



Using bucket sagemaker-us-east-2-348722577408


In [3]:
# we use the Boston housing dataset
data = fetch_california_housing()

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.25, random_state=42
)

trainX = pd.DataFrame(X_train, columns=data.feature_names)
trainX["target"] = y_train

testX = pd.DataFrame(X_test, columns=data.feature_names)
testX["target"] = y_test



In [5]:
trainX.head()

trainX.to_csv("california_housing_train.csv")
testX.to_csv("california_housing_test.csv")

# send data to S3. SageMaker will take training data from s3
trainpath = sess.upload_data(
    path="california_housing_train.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)

testpath = sess.upload_data(
    path="california_housing_test.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)



In [8]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf


if __name__ == "__main__":

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="california_housing_train.csv")
    parser.add_argument("--test-file", type=str, default="california_housing_test.csv")
    parser.add_argument(
        "--features", type=str
    )  # in this script we ask user to explicitly name features
    parser.add_argument(
        "--target", type=str
    )  # in this script we ask user to explicitly name the target

    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    X_train = train_df[args.features.split()]
    X_test = test_df[args.features.split()]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    print(args.min_samples_leaf)



Overwriting script.py


In [9]:
! python script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude' \
                   --target target

extracting arguments
reading data
building training and testing datasets
training model
validating model
AE-at-10th-percentile: 0.03090791611399659
AE-at-50th-percentile: 0.2081041777777778
AE-at-90th-percentile: 0.7760247836904759
model persisted at ./model.joblib
2


In [11]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",#"ml.c5.xlarge",#"ml.m4.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": "MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude",
        "target": "target",
    },
)

import time
# AttributeError: module 'time' has no attribute 'clock'
tic = time.perf_counter()

# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=False)

toc = time.perf_counter()
print(toc - tic)

INFO:sagemaker:Creating training-job with name: rf-scikit-2024-08-06-21-51-55-287


0.5963972769677639


In [12]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

predictor = model.deploy(instance_type="ml.c5.large", initial_instance_count=1)




2024-08-06 21:51:56 Starting - Starting the training job
2024-08-06 21:52:11 Starting - Preparing the instances for training...
2024-08-06 21:52:32 Downloading - Downloading input data.....
2024-08-06 21:53:02 Downloading - Downloading the training image.......
2024-08-06 21:53:43 Training - Training image download completed. Training in progress.....
2024-08-06 21:54:08 Uploading - Uploading generated training model..
2024-08-06 21:54:21 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-2-348722577408/rf-scikit-2024-08-06-21-51-55-287/output/model.tar.gz


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-08-06-21-54-26-103
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2024-08-06-21-54-26-748
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2024-08-06-21-54-26-748


-----!

In [15]:
for i in range(100):
    # invoke endpoint
    print(predictor.predict(testX[data.feature_names]))
    

[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.0914444 ]
[0.50000471 0.75208354 4.8725301  ... 1.26443213 2.91494589 4.09

In [16]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)

{'ResponseMetadata': {'RequestId': 'ddf2c20e-6220-4b17-9189-4afe1ab9cff2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ddf2c20e-6220-4b17-9189-4afe1ab9cff2',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Tue, 06 Aug 2024 22:09:32 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}