In [1]:
!pip install --upgrade sagemaker



In [2]:
import os
import boto3
import copy
import time
from time import gmtime, strftime, sleep
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve
from pprint import pprint
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# #1 - Initial Setup

In [3]:
# S3 layout: every artifact in this notebook lives under
# s3://<bucket_name>/<prefix>/ (bucket_path already includes the prefix).
bucket_name = "training-models-on-amazon-sagemaker"
prefix = "mnist/xgboost"
bucket_path = "s3://" + bucket_name + "/" + prefix

# SageMaker handles
role = get_execution_role()               # execution role passed to training/tuning jobs
region = boto3.Session().region_name      # region of the default boto3 session
sess = sagemaker.Session()                # high-level SDK session

# Low-level SageMaker API client (used for the hyperparameter tuning calls)
sm = boto3.client("sagemaker")

# Function to save datasets in libsvm format
def save_as_libsvm(X, y, file_name):
    """Write samples ``X`` with labels ``y`` to ``file_name`` in libsvm text format.

    Each sample becomes one line: the label cast to ``int``, then
    space-separated ``index:value`` pairs using 1-based feature indices.
    Every feature is written, including zeros.

    Args:
        X: Indexable sequence of rows; each row is a sequence of feature values.
        y: Indexable sequence of labels, one per row of ``X``.
        file_name: Path of the text file to create (overwritten if present).
    """
    with open(file_name, 'w') as out:
        for row_idx, features in enumerate(X):
            pairs = ' '.join(
                f'{col}:{value}' for col, value in enumerate(features, start=1)
            )
            out.write(f'{int(y[row_idx])} {pairs}\n')

            
# Upload local files to S3 with bucket check
def upload_to_s3(local_file, s3_path):
    """Upload ``local_file`` to ``s3://{bucket_name}/{s3_path}``, creating the bucket if needed.

    Uses the module-level ``bucket_name`` and ``region``.

    Args:
        local_file: Path of the local file to upload.
        s3_path: Object key inside ``bucket_name``.

    Raises:
        botocore.exceptions.ClientError: If the bucket check fails for any
            reason other than "bucket not found" (e.g. access denied), or if
            bucket creation or the upload itself is rejected by S3.
    """
    s3_client = boto3.client('s3')

    # Check if the bucket exists, and create it only if it is really missing.
    try:
        s3_client.head_bucket(Bucket=bucket_name)
    except s3_client.exceptions.ClientError as err:
        error_code = str(err.response.get('Error', {}).get('Code', ''))
        if error_code not in ('404', 'NoSuchBucket', 'NotFound'):
            # Access denied, throttling, etc. -- creating a bucket would not
            # help and would only hide the real problem.
            raise

        create_kwargs = {'Bucket': bucket_name}
        # us-east-1 is the default location; S3 rejects an explicit
        # LocationConstraint of 'us-east-1'.
        if region and region != 'us-east-1':
            create_kwargs['CreateBucketConfiguration'] = {
                'LocationConstraint': region
            }
        s3_client.create_bucket(**create_kwargs)

    # Upload the file to S3
    s3_client.upload_file(local_file, bucket_name, s3_path)
    print(f"Uploaded {local_file} to s3://{bucket_name}/{s3_path}")

    
# Training and validation data preparation
digits = datasets.load_digits()
X = digits.data  # raw (unscaled) features
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
# Fitting on the full dataset before splitting would leak validation
# statistics (mean/variance) into the training features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Save data in libsvm format
save_as_libsvm(X_train, y_train, 'train.libsvm')
save_as_libsvm(X_test, y_test, 'validation.libsvm')

# Upload to S3
upload_to_s3('train.libsvm', f'{prefix}/data/train.libsvm')
upload_to_s3('validation.libsvm', f'{prefix}/data/validation.libsvm')

Uploaded train.libsvm to s3://training-models-on-amazon-sagemaker/mnist/xgboost/data/train.libsvm
Uploaded validation.libsvm to s3://training-models-on-amazon-sagemaker/mnist/xgboost/data/validation.libsvm


# #2 - Model Training

In [4]:
# Resolve the container image of the built-in XGBoost algorithm for this region
container = retrieve(framework="xgboost", region=region, version="1.7-1")

# Configure a single-instance training job with fixed hyperparameters;
# model artifacts are written under <bucket_path>/training-jobs.
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.4xlarge",
    output_path=f"{bucket_path}/training-jobs",
    hyperparameters=dict(
        max_depth="5",
        eta="0.2",
        gamma="4",
        min_child_weight="6",
        verbosity="0",
        objective="multi:softmax",
        num_class="10",
        num_round="10",
    ),
)

# Launch training with one libsvm channel per uploaded split
xgb_estimator.fit({
    channel: TrainingInput(f"{bucket_path}/data/{channel}.libsvm", content_type="libsvm")
    for channel in ("train", "validation")
})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-10-22-04-06-51-151


2024-10-22 04:06:52 Starting - Starting the training job...
2024-10-22 04:07:25 Downloading - Downloading input data...
2024-10-22 04:07:35 Downloading - Downloading the training image...
2024-10-22 04:08:21 Training - Training image download completed. Training in progress....
2024-10-22 04:08:56 Uploading - Uploading generated training model.[34m[2024-10-22 04:08:51.130 ip-10-0-185-119.us-east-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-10-22 04:08:51.151 ip-10-0-185-119.us-east-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-10-22:04:08:51:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-10-22:04:08:51:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2024-10-22:04:08:51:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-10-22:04:08:51:INFO] Running XGBoost Sagemaker in 

# #3 - Hyperparameter Tuning

In [5]:
# Base training-job definition (low-level SageMaker API shape).
# tune_hyperparameters() deep-copies this dict and overrides several fields.
# NOTE: bucket_path already ends with `prefix`, so paths are built from
# bucket_path alone; the input channels point at the libsvm files uploaded
# earlier to <bucket_path>/data/.
training_parameters = {
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": f"{bucket_path}/xgboost"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.12xlarge", "VolumeSizeInGB": 5},
    "HyperParameters": {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "verbosity": "0",
        "objective": "multi:softmax",
        "num_class": "10",
        "num_round": "10",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 86400},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{bucket_path}/data/train.libsvm",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{bucket_path}/data/validation.libsvm",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None",
        },
    ],
}

def tune_hyperparameters(poll_interval_seconds=60):
    """Launch a SageMaker hyperparameter tuning job for XGBoost and print its results.

    Builds the tuning configuration (Bayesian search minimizing
    ``validation:merror``), derives the per-trial training definition from the
    module-level ``training_parameters``, starts the job through the
    module-level ``sm`` client, polls until the job is no longer running, and
    prints the best trial, its hyperparameters, and all completed trials.

    Args:
        poll_interval_seconds: Seconds to wait between status checks.

    Returns:
        None. Results are printed.
    """
    # Generate tuning job name
    tuning_job_name = f'xgboost-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'
    print("Tuning job name is:", tuning_job_name)

    # Define tuning configuration
    tuning_job_config = {
        "ParameterRanges": {
            "CategoricalParameterRanges": [],
            "ContinuousParameterRanges": [
                {"MaxValue": "0.5", "MinValue": "0.1", "Name": "eta"},
                {"MaxValue": "5", "MinValue": "0", "Name": "gamma"},
                {"MaxValue": "120", "MinValue": "0", "Name": "min_child_weight"},
                {"MaxValue": "1", "MinValue": "0.5", "Name": "subsample"},
                {"MaxValue": "2", "MinValue": "0", "Name": "alpha"},
            ],
            "IntegerParameterRanges": [
                {"MaxValue": "10", "MinValue": "0", "Name": "max_depth"},
                {"MaxValue": "50", "MinValue": "1", "Name": "num_round"},
            ],
        },
        "ResourceLimits": {
            "MaxNumberOfTrainingJobs": 6,
            "MaxParallelTrainingJobs": 2,
        },
        "Strategy": "Bayesian",
        "HyperParameterTuningJobObjective": {"MetricName": "validation:merror", "Type": "Minimize"},
    }

    # Work on a deep copy so the shared training_parameters dict is never mutated.
    training_job_definition = copy.deepcopy(training_parameters)
    # A tuning job takes StaticHyperParameters plus the tuned ranges instead of
    # a fixed HyperParameters map.
    training_job_definition.pop("HyperParameters", None)

    # Update the S3 output path and static hyperparameters
    training_job_definition["OutputDataConfig"]["S3OutputPath"] = f"{bucket_path}/tuning-jobs"
    training_job_definition["StaticHyperParameters"] = {
        "objective": "multi:softmax",
        "verbosity": "2",
        "num_class": "10",
    }

    # Run every trial on a single ml.m5.4xlarge instance
    training_job_definition["ResourceConfig"] = {
        "InstanceCount": 1,
        "InstanceType": "ml.m5.4xlarge",
        "VolumeSizeInGB": 5,
    }

    # Point both channels at the libsvm files uploaded under <bucket_path>/data/
    training_job_definition["InputDataConfig"] = [
        {
            "ChannelName": channel,
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{bucket_path}/data/{channel}.libsvm",
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        }
        for channel in ("train", "validation")
    ]

    # Start the hyperparameter tuning job
    print(f"Creating a tuning job with name: {tuning_job_name}.")
    sm.create_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuning_job_name,
        HyperParameterTuningJobConfig=tuning_job_config,
        TrainingJobDefinition=training_job_definition,
    )

    # Monitor the tuning job status. Loop only while the job is still running,
    # so every other state (Completed, Failed, Stopped, ...) ends the wait
    # instead of polling forever.
    tuning_result = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)
    status = tuning_result["HyperParameterTuningJobStatus"]
    print(status)
    while status in ("InProgress", "Stopping"):
        sleep(poll_interval_seconds)
        tuning_result = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)
        status = tuning_result["HyperParameterTuningJobStatus"]
        print(status)

    # Analyze tuning results once the job has ended
    if status == "Completed":
        print("\nTuning completed. Analyzing results...")
    else:
        reason = tuning_result.get("FailureReason", "no failure reason reported")
        print(f"\nTuning job ended with status {status}: {reason}")

    # BestTrainingJob is absent when no trial produced an objective metric.
    best_training_job = tuning_result.get('BestTrainingJob')
    if not best_training_job:
        print("No best training job is available; nothing to analyze.")
        return

    # Print results
    objective_metric = best_training_job['FinalHyperParameterTuningJobObjectiveMetric']
    print(f"Best Training Job Name: {best_training_job['TrainingJobName']}")
    print(f"Objective Metric: {objective_metric['MetricName']}")
    print(f"Best Training Job Objective Value: {objective_metric['Value']}")

    # Get and print the best hyperparameters
    print("\nBest Hyperparameters:")
    pprint(best_training_job['TunedHyperParameters'])

    # Get overall tuning job stats
    print("\nTuning Job Statistics:")
    pprint(tuning_result['HyperParameterTuningJobConfig'])

    # List all completed training jobs, best (lowest) objective value first
    response = sm.list_training_jobs_for_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuning_job_name,
        StatusEquals='Completed',
        SortBy='FinalObjectiveMetricValue',
        SortOrder='Ascending'
    )

    print("\nAll Completed Training Jobs:")
    for training_job in response['TrainingJobSummaries']:
        print(f"Training Job: {training_job['TrainingJobName']}, "
              f"Objective Metric: {training_job['FinalHyperParameterTuningJobObjectiveMetric']['Value']}")

# Execute the hyperparameter tuning job. This call blocks while it polls the
# job status until the job finishes, then prints the best trial and the list
# of completed trials.
tune_hyperparameters()

Tuning job name is: xgboost-2024-10-22-04-10-05
Creating a tuning job with name: xgboost-2024-10-22-04-10-05.
InProgress
InProgress
InProgress
InProgress
Completed

Tuning completed. Analyzing results...
Best Training Job Name: xgboost-2024-10-22-04-10-05-006-634b9805
Objective Metric: validation:merror
Best Training Job Objective Value: 0.09166999906301498

Best Hyperparameters:
{'alpha': '0.9709853413735118',
 'eta': '0.5',
 'gamma': '0.0',
 'max_depth': '10',
 'min_child_weight': '43.30049451720176',
 'num_round': '50',
 'subsample': '0.8942359973880503'}

Tuning Job Statistics:
{'HyperParameterTuningJobObjective': {'MetricName': 'validation:merror',
                                      'Type': 'Minimize'},
 'ParameterRanges': {'CategoricalParameterRanges': [],
                     'ContinuousParameterRanges': [{'MaxValue': '0.5',
                                                    'MinValue': '0.1',
                                                    'Name': 'eta',
               