In [4]:
import io
import os

import boto3
import pandas as pd
import sagemaker
from IPython.display import FileLink, FileLinks, display
from sagemaker.debugger import Rule, rule_configs
from sagemaker.serializers import CSVSerializer
from sagemaker.session import TrainingInput

In [5]:
# Resolve the AWS region from a default SageMaker session; it is used further
# down to look up the region-specific XGBoost container image.
default_session = sagemaker.Session()
region = default_session.boto_region_name
print("AWS Region: {}".format(region))

# Execution role ARN that is handed to the estimator for the training job.
role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

AWS Region: us-east-2
RoleArn: arn:aws:iam::257056996471:role/cb-sagemaker


### Build Yearly DDOS service_types + Demographics

In [18]:
# S3 layout for this model run: objects live under `<bucket>/<prefix>/...`.
prefix = 'sagemaker'
model_name = 'xgboost_model_v7_yearly_stddos_dem'

# Bucket the training / validation CSV inputs are read from.
inputs_bucket_name = 'cb-analytics-us-east-2-prd'
# Bucket that receives the training output. Named here (instead of being an
# inline literal) so both buckets are configured in one place.
outputs_bucket_name = 'cb-analytics-exports-us-east-2-prd'

# Passed to the estimator as `output_path`; rule reports are looked up under it too.
s3_model_output_location = 's3://{}/{}/{}'.format(outputs_bucket_name, prefix, model_name)

In [19]:
# Look up the image URI of the built-in XGBoost algorithm (version "1.2-1")
# for the current region and show it for reference.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

# Collaborators are created up front so the Estimator call below reads as
# plain configuration.
training_session = sagemaker.Session()
# Built-in Debugger rule that produces the XGBoost training report.
xgboost_report_rules = [Rule.sagemaker(rule_configs.create_xgboost_report())]

xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=10,  # training volume size (GB per the SageMaker SDK docs)
    output_path=s3_model_output_location,
    sagemaker_session=training_session,
    rules=xgboost_report_rules,
)

257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1


In [20]:
# XGBoost hyperparameters, gathered in a dict and unpacked into the estimator.
# `lambda` (L2 regularisation) is intentionally not set here. Because `lambda`
# is a reserved word in Python it cannot be written as a literal keyword
# argument; it would have to be supplied through dict unpacking like this one.
xgb_hyperparameters = {
    "max_depth": 6,
    "objective": "reg:squarederror",
    "num_round": 100,
    "alpha": 0.5,  # L1 regularisation
    "eta": 0.3,    # step size
    "gamma": 10,   # minimum gain required for a split
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [25]:
def _csv_channel(key):
    """Return a CSV ``TrainingInput`` for ``key`` under the inputs bucket/prefix.

    The S3 URI is ``s3://<inputs_bucket_name>/<prefix>/<key>``.
    """
    s3_uri = "s3://{}/{}/{}".format(inputs_bucket_name, prefix, key)
    return TrainingInput(s3_uri, content_type="csv")


# NOTE(review): the data folder is `yearly_st_tc_dem` while `model_name` ends in
# `yearly_stddos_dem` — confirm this is the intended dataset for this model.
train_input = _csv_channel("yearly_st_tc_dem/train.csv")
validation_input = _csv_channel("yearly_st_tc_dem/val.csv")

In [26]:
# Start the training job with the train/validation channels. `wait=True`
# keeps the cell running (and streaming logs) until the job finishes.
data_channels = {"train": train_input, "validation": validation_input}
xgb_model.fit(data_channels, wait=True)

2022-02-25 21:15:17 Starting - Starting the training job...
2022-02-25 21:15:43 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport-1645823717: InProgress
.........
2022-02-25 21:17:17 Downloading - Downloading input data...
2022-02-25 21:17:42 Training - Downloading the training image......
2022-02-25 21:18:48 Training - Training image download completed. Training in progress...[34m[2022-02-25 21:18:51.607 ip-10-0-252-143.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delim

In [None]:
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive

In [None]:
# Copy everything under the rule-output prefix into the notebook's working
# directory so the report files can be opened through relative links.
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
# Show a clickable link to the XGBoost training report. The relative path
# points at the local copy of the rule output in the working directory.
# `display` and `FileLink` come from the notebook's top-level import cell.
display("Click link below to view the XGBoost Training report", FileLink("CreateXgboostReport/xgboost_report.html"))

In [None]:
# Pick the first rule of the latest training job whose configuration name
# contains "Profiler"; its name is the folder that holds the profiler report.
profiler_report_name = next(
    (
        rule["RuleConfigurationName"]
        for rule in xgb_model.latest_training_job.rule_job_summary()
        if "Profiler" in rule["RuleConfigurationName"]
    ),
    None,
)
if profiler_report_name is None:
    # Same exception type the original `[...][0]` lookup raised, but with a
    # message that says what is actually missing.
    raise IndexError(
        "No rule with 'Profiler' in its RuleConfigurationName was found "
        "for the latest training job"
    )

display("Click link below to view the profiler report", FileLink(profiler_report_name+"/profiler-output/profiler-report.html"))

In [None]:
# Display the estimator's `model_data` attribute (per the SageMaker SDK docs,
# the S3 location of the trained model artifact).
xgb_model.model_data

In [None]:

# Deploy the trained model behind a single-instance endpoint that accepts
# CSV-serialized requests.
# NOTE(review): no endpoint teardown is visible in this part of the notebook —
# confirm the endpoint is deleted elsewhere once it is no longer needed.
csv_serializer = CSVSerializer()
xgb_predictor = xgb_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=csv_serializer,
)

In [None]:
# Display the name of the endpoint created by the deploy step above.
xgb_predictor.endpoint_name