In [4]:
import os
import io
import boto3
import pandas as pd
import sagemaker
from sagemaker.serializers import CSVSerializer
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

In [5]:
# Region of the default SageMaker session; used below to look up container images.
region = sagemaker.Session().boto_region_name
print(f"AWS Region: {region}")

# IAM execution role handed to the tuning job definition and the estimators.
role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

AWS Region: us-east-2
RoleArn: arn:aws:iam::257056996471:role/cb-sagemaker


In [28]:
# Search space, budget and objective for the hyperparameter tuning job.
# Range bounds are given as strings, exactly as they are sent in the request.
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {"MaxValue": upper, "MinValue": lower, "Name": name}
            for name, lower, upper in (
                ("eta", "0", "1"),
                ("lambda", "0", "1000"),
                ("alpha", "0", "1000"),
                ("min_child_weight", "1", "120"),
            )
        ],
        "IntegerParameterRanges": [
            {"MaxValue": "10", "MinValue": "1", "Name": "max_depth"},
        ],
    },
    # At most 20 training jobs in total, 3 of them running at the same time.
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 20,
        "MaxParallelTrainingJobs": 3,
    },
    "Strategy": "Bayesian",
    # The tuner minimizes the validation:mse metric.
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:mse",
        "Type": "Minimize",
    },
}

In [34]:
# Built-in XGBoost 1.2-1 container image for the current region.
training_image = sagemaker.image_uris.retrieve('xgboost', region, '1.2-1')

# S3 layout: inputs are read from one bucket, outputs are written to another,
# both under the same key prefix.
prefix = 'sagemaker/data/mom_stddos_dem'
inputs_bucket_name = 'cb-analytics-us-east-2-prd'
outputs_bucket_name = 'cb-analytics-exports-us-east-2-prd'

s3_input_train = f"s3://{inputs_bucket_name}/{prefix}/train.csv"
s3_input_validation = f"s3://{inputs_bucket_name}/{prefix}/val.csv"

# Job template passed as TrainingJobDefinition when the tuning job is created.
training_job_definition = {
    "AlgorithmSpecification": {
        "TrainingImage": training_image,
        "TrainingInputMode": "File",
    },
    # One uncompressed CSV channel each for training and validation data.
    "InputDataConfig": [
        {
            "ChannelName": channel_name,
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": channel_uri,
                }
            },
        }
        for channel_name, channel_uri in (
            ("train", s3_input_train),
            ("validation", s3_input_validation),
        )
    ],
    "OutputDataConfig": {
        "S3OutputPath": f"s3://{outputs_bucket_name}/{prefix}/output",
    },
    # NOTE(review): InstanceCount is 2 while both channels are FullyReplicated;
    # confirm that giving every instance the full dataset is intended.
    "ResourceConfig": {
        "InstanceCount": 2,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10,
    },
    "RoleArn": role,
    # Hyperparameters held fixed for every job; the tuned ones come from
    # the ParameterRanges of the tuning job configuration.
    "StaticHyperParameters": {
        "eval_metric": "mse",
        "num_round": "100",
        "objective": "reg:squarederror",
        "early_stopping_rounds": "10",
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 43200,
    },
}

In [36]:
# Submit the tuning job through the low-level SageMaker client. The API
# response is the cell's last expression, so it is echoed as the cell output.
tuning_job_name = "mom-stddos-dem-xgboost-2"
smclient = boto3.client("sagemaker")
smclient.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)



{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-2:257056996471:hyper-parameter-tuning-job/mom-stddos-dem-xgboost-2',
 'ResponseMetadata': {'RequestId': '00c59054-24d2-4ffa-8b5f-6c20c8007af7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '00c59054-24d2-4ffa-8b5f-6c20c8007af7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '125',
   'date': 'Fri, 25 Feb 2022 21:55:51 GMT'},
  'RetryAttempts': 0}}

# Build individual models
### Build Yearly DDOS service_types + Demographics XGBoost Model

In [39]:
# Settings for the yearly XGBoost model: the input bucket, and the S3 location
# (exports bucket / prefix / model name) where training output is written.
prefix = 'sagemaker'
model_name = 'xgboost_model_v7_yearly_st_tc_dem'
s3_model_output_location = f"s3://cb-analytics-exports-us-east-2-prd/{prefix}/{model_name}"
inputs_bucket_name = 'cb-analytics-us-east-2-prd'

In [40]:
# Look up the built-in XGBoost 1.2-1 image for this region and print its URI.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print(container)

# Estimator on one ml.m4.xlarge instance, writing its output under
# s3_model_output_location. The attached rule requests the XGBoost report.
xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    volume_size=10,
    output_path=s3_model_output_location,
    sagemaker_session=sagemaker.Session(),
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())],
)

257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1


In [41]:
# Fixed hyperparameters for this training run, passed as keyword arguments
# via dict unpacking.
xgb_model.set_hyperparameters(
    **{
        "max_depth": 3,
        "objective": "reg:squarederror",
        "num_round": 100,
        "early_stopping_rounds": 10,
        "alpha": 100,  # l1
        "eta": 0.3,  # step size
        # Disabled options, kept for reference:
        # "lambda": 1,  # l2
        # "gamma": 10,  # min gain for split
    }
)

In [42]:
# CSV input channels for the yearly model, read from the inputs bucket.
train_input = TrainingInput(
    f"s3://{inputs_bucket_name}/{prefix}/yearly_st_tc_dem/train.csv",
    content_type="csv",
)
validation_input = TrainingInput(
    f"s3://{inputs_bucket_name}/{prefix}/yearly_st_tc_dem/val.csv",
    content_type="csv",
)

In [43]:
# Start the training job with the train and validation channels defined above.
xgb_model.fit(
    {
        "train": train_input,
        "validation": validation_input,
    },
    wait=True,
)

2022-02-25 22:16:54 Starting - Starting the training job...
2022-02-25 22:17:20 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport-1645827414: InProgress
.........
2022-02-25 22:18:53 Downloading - Downloading input data......
2022-02-25 22:19:53 Training - Downloading the training image.....[34m[2022-02-25 22:20:36.136 ip-10-0-124-207.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m

In [None]:
rule_output_path = xgb_model.output_path + "/" + xgb_model.latest_training_job.name + "/rule-output"
! aws s3 ls {rule_output_path} --recursive

In [None]:
# Recursively copy everything under rule_output_path from S3 into the current directory.
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
from IPython.display import FileLink, FileLinks

# Show a caption followed by a link to the XGBoost report HTML file, addressed
# relative to the current directory.
display(
    "Click link below to view the XGBoost Training report",
    FileLink("CreateXgboostReport/xgboost_report.html"),
)

In [None]:
# Collect the configuration names of the rules attached to the latest training
# job whose name contains "Profiler".
profiler_rule_names = [
    rule["RuleConfigurationName"]
    for rule in xgb_model.latest_training_job.rule_job_summary()
    if "Profiler" in rule["RuleConfigurationName"]
]
if not profiler_rule_names:
    # Same exception type as the original `[...][0]`, but with a message that
    # says what is actually missing instead of "list index out of range".
    raise IndexError(
        "No rule with 'Profiler' in its RuleConfigurationName was found "
        "for the latest training job"
    )

# The first matching rule name is used as the relative directory that holds
# the profiler report. FileLink is expected to be imported by an earlier cell.
profiler_report_name = profiler_rule_names[0]
display("Click link below to view the profiler report", FileLink(profiler_report_name+"/profiler-output/profiler-report.html"))

In [44]:
# Echo the S3 URI of the trained model artifact (model.tar.gz) as the cell output.
xgb_model.model_data

's3://cb-analytics-exports-us-east-2-prd/sagemaker/xgboost_model_v7_yearly_st_tc_dem/sagemaker-xgboost-2022-02-25-22-16-54-592/output/model.tar.gz'

In [45]:
# Deploy the trained XGBoost model to one ml.t2.medium instance; the returned
# predictor serializes request payloads as CSV.
xgb_predictor = xgb_model.deploy(
    instance_type="ml.t2.medium",
    initial_instance_count=1,
    serializer=CSVSerializer(),
)

-----------!

In [46]:
# Echo the name of the endpoint behind xgb_predictor as the cell output.
xgb_predictor.endpoint_name

'sagemaker-xgboost-2022-02-25-22-26-44-365'

### Build mom DDOS service_types + Demographics HistGradientBoosted Model

In [39]:
# Settings for the HGBR model: the input bucket, and the S3 location
# (exports bucket / prefix / model name) intended for its training output.
prefix = 'sagemaker'
model_name = 'hgbr_model_v1_mom_stddos_dem'
s3_model_output_location = f"s3://cb-analytics-exports-us-east-2-prd/{prefix}/{model_name}"
inputs_bucket_name = 'cb-analytics-us-east-2-prd'

In [61]:
# Train the scikit-learn estimator; the training logic lives in train_deploy.py.
from sagemaker.sklearn.estimator import SKLearn

sklearn_estimator = SKLearn(
    entry_point='train_deploy.py',
    instance_type='ml.m4.xlarge',
    role=role,
    framework_version='0.23-1',
    # Write artifacts to the location computed for this model in the cell
    # above, as the XGBoost estimator does. Without output_path the value of
    # s3_model_output_location was never used.
    output_path=s3_model_output_location,
)

# Single "train" channel pointing at the S3 prefix that holds the training data.
sklearn_estimator.fit({'train': 's3://cb-analytics-us-east-2-prd/sagemaker/data/mom_stddos_dem/train/'})

2022-02-27 02:45:20 Starting - Starting the training job...
2022-02-27 02:45:45 Starting - Preparing the instances for trainingProfilerReport-1645929920: InProgress
.........
2022-02-27 02:47:04 Downloading - Downloading input data...
2022-02-27 02:47:45 Training - Downloading the training image......
2022-02-27 02:48:45 Training - Training image download completed. Training in progress.[34m2022-02-27 02:48:42,035 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-02-27 02:48:42,039 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-02-27 02:48:42,052 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-02-27 02:48:42,432 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-02-27 02:48:42,455 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-02-27 02:48:42,483 sage

In [63]:
# Host the trained scikit-learn estimator on one ml.m4.xlarge endpoint
# instance and keep the resulting Predictor.
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
)

# Example call (left disabled):
#   `data` is a NumPy array or a Python list; `response` is a NumPy array.
#   response = predictor.predict(data)

------!

In [68]:
# Echo the name of the endpoint behind the scikit-learn predictor as the cell output.
predictor.endpoint_name

'sagemaker-scikit-learn-2022-02-27-02-55-15-188'

In [65]:
# NOTE(review): leftover, disabled XGBoost statement in the scikit-learn section; delete this cell if it is no longer needed.
# xgb_model.model_data

In [66]:
# NOTE(review): leftover, disabled copy of the XGBoost deploy call; delete this cell if it is no longer needed.
# xgb_predictor=xgb_model.deploy(
#     initial_instance_count=1,
#     instance_type='ml.t2.medium',
#     serializer=CSVSerializer()
# )

In [67]:
# NOTE(review): leftover, disabled XGBoost statement in the scikit-learn section; delete this cell if it is no longer needed.
# xgb_predictor.endpoint_name