In [1]:
import sagemaker
from sagemaker.pytorch import PyTorch
import boto3
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sklearn import preprocessing

In [3]:
s3_client = boto3.client("s3")
sess = sagemaker.session.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
prefix = "gscaltex-data"

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [4]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name)

In [5]:
def create_trial(experiment_name):
    from time import strftime
    create_date = strftime("%m%d-%H%M%s")
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [6]:
s3 = boto3.client("s3")
s3.download_file(f"sagemaker-sample-files", "datasets/tabular/synthetic/churn.txt", "./data/churn.txt")

In [7]:
churn = pd.read_csv("./data/churn.txt")
pd.set_option("display.max_columns", 500)
churn.head(5)

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,PA,163,806,403-2562,no,yes,300,8.162204,3,7.579174,3.933035,4,6.508639,4.065759,100,5.111624,4.92816,6,5.673203,3,True.
1,SC,15,836,158-8416,yes,no,0,10.018993,4,4.226289,2.325005,0,9.972592,7.14104,200,6.436188,3.221748,6,2.559749,8,False.
2,MO,131,777,896-6253,no,yes,300,4.70849,3,4.76816,4.537466,3,4.566715,5.363235,100,5.142451,7.139023,2,6.254157,4,False.
3,WY,75,878,817-5729,yes,yes,700,1.268734,3,2.567642,2.528748,5,2.333624,3.773586,450,3.814413,2.245779,6,1.080692,6,False.
4,WY,146,878,450-4942,yes,no,0,2.696177,3,5.908916,6.015337,3,3.670408,3.751673,250,2.796812,6.905545,4,7.134343,6,True.


By modern standards, it’s a relatively small dataset, with only 5,000 records, where each record uses 21 attributes to describe the profile of a customer of an unknown US mobile operator. The attributes are:

`State`: the US state in which the customer resides, indicated by a two-letter abbreviation; for example, OH or NJ

`Account Length`: the number of days that this account has been active

`Area Code`: the three-digit area code of the corresponding customer’s phone number

`Phone`: the remaining seven-digit phone number

`Int’l Plan`: whether the customer has an international calling plan: yes/no

`VMail Plan`: whether the customer has a voice mail feature: yes/no

`VMail Message`: the average number of voice mail messages per month

`Day Mins`: the total number of calling minutes used during the day

`Day Calls`: the total number of calls placed during the day

`Day Charge`: the billed cost of daytime calls

`Eve Mins`, `Eve Calls`, `Eve Charge`: the billed cost for calls placed during the evening

`Night Mins`, `Night Calls`, `Night Charge`: the billed cost for calls placed during nighttime

`Intl Mins`, `Intl Calls`, `Intl Charge`: the billed cost for international calls

`CustServ Calls`: the number of calls placed to Customer Service

`Churn?`: whether the customer left the service: true/false

The last attribute, `Churn?`, is known as the target attribute: the attribute that we want the ML model to predict. Because the target attribute is binary, our model will be performing binary prediction, also known as binary classification.

Let’s begin exploring the data:

In [8]:
churn = churn.drop("Phone", axis=1)
churn["Area Code"] = churn["Area Code"].astype(object)

We convert the target attribute to binary value and move it to the first column of the dataset to meet requirements of SageMaker built-in tabular algorithms (For an example, see SageMaker LightGBM documentation).

In [9]:
churn["target"] = churn["Churn?"].map({"True.": 1, "False.": 0})
churn.drop(["Churn?"], axis=1, inplace=True)

In [10]:
churn = churn[["target"] + churn.columns.tolist()[:-1]]

In [11]:
cat_columns = [
    "State",
    "Account Length",
    "Area Code",
    "Phone",
    "Int'l Plan",
    "VMail Plan",
    "VMail Message",
    "Day Calls",
    "Eve Calls",
    "Night Calls",
    "Intl Calls",
    "CustServ Calls",
]

cat_idx = []
for idx, col_name in enumerate(churn.columns.tolist()):
    if col_name in cat_columns:
        cat_idx.append(idx)

In [12]:
with open("./data/cat_idx.json", "w") as outfile:
    json.dump({"cat_idx": cat_idx}, outfile)

In [13]:
for idx, col_name in enumerate(churn.columns.tolist()):
    if col_name in cat_columns:
        le = preprocessing.LabelEncoder()
        churn[col_name] = le.fit_transform(churn[col_name])

In [14]:
from sklearn.model_selection import train_test_split

train, val_n_test = train_test_split(
    churn, test_size=0.3, random_state=42, stratify=churn["target"]
)

In [15]:
val, test = train_test_split(
    val_n_test, test_size=0.3, random_state=42, stratify=val_n_test["target"]
)

In [16]:
train.to_csv("./data/train.csv", header=False, index=False)
val.to_csv("./data/validation.csv", header=False, index=False)
test.to_csv("./data/test.csv", header=False, index=False)

In [17]:
from tqdm import tqdm

for i in tqdm(range(200)):
    boto3.Session().resource("s3").Bucket(bucket).Object(
        os.path.join(prefix, f"train/data_{i}.csv")
    ).upload_file("./data/train.csv")
    
boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "validation/data.csv")
).upload_file("./data/validation.csv")

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "test/data.csv")
).upload_file("./data/test.csv")

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train/cat_idx.json")
).upload_file("./data/cat_idx.json")

  0%|          | 0/200 [00:00<?, ?it/s]INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  0%|          | 1/200 [00:00<01:03,  3.14it/s]INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  1%|          | 2/200 [00:00<00:58,  3.36it/s]INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  2%|▏         | 3/200 [00:00<01:07,  2.93it/s]INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  2%|▏         | 4/200 [00:01<01:01,  3.19it/s]INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  2%|▎         | 5/200 [00:01<00:57,  3.36it/s]INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  3%|▎         | 6/200 [00:01<00:54,  3.59it/s]INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  4%|▎        

In [18]:
training_dataset_s3_path = f"s3://{bucket}/{prefix}/train"
validation_dataset_s3_path = f"s3://{bucket}/{prefix}/validation"

output_prefix = "jumpstart-example-tabular-training"
s3_output_location = f"s3://{bucket}/{output_prefix}/output_lgb"

In [21]:
train_model_id, train_model_version, train_scope = "lightgbm-classification-model", "*", "training"

In [22]:
from sagemaker import hyperparameters

# Retrieve the default hyper-parameters for fine-tuning the model
hyperparameters = hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)

# [Optional] Override default hyperparameters with custom values
hyperparameters["num_boost_round"] = "200"


hyperparameters["metric"] = "auc"
hyperparameters["tree_learner"] = "voting"  # use AllReduce method for distributed training

del hyperparameters[
    "early_stopping_rounds"
]  # current distributed training with early stopping has some issues. See https://github.com/microsoft/SynapseML/issues/728#issuecomment-1221599961
# thus it is disabled for distributed training.
print(hyperparameters)

{'num_boost_round': '200', 'metric': 'auc', 'learning_rate': '0.009', 'num_leaves': '67', 'feature_fraction': '0.74', 'bagging_fraction': '0.53', 'bagging_freq': '5', 'max_depth': '11', 'min_data_in_leaf': '26', 'max_delta_step': '0.0', 'lambda_l1': '0.0', 'lambda_l2': '0.0', 'boosting': 'gbdt', 'min_gain_to_split': '0.0', 'scale_pos_weight': '1.0', 'tree_learner': 'voting', 'feature_fraction_bynode': '1.0', 'is_unbalance': 'False', 'max_bin': '255', 'num_threads': '0', 'verbosity': '1', 'use_dask': 'False'}


### [optional] Train with Automatic Model Tuning  


Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose. We will use a HyperparameterTuner object to interact with Amazon SageMaker hyperparameter tuning APIs.

* Note. In this notebook, we set AMT budget (total tuning jobs) as 10 for each of the tabular algorithm except AutoGluon-Tabular. For [AutoGluon-Tabular](https://arxiv.org/abs/2003.06505), it succeeds by ensembling multiple models and stacking them in multiple layers.  

In [23]:
# from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner

# use_amt = True

# hyperparameter_ranges_lgb = {
#     "learning_rate": ContinuousParameter(1e-4, 1, scaling_type="Logarithmic"),
#     "num_boost_round": IntegerParameter(2, 30),
#     "num_leaves": IntegerParameter(10, 50),
#     "feature_fraction": ContinuousParameter(0.1, 1),
#     "bagging_fraction": ContinuousParameter(0.1, 1),
#     "bagging_freq": IntegerParameter(1, 10),
#     "max_depth": IntegerParameter(5, 30),
#     "min_data_in_leaf": IntegerParameter(5, 50),
# }
use_amt = False

In [24]:
from sagemaker import image_uris, model_uris, script_uris

# Retrieve the pre-trained model tarball to further fine-tune
train_model_uri = model_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, model_scope=train_scope
)
train_model_uri

's3://jumpstart-cache-prod-us-west-2/lightgbm-training/train-lightgbm-classification-model.tar.gz'

In [25]:
experiment_name = 'caltex-poc-1'

training_instance_type='local'
# training_instance_type='"ml.m5.4xlarge"'

instance_count = 1
use_spot_instances = False
max_wait = None
max_run = 1*60*60

In [26]:
from pathlib import Path
if training_instance_type in ['local_gpu', 'local']:
    from sagemaker.local import LocalSession
    

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    # s3_data_path = f'file://{Path.cwd()}/data'

else:
    sagemaker_session = sagemaker.Session()


source_dir = f'{Path.cwd()}/3.train_code_dask'

In [27]:
# Pytorch Image is used to enable distributed GPU training
estimator = PyTorch(
    source_dir=source_dir,
    entry_point="transfer_learning.py",
    model_uri=train_model_uri,
    role=role,
    sagemaker_session=sagemaker_session,
    instance_count=1, 
    instance_type=training_instance_type,
    framework_version="1.12.1",
    volume_size=512,
    py_version="py38",
    disable_profiler=True,
    use_spot_instances=use_spot_instances,
    max_wait=max_wait,
    max_run=max_run,
)

In [28]:
create_experiment(experiment_name)
job_name = create_trial(experiment_name)

if use_amt:

    tuner = HyperparameterTuner(
        estimator,
        "auc",
        hyperparameter_ranges_lgb,
        [{"Name": "auc", "Regex": "auc: ([0-9\\.]+)"}],
        max_jobs=21,
        max_parallel_jobs=3,
        objective_type="Maximize",
    )

    tuner.fit(
        inputs={
            "train": training_dataset_s3_path,
            "validation": validation_dataset_s3_path,
        },
        logs=False,
        job_name=job_name,
        experiment_config={
          'TrialName': job_name,
          'TrialComponentDisplayName': job_name,
        },
    )
else:
    # Launch a SageMaker Training job by passing s3 path of the training data
    estimator.fit(
        inputs={
            "train": training_dataset_s3_path,
            "validation": validation_dataset_s3_path,
        },
        logs=False,
        job_name=job_name,
        experiment_config={
          'TrialName': job_name,
          'TrialComponentDisplayName': job_name,
        },
    )

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: caltex-poc-1-0411-05471681192046
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-okvzo:
    command: train
    container_name: bkevdosupr-algo-1-okvzo
    environment:
    - '[Masked]'
    - '[Masked]'
    

Creating bkevdosupr-algo-1-okvzo ... 
Creating bkevdosupr-algo-1-okvzo ... done
Attaching to bkevdosupr-algo-1-okvzo
[36mbkevdosupr-algo-1-okvzo |[0m 2023-04-11 05:47:48,641 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mbkevdosupr-algo-1-okvzo |[0m 2023-04-11 05:47:48,643 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mbkevdosupr-algo-1-okvzo |[0m 2023-04-11 05:47:48,645 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
[36mbkevdosupr-algo-1-okvzo |[0m 2023-04-11 05:47:48,655 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36mbkevdosupr-algo-1-okvzo |[0m 2023-04-11 05:47:48,658 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mbkevdosupr-algo-1-okvzo |[0m 2023-04-11 05:47:48,661 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mbkevdo

INFO:root:creating /tmp/tmp7ixqpxe5/artifacts/output/data
INFO:root:copying /tmp/tmp7ixqpxe5/algo-1-okvzo/output/success -> /tmp/tmp7ixqpxe5/artifacts/output
INFO:root:copying /tmp/tmp7ixqpxe5/model/model.pkl -> /tmp/tmp7ixqpxe5/artifacts/model
INFO:root:copying /tmp/tmp7ixqpxe5/model/__models_info__.json -> /tmp/tmp7ixqpxe5/artifacts/model


[36mbkevdosupr-algo-1-okvzo exited with code 0
[0mAborting on container exit...
===== Job Complete =====
.


In [None]:
estimator.logs()