# 4. Model Training
*Note: If you are prompted to select a kernel, please select PyTorch 1.8 Python 3.6*

In this notebook we will train a model on our sample data set to detect failures.

You can select Run->Run All Cells from the menu to run all cells in Studio (or Cell->Run All in a SageMaker Notebook Instance).

This solution relies on a config file that records the outputs of the provisioned AWS resources (for example, the S3 bucket name and the training instance type). Run the cell below to generate that file.

In [2]:
import boto3
import os
import json

# Look up the Service Catalog provisioned product that deployed this solution.
# Its name is taken from the path component that follows 'S3Downloads' in the
# current working directory (cwd.index raises ValueError if it is absent).
client = boto3.client('servicecatalog')
cwd = os.getcwd().split('/')
i = cwd.index('S3Downloads')
pp_name = cwd[i + 1]
pp = client.describe_provisioned_product(Name=pp_name)
record_id = pp['ProvisionedProductDetail']['LastSuccessfulProvisioningRecordId']
record = client.describe_record(Id=record_id)

# Keep only the outputs that carry BOTH a key and a value.
# The previous condition, `'OutputKey' and 'OutputValue' in x`, only tested for
# 'OutputValue' (a non-empty string literal is always truthy), so an output
# without 'OutputKey' would have raised KeyError below.
outputs = [
    x for x in record['RecordOutputs']
    if 'OutputKey' in x and 'OutputValue' in x
]
keys = [x['OutputKey'] for x in outputs]
values = [x['OutputValue'] for x in outputs]
stack_output = dict(zip(keys, values))

# Persist the outputs as JSON so that later cells/notebooks can read them.
with open(f'/root/S3Downloads/{pp_name}/stack_outputs.json', 'w') as f:
    json.dump(stack_output, f)

In [3]:
import os
import json
import boto3
import sagemaker
import numpy as np

from sagemaker.pytorch import PyTorch
from sagemaker.s3 import S3Uploader
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

from source.config import Config
# Solution settings read from the YAML config; later cells use its dataset
# paths, target column and sensor headers.
config = Config(filename="config/config.yaml")

# Stack outputs stored as JSON in the working directory; later cells read the
# S3 bucket name and the training instance type from this dict.
with open("stack_outputs.json") as outputs_file:
    sagemaker_configs = json.load(outputs_file)

## Training Configuration

In [4]:
sage_session = sagemaker.session.Session()
s3_bucket = sagemaker_configs["S3Bucket"]  
s3_output_path = 's3://{}/'.format(s3_bucket)
print("S3 bucket path: {}".format(s3_output_path))

# run in local_mode on this machine, or as a SageMaker TrainingJob
local_mode = False

if local_mode:
    instance_type = 'local'
else:
    instance_type = sagemaker_configs["SageMakerTrainingInstanceType"]
    
role = sagemaker.get_execution_role()
print("Using IAM role arn: {}".format(role))
# only run from SageMaker notebook instance
if local_mode:
    !/bin/bash ./setup.sh
cpu_or_gpu = 'gpu' if instance_type.startswith('ml.p') else 'cpu'

S3 bucket path: s3://sagemaker-soln-fpm-js-k96rp4-sagemaker-soln-fpm/
Using IAM role arn: arn:aws:iam::777719004897:role/service-role/AmazonSageMaker-ExecutionRole-20220927T143324


In [5]:
# Common prefix used as the base name for the training and tuning jobs below.
job_name_prefix = "sagemaker-soln-fpm"

In [6]:
# Pattern for a single logged number: optional sign, optional decimal point and
# optional exponent. The outer group is the one that captures the whole value.
_number_pattern = '([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'

# (metric name, label that precedes the value in the log line "<label>: <value>")
_metric_labels = [
    ('Epoch', 'Epoch'),
    ('train_loss', 'Train loss'),
    ('train_acc', 'Train acc'),
    ('train_auc', 'Train auc'),
    ('test_loss', 'Test loss'),
    ('test_acc', 'Test acc'),
    ('test_auc', 'Test auc'),
]

# One {'Name', 'Regex'} entry per metric; passed to both the estimator and the
# hyperparameter tuner further down.
metric_definitions = [
    {'Name': metric_name, 'Regex': '{}: {}'.format(log_label, _number_pattern)}
    for metric_name, log_label in _metric_labels
]

## Define your data

In [7]:
# Show which local training file (from the solution config) will be used.
dataset_message = "Using dataset {}".format(config.train_dataset_fn)
print(dataset_message)

Using dataset data/processed/train_dataset.csv


In [8]:

# Both dataset files are uploaded under the same key prefix of the solution
# bucket; S3Uploader.upload returns the value stored as each dataset's location.
key_prefix = 'fpm-data'
s3_data_prefix = 's3://{}/{}'.format(s3_bucket, key_prefix)

training_data = S3Uploader.upload(config.train_dataset_fn, s3_data_prefix)
testing_data = S3Uploader.upload(config.test_dataset_fn, s3_data_prefix)

print("Training data: {}".format(training_data))
print("Testing data: {}".format(testing_data))

Training data: s3://sagemaker-soln-fpm-js-k96rp4-sagemaker-soln-fpm/fpm-data/train_dataset.csv
Testing data: s3://sagemaker-soln-fpm-js-k96rp4-sagemaker-soln-fpm/fpm-data/test_dataset.csv


## Hyperparameter Optimization
We will use SageMaker Hyperparameter Tuning to choose the best hyperparameters for the data set.

In [9]:
# Tuning budget handed to the HyperparameterTuner below: total number of
# training jobs, and how many of them may run at the same time.
max_jobs, max_parallel_jobs = 4, 2

In [10]:
# Candidate layer layouts; each choice is passed as a string holding a
# bracketed list of integers.
fc_hidden_unit_choices = ["[256, 128]", "[256, 128, 128]", "[256, 256, 128]", "[256, 128, 64]"]
conv_channel_choices = ["[2, 8, 2]", "[2, 16, 2]", "[2, 16, 16, 2]"]

# Search space explored by the hyperparameter tuning job.
hyperparameter_ranges = {
    'lr': ContinuousParameter(1e-5, 1e-2),
    'batch_size': IntegerParameter(100, 256),
    'dropout': ContinuousParameter(0.0, 0.8),
    'fc_hidden_units': CategoricalParameter(fc_hidden_unit_choices),
    'conv_channels': CategoricalParameter(conv_channel_choices),
}

In [11]:
%%time

# Hyperparameters that stay fixed for every training job (the tuned ones are
# defined in hyperparameter_ranges).
fixed_hyperparameters = {
    'epoch': 100,  # tune it according to your need
    'target_column': config.target_column,
    'sensor_headers': json.dumps(config.sensor_headers),
    'train_input_filename': os.path.basename(config.train_dataset_fn),
    'test_input_filename': os.path.basename(config.test_dataset_fn),
}

# PyTorch estimator running source/train.py, with source/dl_utils shipped as
# an extra dependency and output written to the solution bucket.
estimator = PyTorch(
    entry_point="train.py",
    source_dir='source',
    role=role,
    dependencies=["source/dl_utils"],
    instance_type=instance_type,
    instance_count=1,
    output_path=s3_output_path,
    framework_version="1.5.0",
    py_version='py3',
    base_job_name=job_name_prefix,
    metric_definitions=metric_definitions,
    hyperparameters=fixed_hyperparameters,
)

# In local mode, train once right away with the fixed hyperparameters.
if local_mode:
    estimator.fit({'train': training_data, 'test': testing_data})

CPU times: user 18.1 ms, sys: 305 µs, total: 18.4 ms
Wall time: 19.6 ms


In [14]:
%%time

# Input channels handed to the tuning job: the uploaded train and test datasets.
tuning_inputs = {'train': training_data, 'test': testing_data}

# Tuner that searches hyperparameter_ranges to maximize the 'test_auc' metric,
# within the max_jobs / max_parallel_jobs budget.
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name='test_auc',
    objective_type='Maximize',
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
    base_tuning_job_name=job_name_prefix,
)
tuner.fit(tuning_inputs)

................................................................................................!
CPU times: user 454 ms, sys: 78.4 ms, total: 533 ms
Wall time: 8min 5s


### Please note that the hyperparameter tuning job takes 3 to 4 hours to complete.

## Next Stage
Up next we will analyze the results. [Click here to continue](./5_results_analysis.ipynb).