## Autopilot Demo
This notebook is heavily inspired by [this notebook](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/autopilot/sagemaker_autopilot_direct_marketing.ipynb) from awslabs. Its purpose is to demonstrate the power of AutoML on AWS using **SageMaker Autopilot**.



In [1]:
# 1 Getting data
# Download the bank-additional archive from the UCI Machine Learning Repository.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
# Extract it; -o overwrites existing files without prompting, so the cell can be re-run.
!unzip -o bank-additional.zip
# Keep only the full dataset CSV by moving it into the working directory.
!mv ./bank-additional/bank-additional-full.csv ./bank-additional-full.csv
# Clean up: the downloaded archive, the __MACOSX folder the archive unpacks,
# and the rest of the extracted directory.
!rm bank-additional.zip
!rm -rf ./__MACOSX
!rm -rf ./bank-additional

# Relative path of the CSV kept by the `mv` above.
local_data_path = './bank-additional-full.csv'



--2019-12-23 13:25:04--  https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 444572 (434K) [application/x-httpd-php]
Saving to: ‘bank-additional.zip’


2019-12-23 13:25:06 (693 KB/s) - ‘bank-additional.zip’ saved [444572/444572]

Archive:  bank-additional.zip
   creating: bank-additional/
  inflating: bank-additional/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/bank-additional/
  inflating: __MACOSX/bank-additional/._.DS_Store  
  inflating: bank-additional/.Rhistory  
  inflating: bank-additional/bank-additional-full.csv  
  inflating: bank-additional/bank-additional-names.txt  
  inflating: bank-additional/bank-additional.csv  
  inflating: __MACOSX/._bank-additional  


In [2]:
# 2 AWS Imports and basic settings
import sagemaker
import boto3
from sagemaker import get_execution_role

# One boto3 session supplies both the region name and the low-level
# SageMaker client used for the Autopilot / model / transform API calls.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

# The SageMaker SDK session provides the default S3 bucket; every S3 key
# used by this notebook is placed under `prefix` inside that bucket.
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'sagemaker/autopilot-dm'

# Execution role passed to the AutoML job and to the created model.
role = get_execution_role()



In [3]:
import pandas as pd

# 3 Check out dataset
# Read the file through `local_data_path` (set in the download cell) instead
# of repeating the filename, so the CSV location is defined in one place.
# The file is semicolon-delimited, hence sep=';'.
data = pd.read_csv(local_data_path, sep=';')
data.head()



Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
# 4 Splitting test data
# Sample 80% of the rows for training; the fixed seed keeps the split reproducible.
train_data = data.sample(frac=0.8, random_state=42)

# Every row whose index label was not sampled above forms the hold-out set.
held_out_rows = ~data.index.isin(train_data.index)
test_data = data.loc[held_out_rows]

# Copy of the hold-out set without the target column 'y', used as inference input.
test_data_no_target = test_data.drop('y', axis='columns')



In [5]:
# 5 Upload to S3
# Training set: written with its header row (the AutoML input config names
# the target column 'y'), then uploaded under <prefix>/train.
train_file = 'train_data.csv'
train_data.to_csv(train_file, header=True, index=False)
train_key_prefix = prefix + "/train"
train_data_s3_path = session.upload_data(path=train_file, key_prefix=train_key_prefix)
print('Train data uploaded to: ' + train_data_s3_path)

# Test set: target column already removed, written without a header row,
# then uploaded under <prefix>/test.
test_file = 'test_data.csv'
test_data_no_target.to_csv(test_file, header=False, index=False)
test_key_prefix = prefix + "/test"
test_data_s3_path = session.upload_data(path=test_file, key_prefix=test_key_prefix)
print('Test data uploaded to: ' + test_data_s3_path)



Train data uploaded to: s3://sagemaker-eu-west-1-202382387695/sagemaker/autopilot-dm/train/train_data.csv
Test data uploaded to: s3://sagemaker-eu-west-1-202382387695/sagemaker/autopilot-dm/test/test_data.csv


In [6]:
# 6 Setting input/output of autopilot
# S3 locations: the training data prefix (input) and the prefix that
# receives the job's output.
train_s3_uri = 's3://{}/{}/train'.format(bucket,prefix)
output_s3_uri = 's3://{}/{}/output'.format(bucket,prefix)

# Single input channel: everything under the train prefix, with 'y' as the
# column to predict.
s3_train_source = {'S3DataType': 'S3Prefix', 'S3Uri': train_s3_uri}
input_data_config = [
    {
        'DataSource': {'S3DataSource': s3_train_source},
        'TargetAttributeName': 'y',
    },
]

output_data_config = {'S3OutputPath': output_s3_uri}



In [7]:
from time import gmtime, strftime, sleep

# 7 Running the AutoML job
# Suffix built from the current UTC day-of-month and time; it is reused by
# later cells for the model name and the transform job name.
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-banking-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

# Assemble the request first, then submit it; the API response is the
# cell's displayed value.
automl_job_request = {
    'AutoMLJobName': auto_ml_job_name,
    'InputDataConfig': input_data_config,
    'OutputDataConfig': output_data_config,
    'RoleArn': role,
}
sm.create_auto_ml_job(**automl_job_request)



AutoMLJobName: automl-banking-23-13-25-09


{'AutoMLJobArn': 'arn:aws:sagemaker:eu-west-1:202382387695:automl-job/automl-banking-23-13-25-09',
 'ResponseMetadata': {'RequestId': 'fd1721f3-c166-4d5a-966b-b8860dcec96c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fd1721f3-c166-4d5a-966b-b8860dcec96c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '97',
   'date': 'Mon, 23 Dec 2019 13:25:09 GMT'},
  'RetryAttempts': 0}}

In [8]:
# 8 Check AutoML status
# Poll the AutoML job once a minute until it reaches a terminal status.
# Each iteration describes the job exactly once, prints the status pair, and
# only sleeps when another poll is actually needed (no duplicate first
# request, no extra wait after the job has already finished).

print ('JobStatus - Secondary Status')
print('------------------------------')

terminal_statuses = ('Failed', 'Completed', 'Stopped')

while True:
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']

    print (job_run_status + " - " + describe_response['AutoMLJobSecondaryStatus'])

    if job_run_status in terminal_statuses:
        break
    sleep(60)

# Surface why the job did not complete, so the reader does not have to dig
# through the raw describe response before running the following cells.
if job_run_status != 'Completed':
    print('FailureReason: ' + str(describe_response.get('FailureReason', 'n/a')))



JobStatus - Secondary Status
------------------------------
InProgress - Starting
InProgress - Starting
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgres

In [9]:
# 9 Getting best model candidate
# Fetch the job description once and pull the best candidate out of it.
automl_job_description = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
best_candidate = automl_job_description['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

# Raw candidate record first, then a short summary of its name and
# objective metric.
print(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)

objective_metric = best_candidate['FinalAutoMLJobObjectiveMetric']
print("FinalAutoMLJobObjectiveMetricName: " + objective_metric['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(objective_metric['Value']))



{'CandidateName': 'tuning-job-1-b11494df53b345cf88-065-b6b73b39', 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:accuracy', 'Value': 0.9153519868850708}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:eu-west-1:202382387695:processing-job/db-1-f1ee390b02ad401c885b4602652a44e87ebca7e2b12f4d6d9078bd8016', 'CandidateStepName': 'db-1-f1ee390b02ad401c885b4602652a44e87ebca7e2b12f4d6d9078bd8016'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:eu-west-1:202382387695:training-job/automl-ban-dpp7-1-21b3727961ac4fea9df8c71e3011117e6feaba1daec84', 'CandidateStepName': 'automl-ban-dpp7-1-21b3727961ac4fea9df8c71e3011117e6feaba1daec84'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:eu-west-1:202382387695:transform-job/automl-ban-dpp7-rpb-1-f19bea1d6076401f83b4c4134cc647d8ace1db31e', 'CandidateStepNa

In [10]:
# 10 Check other model candidates
# Ask the API for the job's candidates ordered by final objective metric and
# print a numbered "rank  name  metric" line for each one returned.
candidates_response = sm.list_candidates_for_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    SortBy='FinalObjectiveMetricValue',
)
candidates = candidates_response['Candidates']

for index, candidate in enumerate(candidates, start=1):
    candidate_name = candidate['CandidateName']
    metric_value = candidate['FinalAutoMLJobObjectiveMetric']['Value']
    print (str(index) + "  " + candidate_name + "  " + str(metric_value))



1  tuning-job-1-b11494df53b345cf88-065-b6b73b39  0.9153519868850708
2  tuning-job-1-b11494df53b345cf88-056-c6f4d9e6  0.9150490164756775
3  tuning-job-1-b11494df53b345cf88-126-2b895428  0.9150490164756775
4  tuning-job-1-b11494df53b345cf88-197-a7a1458f  0.9148970246315002
5  tuning-job-1-b11494df53b345cf88-237-eae48a16  0.9148970246315002
6  tuning-job-1-b11494df53b345cf88-182-6e8ce427  0.9147449731826782
7  tuning-job-1-b11494df53b345cf88-155-cc5df8c8  0.9144420027732849
8  tuning-job-1-b11494df53b345cf88-046-1da0cd87  0.9144420027732849
9  tuning-job-1-b11494df53b345cf88-047-ab362bd4  0.9141380190849304
10  tuning-job-1-b11494df53b345cf88-232-a5d0fb3b  0.9141380190849304


In [11]:
# 11 Creating model endpoint
# Register a SageMaker model from the best candidate's inference containers.
# The name reuses the timestamp suffix generated for the AutoML job.
model_name = 'automl-banking-model-' + timestamp_suffix

create_model_request = {
    'Containers': best_candidate['InferenceContainers'],
    'ModelName': model_name,
    'ExecutionRoleArn': role,
}
model = sm.create_model(**create_model_request)

model_arn = model['ModelArn']
print('Model ARN corresponding to the best candidate is : {}'.format(model_arn))



Model ARN corresponding to the best candidate is : arn:aws:sagemaker:eu-west-1:202382387695:model/automl-banking-model-23-13-25-09


In [12]:
# 12 Creating a batch transform job on the test set
transform_job_name = 'automl-banking-transform-' + timestamp_suffix

# Input: the uploaded test CSV, uncompressed, split line by line.
test_s3_source = {'S3DataType': 'S3Prefix', 'S3Uri': test_data_s3_path}
transform_input = {
    'DataSource': {'S3DataSource': test_s3_source},
    'ContentType': 'text/csv',
    'CompressionType': 'None',
    'SplitType': 'Line',
}

# Output: predictions are written under <prefix>/inference-results.
transform_output = {'S3OutputPath': 's3://{}/{}/inference-results'.format(bucket,prefix)}

# Compute used by the transform job.
transform_resources = {'InstanceType': 'ml.m5.4xlarge', 'InstanceCount': 1}

# Submit the job against the model created from the best candidate; the API
# response is the cell's displayed value.
transform_job_request = {
    'TransformJobName': transform_job_name,
    'ModelName': model_name,
    'TransformInput': transform_input,
    'TransformOutput': transform_output,
    'TransformResources': transform_resources,
}
sm.create_transform_job(**transform_job_request)



{'TransformJobArn': 'arn:aws:sagemaker:eu-west-1:202382387695:transform-job/automl-banking-transform-23-13-25-09',
 'ResponseMetadata': {'RequestId': 'dc68a154-c7d7-48e2-9e7d-a6605ad88b61',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'dc68a154-c7d7-48e2-9e7d-a6605ad88b61',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '113',
   'date': 'Mon, 23 Dec 2019 15:23:26 GMT'},
  'RetryAttempts': 0}}

In [13]:
# 13 Wait for the batch transform job (inference on the test set) to finish
# The job itself was submitted in the previous cell; this cell only polls it.
# Each iteration describes the job exactly once, prints the status, and only
# sleeps when another poll is needed (no duplicate first request, no extra
# wait after the job has already reached a terminal status).
print ('JobStatus')
print('----------')

terminal_statuses = ('Failed', 'Completed', 'Stopped')

while True:
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print (job_run_status)

    if job_run_status in terminal_statuses:
        break
    sleep(30)

# Surface why the job did not complete instead of leaving only the bare status.
if job_run_status != 'Completed':
    print('FailureReason: ' + str(describe_response.get('FailureReason', 'n/a')))



JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [14]:
# 14 Data Exploration notebook
# S3 location of the data exploration notebook listed in the job's artifacts.
automl_job_artifacts = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']
automl_job_artifacts['DataExplorationNotebookLocation']



's3://sagemaker-eu-west-1-202382387695/sagemaker/autopilot-dm/output/automl-banking-23-13-25-09/sagemaker-automl-candidates/pr-1-0cf64502546e4669a2967b5780cc4a8e76e0473b8b71413983f86fc571/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb'

In [15]:
# 15 Model Candidate notebook
# S3 location of the candidate definition notebook listed in the job's artifacts.
automl_job_artifacts = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']
automl_job_artifacts['CandidateDefinitionNotebookLocation']



's3://sagemaker-eu-west-1-202382387695/sagemaker/autopilot-dm/output/automl-banking-23-13-25-09/sagemaker-automl-candidates/pr-1-0cf64502546e4669a2967b5780cc4a8e76e0473b8b71413983f86fc571/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb'