In [36]:
!apt-get install unzip
!wget http://dataminingconsultant.com/DKD2e_data_sets.zip
!unzip -o DKD2e_data_sets.zip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-23+deb10u1).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.
--2020-08-17 11:31:41--  http://dataminingconsultant.com/DKD2e_data_sets.zip
Resolving dataminingconsultant.com (dataminingconsultant.com)... 160.153.91.162
Connecting to dataminingconsultant.com (dataminingconsultant.com)|160.153.91.162|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1003616 (980K) [application/zip]
Saving to: ‘DKD2e_data_sets.zip.1’


2020-08-17 11:31:42 (2.57 MB/s) - ‘DKD2e_data_sets.zip.1’ saved [1003616/1003616]

Archive:  DKD2e_data_sets.zip
 extracting: Data sets/adult.zip     
  inflating: Data sets/cars.txt      
  inflating: Data sets/cars2.txt     
  inflating: Data sets/cereals.CSV   
  inflating: Data sets/churn.txt     
  inflating: Data sets/ClassifyRisk  
  inflating: Data sets/ClassifyRisk - Missing.txt  
 extracting:

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [37]:
df = pd.read_csv('Data sets/churn.txt')
df.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


# create training and testing data set

In [38]:
train_data = df.sample(frac=0.8,random_state=200)
test_data = df.drop(train_data.index)
test_data_no_target = test_data.drop(columns=['Churn?'])

In [39]:
print('total:{} training:{} testing:{}'.format(df.shape[0], train_data.shape[0], test_data.shape[0]))

total:3333 training:2666 testing:667


# Upload data to s3

In [40]:
import sagemaker
import boto3
from sagemaker import get_execution_role

region = boto3.Session().region_name

session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'sagemaker/autopilot-churn-prediction'

role = get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)

In [41]:
train_data.to_csv('automl-train.csv', index=False, header=True) # Make sure features are comma-separated

In [42]:
train_data_url = sess.upload_data(path='automl-train.csv', bucket=bucket, key_prefix=prefix + '/input')
train_data_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/input/automl-train.csv'

In [43]:
test_data.to_csv('test_data.csv', index=False, header=True)
test_data_no_target.to_csv('test_data_no_target.csv', index=False, header=True)

In [44]:
test_data_no_target_url = sess.upload_data(path='test_data_no_target.csv', bucket=bucket, key_prefix=prefix + '/test')
test_data_no_target_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/test/test_data_no_target.csv'

In [45]:
# # verfiy the automl-train.csv
# automl_train = pd.read_csv('automl-train.csv')
# automl_train.head()

!aws s3 ls {train_data_url}

2020-08-17 11:33:01     250571 automl-train.csv


# Configure data location and artifacts

* Select target attribute "Churn?" to predict fraud or not fraud

In [50]:
input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/input'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'Churn?'
    }
  ]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
  }

input_data_config

[{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
    'S3Uri': 's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/input'}},
  'TargetAttributeName': 'Churn?'}]

In [51]:
output_data_config

{'S3OutputPath': 's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/output'}

In [52]:
automl_job_config = {
  "CompletionCriteria": {
    "MaxCandidates": 10,
    "MaxRuntimePerTrainingJobInSeconds": 10*60
#       "MaxAutoMLJobRuntimeInSeconds": 60*120
  }
}
automl_job_config

{'CompletionCriteria': {'MaxCandidates': 10,
  'MaxRuntimePerTrainingJobInSeconds': 600}}

# Create SageMaker Auto-pilot job

In [53]:
from time import gmtime, strftime, sleep
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-churn-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig=automl_job_config,
                      RoleArn=role)

AutoMLJobName: automl-churn-17-11-37-46


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:230755935769:automl-job/automl-churn-17-11-37-46',
 'ResponseMetadata': {'RequestId': '7c5ab3f3-a762-4d45-8345-c5863e014af4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7c5ab3f3-a762-4d45-8345-c5863e014af4',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '95',
   'date': 'Mon, 17 Aug 2020 11:37:47 GMT'},
  'RetryAttempts': 0}}

# Track Autopolit Job Status

In [54]:
print ('JobStatus - Secondary Status')
print('------------------------------')


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    job_secondary_status = describe_response['AutoMLJobSecondaryStatus']
    print (job_run_status + " - " + job_secondary_status)
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineer

# Result

In [62]:
import pprint
import json
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']

best_candidate_name = best_candidate['CandidateName']
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

{   'CandidateName': 'tuning-job-1-e334837d50ab42d798-007-93c2ad64',
    'CandidateStatus': 'Completed',
    'CandidateSteps': [   {   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:processing-job/db-1-8eb9a16e02354b01bdecd9c98978afd8539f87fad8a144f29ca1343fa5',
                              'CandidateStepName': 'db-1-8eb9a16e02354b01bdecd9c98978afd8539f87fad8a144f29ca1343fa5',
                              'CandidateStepType': 'AWS::SageMaker::ProcessingJob'},
                          {   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:training-job/automl-chu-dpp9-1-85a124cd0127498c9f9b8cddca7e5c4b8f389dbf1d7d4',
                              'CandidateStepName': 'automl-chu-dpp9-1-85a124cd0127498c9f9b8cddca7e5c4b8f389dbf1d7d4',
                              'CandidateStepType': 'AWS::SageMaker::TrainingJob'},
                          {   'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:transform-job/automl-chu-dpp9-rpb-1-60d6fa5088c3444dafc

# Create Model for best candidates

In [63]:
model_name = 'automl-churn-model-' + timestamp_suffix

model = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))


Model ARN corresponding to the best candidate is : arn:aws:sagemaker:us-east-1:230755935769:model/automl-churn-model-17-11-37-46


# Use Transformation Job to test the model

In [64]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
transform_job_name = 'automl-churn-transform-' + timestamp_suffix

transform_input = {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': test_data_no_target_url
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    }

transform_output = {
        'S3OutputPath': 's3://{}/{}/inference-results'.format(bucket,prefix),
    }

transform_resources = {
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    }

sm.create_transform_job(TransformJobName = transform_job_name,
                        ModelName = model_name,
                        TransformInput = transform_input,
                        TransformOutput = transform_output,
                        TransformResources = transform_resources
)

{'TransformJobArn': 'arn:aws:sagemaker:us-east-1:230755935769:transform-job/automl-churn-transform-17-13-00-10',
 'ResponseMetadata': {'RequestId': '31ad7623-55a6-4f9e-807a-36fcbe7d672c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '31ad7623-55a6-4f9e-807a-36fcbe7d672c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '111',
   'date': 'Mon, 17 Aug 2020 13:00:09 GMT'},
  'RetryAttempts': 0}}

# Get Batch Transform Job Status

In [65]:
print ('JobStatus')
print('----------')


describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
job_run_status = describe_response['TransformJobStatus']
print (job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print (job_run_status)
    sleep(30)

JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


# View Results of Transform Job

In [66]:
s3_output_path = sm.describe_transform_job(TransformJobName = transform_job_name)['TransformOutput']['S3OutputPath']
s3_output_key = s3_output_path + '/{}'.format('test_data_no_target.csv.out')
s3_output_key

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/inference-results/test_data_no_target.csv.out'

In [67]:
local_inference_results_path = 'inference_results.csv'
!aws s3 cp {s3_output_key} {local_inference_results_path}

download: s3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/inference-results/test_data_no_target.csv.out to ./inference_results.csv


In [68]:
data = pd.read_csv(local_inference_results_path, sep=';')
pd.set_option('display.max_rows', 10)         # Keep the output on one page
data

Unnamed: 0,True.
0,False.
1,False.
2,True.
3,False.
4,False.
...,...
662,False.
663,False.
664,False.
665,False.


In [69]:
test_data.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
17,VT,93,510,386-2923,no,no,0,190.7,114,32.42,...,111,18.55,129.6,121,5.83,8.1,3,2.19,3,False.
18,VA,76,510,356-2992,no,yes,33,189.7,66,32.25,...,65,18.09,165.7,108,7.46,10.0,5,2.7,1,False.
48,ID,119,415,398-1294,no,no,0,159.1,114,27.05,...,117,19.66,143.2,91,6.44,8.8,3,2.38,5,True.
67,MN,113,408,365-9011,yes,no,0,125.2,93,21.28,...,119,17.54,129.3,139,5.82,8.3,8,2.24,0,False.
73,NV,90,415,399-4246,no,no,0,203.4,146,34.58,...,117,19.27,152.4,105,6.86,7.3,4,1.97,1,False.


In [77]:
df_test_data_label = test_data['Churn?']
test_labels = df_test_data_label
test_labels.size

667

In [78]:
df_preds = pd.read_csv(local_inference_results_path, sep=';')
test_preds = df_preds['True.']
test_preds.size

667

In [87]:
accuracy = (test_labels.reset_index()['Churn?'] == test_preds[0]).sum() / len(test_labels)
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.8575712143928036


# View other candidates explored by SageMaker Autopilot

In [88]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, SortBy='FinalObjectiveMetricValue')['Candidates']
index = 1
for candidate in candidates:
  print (str(index) + "  " + candidate['CandidateName'] + "  " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))
  index += 1

1  tuning-job-1-e334837d50ab42d798-008-cfee48f7  0.8895999789237976
2  tuning-job-1-e334837d50ab42d798-007-93c2ad64  0.8895999789237976
3  tuning-job-1-e334837d50ab42d798-005-b1a8c6d9  0.857230007648468
4  tuning-job-1-e334837d50ab42d798-004-bf9bf7ee  0.857230007648468
5  tuning-job-1-e334837d50ab42d798-003-82bf7cb1  0.857230007648468
6  tuning-job-1-e334837d50ab42d798-006-7ecb3ff1  0.857230007648468
7  tuning-job-1-e334837d50ab42d798-002-fd1912a8  0.8151900172233582
8  tuning-job-1-e334837d50ab42d798-009-6117571f  0.6704800128936768
9  tuning-job-1-e334837d50ab42d798-010-33f57275  0.6704800128936768
10  tuning-job-1-e334837d50ab42d798-001-09e5eb5d  0.24852071702480316


# Candidate Generation Notebook

In [89]:
nb_data_explore_url = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['DataExplorationNotebookLocation']
nb_data_explore_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/output/automl-churn-17-11-37-46/sagemaker-automl-candidates/pr-1-014ae6baabfa4286abc61b57b638a70184a6077b8a9d4e3aa17cc97a15/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb'

In [90]:
nb_training_url = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
nb_training_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/output/automl-churn-17-11-37-46/sagemaker-automl-candidates/pr-1-014ae6baabfa4286abc61b57b638a70184a6077b8a9d4e3aa17cc97a15/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb'

In [91]:
!aws s3 cp {nb_data_explore_url} ./data_explore.ipynb
!aws s3 cp {nb_training_url} ./autopilot_training.ipynb

download: s3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/output/automl-churn-17-11-37-46/sagemaker-automl-candidates/pr-1-014ae6baabfa4286abc61b57b638a70184a6077b8a9d4e3aa17cc97a15/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb to ./data_explore.ipynb
download: s3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-churn-prediction/output/automl-churn-17-11-37-46/sagemaker-automl-candidates/pr-1-014ae6baabfa4286abc61b57b638a70184a6077b8a9d4e3aa17cc97a15/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb to ./autopilot_training.ipynb
