In [1]:
%%bash
#Download the sample data
apt-get install -y unzip

wget https://s3-us-west-2.amazonaws.com/sagemaker-e2e-solutions/fraud-detection/creditcardfraud.zip
unzip creditcardfraud.zip


Reading package lists...
Building dependency tree...
Reading state information...
unzip is already the newest version (6.0-23+deb10u1).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.
Archive:  creditcardfraud.zip


--2020-08-17 03:51:12--  https://s3-us-west-2.amazonaws.com/sagemaker-e2e-solutions/fraud-detection/creditcardfraud.zip
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.229.104
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.229.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69155632 (66M) [application/zip]
Saving to: ‘creditcardfraud.zip.2’

     0K .......... .......... .......... .......... ..........  0%  823K 82s
    50K .......... .......... .......... .......... ..........  0%  691K 90s
   100K .......... .......... .......... .......... ..........  0%  704K 92s
   150K .......... .......... .......... .......... ..........  0% 56.1M 69s
   200K .......... .......... .......... .......... ..........  0% 35.9M 56s
   250K .......... .......... .......... .......... ..........  0%  717K 62s
   300K .......... .......... .......... .......... ..........  0% 89.1M 53s
   350K .......... ..........

CalledProcessError: Command 'b'#Download the sample data\napt-get install -y unzip\n\nwget https://s3-us-west-2.amazonaws.com/sagemaker-e2e-solutions/fraud-detection/creditcardfraud.zip\nunzip creditcardfraud.zip\n'' returned non-zero exit status 1.

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# create training and testing data set

In [5]:
train_data = df.sample(frac=0.8,random_state=200)
test_data = df.drop(train_data.index)
test_data_no_target = test_data.drop(columns=['Class'])

In [6]:
print('total:{} training:{} testing:{}'.format(df.shape[0], train_data.shape[0], test_data.shape[0]))

total:284807 training:227846 testing:56961


# Upload data to s3

In [9]:
import sagemaker
import boto3
from sagemaker import get_execution_role

region = boto3.Session().region_name

session = sagemaker.Session()
bucket = session.default_bucket()
# bucket = 'beyoung-fraud-detection-20200309'
prefix = 'sagemaker/autopilot-fraud-detection'

role = get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)

In [10]:
train_data.to_csv('automl-train.csv', index=False, header=True) # Make sure features are comma-separated

In [11]:
train_data_url = sess.upload_data(path='automl-train.csv', bucket=bucket, key_prefix=prefix + '/input')
train_data_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-fraud-detection/input/automl-train.csv'

In [12]:
test_data.to_csv('test_data.csv', index=False, header=True)
test_data_no_target.to_csv('test_data_no_target.csv', index=False, header=True)

In [13]:
test_data_no_target_url = sess.upload_data(path='test_data_no_target.csv', bucket=bucket, key_prefix=prefix + '/test')
test_data_no_target_url

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-fraud-detection/test/test_data_no_target.csv'

In [14]:
# # verfiy the automl-train.csv
# automl_train = pd.read_csv('automl-train.csv')
# automl_train.head()

!aws s3 ls {train_data_url}

2020-08-17 03:55:08  124692809 automl-train.csv


# Configure data location and artifacts

* Select target attribute "Class" to predict fraud or not fraud

In [15]:
input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/input'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'Class'
    }
  ]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
  }

input_data_config

[{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
    'S3Uri': 's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-fraud-detection/input'}},
  'TargetAttributeName': 'Class'}]

In [16]:
output_data_config

{'S3OutputPath': 's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-fraud-detection/output'}

In [17]:
automl_job_config = {
  "CompletionCriteria": {
    "MaxCandidates": 10,
    "MaxRuntimePerTrainingJobInSeconds": 10*60
  }
}
automl_job_config

{'CompletionCriteria': {'MaxCandidates': 10,
  'MaxRuntimePerTrainingJobInSeconds': 600}}

# Create SageMaker Auto-pilot job

In [18]:
from time import gmtime, strftime, sleep
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-fraud-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig=automl_job_config,
                      RoleArn=role)

AutoMLJobName: automl-fraud-17-03-56-10


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:230755935769:automl-job/automl-fraud-17-03-56-10',
 'ResponseMetadata': {'RequestId': '5b59f717-a5e0-425c-ad7a-8b4459d6b5ff',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5b59f717-a5e0-425c-ad7a-8b4459d6b5ff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '95',
   'date': 'Mon, 17 Aug 2020 03:56:11 GMT'},
  'RetryAttempts': 0}}

# Track Autopolit Job Status

In [20]:
print ('JobStatus - Secondary Status')
print('------------------------------')


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    job_secondary_status = describe_response['AutoMLJobSecondaryStatus']
    if job_secondary_status != "AnalyzingData":
        break
    print (job_run_status + " - " + job_secondary_status)
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData


# Result

In [22]:
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
print(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

{'CandidateName': 'tuning-job-1-ee1e3e3282ef4db68b-005-66845352', 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:f1', 'Value': 0.944350004196167}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:processing-job/db-1-aa232b16bd3348ce87a0f9eef69c96385d4eda5b20db4c7da03532da53', 'CandidateStepName': 'db-1-aa232b16bd3348ce87a0f9eef69c96385d4eda5b20db4c7da03532da53'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:training-job/automl-fra-dpp4-1-11f8fef86644497db4968a948a6b16bd47b25aebb3824', 'CandidateStepName': 'automl-fra-dpp4-1-11f8fef86644497db4968a948a6b16bd47b25aebb3824'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:230755935769:transform-job/automl-fra-dpp4-csv-1-f32e272c1d1a4247ac8271b295712c6dd1ef95007', 'CandidateStepName': 'a

# Create Model for best candidates

In [23]:
model_name = 'automl-fraud-model-' + timestamp_suffix

model = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))


Model ARN corresponding to the best candidate is : arn:aws:sagemaker:us-east-1:230755935769:model/automl-fraud-model-17-03-56-10


# Use Transformation Job to test the model

In [24]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
transform_job_name = 'automl-fraud-transform-' + timestamp_suffix

transform_input = {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': test_data_no_target_url
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    }

transform_output = {
        'S3OutputPath': 's3://{}/{}/inference-results'.format(bucket,prefix),
    }

transform_resources = {
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    }

sm.create_transform_job(TransformJobName = transform_job_name,
                        ModelName = model_name,
                        TransformInput = transform_input,
                        TransformOutput = transform_output,
                        TransformResources = transform_resources
)

{'TransformJobArn': 'arn:aws:sagemaker:us-east-1:230755935769:transform-job/automl-fraud-transform-17-05-35-13',
 'ResponseMetadata': {'RequestId': '66bccd78-673d-49cb-ac2f-c0f035edbb16',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '66bccd78-673d-49cb-ac2f-c0f035edbb16',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '111',
   'date': 'Mon, 17 Aug 2020 05:35:12 GMT'},
  'RetryAttempts': 0}}

# Get Batch Transform Job Status

In [25]:
print ('JobStatus')
print('----------')


describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
job_run_status = describe_response['TransformJobStatus']
print (job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_transform_job(TransformJobName = transform_job_name)
    job_run_status = describe_response['TransformJobStatus']
    print (job_run_status)
    sleep(30)

JobStatus
----------
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


# View Results of Transform Job

In [26]:
s3_output_path = sm.describe_transform_job(TransformJobName = transform_job_name)['TransformOutput']['S3OutputPath']
s3_output_key = s3_output_path + '/{}'.format('test_data_no_target.csv.out')
s3_output_key

's3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-fraud-detection/inference-results/test_data_no_target.csv.out'

In [27]:
local_inference_results_path = 'inference_results.csv'
!aws s3 cp {s3_output_key} {local_inference_results_path}

download: s3://sagemaker-us-east-1-230755935769/sagemaker/autopilot-fraud-detection/inference-results/test_data_no_target.csv.out to ./inference_results.csv


In [28]:
data = pd.read_csv(local_inference_results_path, sep=';')
pd.set_option('display.max_rows', 10)         # Keep the output on one page
data

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
56956,0
56957,0
56958,0
56959,0


In [29]:
test_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0
10,10.0,1.449044,-1.176339,0.91386,-1.375667,-1.971383,-0.629152,-1.423236,0.048456,-1.720408,...,-0.009302,0.313894,0.02774,0.500512,0.251367,-0.129478,0.04285,0.016253,7.8,0
13,11.0,1.069374,0.287722,0.828613,2.71252,-0.178398,0.337544,-0.096717,0.115982,-0.221083,...,-0.036876,0.074412,-0.071407,0.104744,0.548265,0.104094,0.021491,0.021293,27.5,0


In [30]:
df_test_data_label = test_data['Class']
test_labels = df_test_data_label.to_numpy()
test_labels.size

56961

In [31]:
df_preds = pd.read_csv(local_inference_results_path, sep=';')
test_preds = df_preds['1'].to_numpy()
test_preds.size

KeyError: '1'

In [32]:
# calculate true positives, false positives, true negatives, false negatives
tp = np.logical_and(test_labels, test_preds).sum()
fp = np.logical_and(1-test_labels, test_preds).sum()
tn = np.logical_and(1-test_labels, 1-test_preds).sum()
fn = np.logical_and(test_labels, 1-test_preds).sum()

# calculate binary classification metrics
recall = tp / (tp + fn)
precision = tp / (tp + fp)
accuracy = (tp + tn) / (tp + fp + tn + fn)
f1 = 2 * precision * recall / (precision + recall)

NameError: name 'test_preds' is not defined

In [34]:
print(pd.crosstab(test_labels, test_preds, rownames=['actuals'], colnames=['predictions']))
print("\n{:<11} {:.3f}".format('Recall:', recall))
print("{:<11} {:.3f}".format('Precision:', precision))
print("{:<11} {:.3f}".format('Accuracy:', accuracy))
print("{:<11} {:.3f}".format('F1:', f1))


predictions      0   1
actuals               
0            56861   2
1               21  77

Recall:     0.786
Precision:  0.975
Accuracy:   1.000
F1:         0.870


# View other candidates explored by SageMaker Autopilot

In [35]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, SortBy='FinalObjectiveMetricValue')['Candidates']
index = 1
for candidate in candidates:
  print (str(index) + "  " + candidate['CandidateName'] + "  " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))
  index += 1

1  tuning-job-1-78efa5a262f347c495-007-93ca3b18  0.9995830059051514
2  tuning-job-1-78efa5a262f347c495-008-f6e4eba8  0.9994950294494629
3  tuning-job-1-78efa5a262f347c495-002-0b25f833  0.999472975730896
4  tuning-job-1-78efa5a262f347c495-010-22a3388c  0.9994509816169739
5  tuning-job-1-78efa5a262f347c495-003-798a6498  0.9994289875030518
6  tuning-job-1-78efa5a262f347c495-009-ea376fd4  0.9994289875030518
7  tuning-job-1-78efa5a262f347c495-004-f7b24353  0.9994289875030518
8  tuning-job-1-78efa5a262f347c495-001-325b98d2  0.9993640184402466
9  tuning-job-1-78efa5a262f347c495-006-c55321ec  0.9991222023963928
10  tuning-job-1-78efa5a262f347c495-005-6135c1b0  0.0017559999832883477


# Candidate Generation Notebook

In [36]:
nb_data_explore_url = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['DataExplorationNotebookLocation']
nb_data_explore_url

's3://beyoung-app/sagemaker/autopilot-fraud-detection/output/automl-fraud-09-05-58-46/sagemaker-automl-candidates/pr-1-b45cfd3f099040d29c923c1edb8eb7cf0484aec7bc744481b9471a04bd/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb'

In [37]:
nb_training_url = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
nb_training_url

's3://beyoung-app/sagemaker/autopilot-fraud-detection/output/automl-fraud-09-05-58-46/sagemaker-automl-candidates/pr-1-b45cfd3f099040d29c923c1edb8eb7cf0484aec7bc744481b9471a04bd/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb'

In [38]:
!aws s3 cp {nb_data_explore_url} ./data_explore.ipynb
!aws s3 cp {nb_training_url} ./autopilot_training.ipynb

download: s3://beyoung-app/sagemaker/autopilot-fraud-detection/output/automl-fraud-09-05-58-46/sagemaker-automl-candidates/pr-1-b45cfd3f099040d29c923c1edb8eb7cf0484aec7bc744481b9471a04bd/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb to ./data_explore.ipynb
download: s3://beyoung-app/sagemaker/autopilot-fraud-detection/output/automl-fraud-09-05-58-46/sagemaker-automl-candidates/pr-1-b45cfd3f099040d29c923c1edb8eb7cf0484aec7bc744481b9471a04bd/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb to ./autopilot_training.ipynb
