In [1]:
# Run the solution's setup script first; the cell output shows it installing the
# sagemaker-graph-fraud-detection package in development mode, which the import
# below relies on.
!bash setup.sh

import sagemaker
from sagemaker_graph_fraud_detection import config, container_build

# Role from the solution config; passed as `role=` to both the processing job
# and the training estimator later in this notebook.
role = config.role
# SageMaker session; passed as `sagemaker_session=` to the MXNet estimator below.
sess = sagemaker.Session()

Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///home/sagemaker-user/S3Downloads/jumpstart-prod-fdfn_kkyxi65e/sagemaker_graph_fraud_detection
Installing collected packages: sagemaker-graph-fraud-detection
  Running setup.py develop for sagemaker-graph-fraud-detection
Successfully installed sagemaker-graph-fraud-detection


## Data Preprocessing and Feature Engineering

In [2]:
# Replace with an S3 location or local path to point to your own dataset
raw_data_location = 's3://{}/{}/data'.format(config.solution_upstream_bucket, config.solution_name)

session_prefix = 'dgl-fraud-detection'
input_data = 's3://{}/{}/{}'.format(config.solution_bucket, session_prefix, config.s3_data_prefix)

!aws s3 cp --recursive $raw_data_location $input_data

# Set S3 locations to store processed data for training and post-training results and artifacts respectively
train_data = 's3://{}/{}/{}'.format(config.solution_bucket, session_prefix, config.s3_processing_output)
train_output = 's3://{}/{}/{}'.format(config.solution_bucket, session_prefix, config.s3_train_output)

copy: s3://sagemaker-solutions-prod-us-east-1/Fraud-detection-in-financial-networks/data/transaction.csv to s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/raw-data/transaction.csv
copy: s3://sagemaker-solutions-prod-us-east-1/Fraud-detection-in-financial-networks/data/identity.csv to s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/raw-data/identity.csv


### Build container for Preprocessing and Feature Engineering


In [3]:
# Print the preprocessing container's Dockerfile with syntax highlighting
!pygmentize data-preprocessing/container/Dockerfile

[34mFROM[39;49;00m [33mpython:3.7-slim-buster[39;49;00m

[34mRUN[39;49;00m pip3 install [31mpandas[39;49;00m==[34m0[39;49;00m.24.2
[34mENV[39;49;00m [31mPYTHONUNBUFFERED[39;49;00m=TRUE

[34mENTRYPOINT[39;49;00m [[33m"python3"[39;49;00m]


In [4]:
# Values from the solution config that identify where the image is pushed
region = config.region_name
account_id = config.aws_account
ecr_repository = config.ecr_repository

# Build the preprocessing image: either run the build-and-push script from this
# notebook ("local"), or start a build in the project named by
# config.container_build_project (a CodeBuild project, per the cell output).
if config.container_build_project == "local":
    !cd  data-preprocessing && bash container/build_and_push.sh $ecr_repository $region $account_id
else:
    container_build.build(config.container_build_project)
# ECR image URI used by the processing job below.
# NOTE(review): assumes both build paths push the image with the `latest` tag - confirm.
ecr_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account_id, region, ecr_repository)

Starting a build job for CodeBuild project: sagemaker-soln-gfd-js-kkyxi65e-processing-job-container-build
...[Container] 2021/02/10 04:27:59 Waiting for agent ping

[Container] 2021/02/10 04:28:01 Waiting for DOWNLOAD_SOURCE
[Container] 2021/02/10 04:28:02 Phase is DOWNLOAD_SOURCE
[Container] 2021/02/10 04:28:02 CODEBUILD_SRC_DIR=/codebuild/output/src186717467/src
[Container] 2021/02/10 04:28:02 YAML location is /codebuild/output/src186717467/src/buildspec.yml
[Container] 2021/02/10 04:28:02 Processing environment variables
[Container] 2021/02/10 04:28:02 No runtime version selected in buildspec.
[Container] 2021/02/10 04:28:02 Moving to directory /codebuild/output/src186717467/src
[Container] 2021/02/10 04:28:02 Registering with agent
[Container] 2021/02/10 04:28:02 Phases found in YAML: 2
[Container] 2021/02/10 04:28:02  BUILD: 3 commands
[Container] 2021/02/10 04:28:02  POST_BUILD: 1 commands
[Container] 2021/02/10 04:28:02 Phase complete: DOWNLOAD_SOURCE State: SUCCEEDED
[Container

### Run Preprocessing job with Amazon SageMaker Processing


In [5]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

# Processor that runs a python3 script inside the image built above,
# on a single ml.m5.xlarge instance.
script_processor = ScriptProcessor(
    command=['python3'],
    image_uri=ecr_repository_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
)

# Raw data is made available at the container's input path; whatever the script
# writes to the container's output path is stored at `train_data`.
raw_input = ProcessingInput(source=input_data,
                            destination='/opt/ml/processing/input')
processed_output = ProcessingOutput(source='/opt/ml/processing/output',
                                    destination=train_data)

# Command-line arguments forwarded to graph_data_preprocessor.py
preprocessor_args = ['--id-cols', 'card_no,card_type,email_domain',
                     '--cat-cols', 'ProductCD']

script_processor.run(
    code='data-preprocessing/graph_data_preprocessor.py',
    inputs=[raw_input],
    outputs=[processed_output],
    arguments=preprocessor_args,
)

Parameter 'session' will be renamed to 'sagemaker_session' in SageMaker Python SDK v2.



Job Name:  sagemaker-soln-graph-fraud-preprocessin-2021-02-10-04-29-26-038
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/raw-data', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-243108169697/sagemaker-soln-graph-fraud-preprocessin-2021-02-10-04-29-26-038/input/code/graph_data_preprocessor.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'S3Output': {'S3Uri': 's3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
......................

### View Results of Data Preprocessing


In [6]:
from os import path
from sagemaker.s3 import S3Downloader

# Collect the S3 URIs of everything stored under the processed-data location
processed_files = S3Downloader.list(train_data)

# Show one URI per line beneath a header
print("===== Processed Files =====")
print(*processed_files, sep='\n')

# To also fetch the processed data locally, uncomment the next line:
# S3Downloader.download(train_data, train_data.split("/")[-1])

===== Processed Files =====
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data/features.csv
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data/relation_DeviceID_edgelist.csv
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data/relation_IpAddress_edgelist.csv
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data/relation_PhoneNo_edgelist.csv
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data/relation_TransactionID_edgelist.csv
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data/relation_card_no_edgelist.csv
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocessed-data/relation_card_type_edgelist.csv
s3://sagemaker-soln-gfd-js-kkyxi65e-243108169697-us-east-1/dgl-fraud-detection/preprocess

## Train Graph Neural Network with DGL



### Hyperparameters


The following hyperparameters can be tuned and adjusted to improve model performance
* **batch-size** is the number of nodes that are used to compute a single forward pass of the GNN

* **embedding-size** is the size of the embedding dimension for non target nodes
* **n-neighbors** is the number of neighbours to sample for each target node during graph sampling for mini-batch training
* **n-layers** is the number of GNN layers in the model
* **n-epochs** is the number of training epochs for the model training job
* **optimizer** is the optimization algorithm used for gradient-based parameter updates
* **lr** is the learning rate for parameter updates


In [7]:
# File names (last path component) of the processed objects that are edgelists.
# The filter is applied to the file name, not the full S3 URI, so a bucket name
# or key prefix that happens to contain "relation" cannot cause features/labels
# files to be reported as edgelists. Matching on the "relation" prefix mirrors
# the 'relation*' value passed as the `edges` hyperparameter below.
# NOTE(review): assumes the training script treats 'relation*' as a file-name
# glob - confirm against the entry point.
edge_files = [name
              for name in (uri.split("/")[-1] for uri in processed_files)
              if name.startswith("relation")]

# Comma-separated form of the edgelist file names
edges = ",".join(edge_files)

# Hyperparameters forwarded to the training job; the tunable ones are described
# in the markdown cell above.
params = {'nodes' : 'features.csv',
          'edges': 'relation*',
          'labels': 'tags.csv',
          'model': 'rgcn',
          'num-gpus': 1,
          'batch-size': 10000,
          'embedding-size': 64,
          'n-neighbors': 1000,
          'n-layers': 2,
          'n-epochs': 10,
          'optimizer': 'adam',
          'lr': 1e-2
        }

print("Graph will be constructed using the following edgelists:\n{}".format('\n'.join(edge_files)))

Graph will be constructed using the following edgelists:
relation_DeviceID_edgelist.csv
relation_IpAddress_edgelist.csv
relation_PhoneNo_edgelist.csv
relation_TransactionID_edgelist.csv
relation_card_no_edgelist.csv
relation_card_type_edgelist.csv
relation_email_domain_edgelist.csv


### Create and Fit SageMaker Estimator


In [8]:
from sagemaker.mxnet import MXNet
from time import strftime, gmtime

# Estimator configuration: the DGL training entry point runs on one
# ml.g4dn.xlarge instance with MXNet 1.6.0 / Python 3, and both the uploaded
# code and the training output are placed under `train_output`.
estimator_settings = dict(
    entry_point='train_dgl_mxnet_entry_point.py',
    source_dir='sagemaker_graph_fraud_detection/dgl_fraud_detection',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.g4dn.xlarge',
    framework_version="1.6.0",
    py_version='py3',
    hyperparameters=params,
    output_path=train_output,
    code_location=train_output,
    sagemaker_session=sess,
)
estimator = MXNet(**estimator_settings)

# Job name is the solution prefix followed by the current UTC timestamp
launch_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
training_job_name = "{}-{}".format(config.solution_prefix, launch_time)

# Train on the processed data, exposed to the job as the 'train' channel
estimator.fit({'train': train_data}, job_name=training_job_name)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2021-02-10 04:34:23 Starting - Starting the training job...
2021-02-10 04:34:27 Starting - Launching requested ML instances......
2021-02-10 04:35:40 Starting - Preparing the instances for training......
2021-02-10 04:36:26 Downloading - Downloading input data......
2021-02-10 04:37:39 Training - Downloading the training image............
2021-02-10 04:39:41 Training - Training image download completed. Training in progress.[34m2021-02-10 04:39:42,355 sagemaker-training-toolkit INFO     Imported framework sagemaker_mxnet_container.training[0m
[34m2021-02-10 04:39:42,376 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_HOSTS': '["algo-1"]', 'SM_NETWORK_INTERFACE_NAME': 'eth0', 'SM_HPS': '{"batch-size":10000,"edges":"relation*","embedding-size":64,"labels":"tags.csv","lr":0.01,"model":"rgcn","n-epochs":10,"n-layers":2,"n-neighbors":1000,"nodes":"features.csv","num-gpus":1,"optimizer":"adam"}', 'SM_USER_ENTRY_POINT': 'train_dgl_mxnet_entry_point.py', 'SM_FRA