In [None]:
#Save "SageMaker Prediction with Shap Values.py" as inference.py in the same folder as this notebook.
#The file name inference.py is mandatory and cannot be changed.

In [None]:
%time
#Notebook configuration: everything a reader may need to change lives in this cell.

#S3 locations of the trained model artifacts, one entry per fold (list position = fold number)
models_data = [
    's3://sagemaker-wc-class/model/fold_0/basemodel0-class-XGB-0-2020-10-29-22-41-41/output/model.tar.gz',
    's3://sagemaker-wc-class/model/fold_1/basemodel0-class-XGB-1-2020-10-29-22-41-54/output/model.tar.gz',
    's3://sagemaker-wc-class/model/fold_2/basemodel0-class-XGB-2-2020-10-29-22-42-07/output/model.tar.gz',
    's3://sagemaker-wc-class/model/fold_3/basemodel0-class-XGB-3-2020-10-29-22-42-19/output/model.tar.gz',
    's3://sagemaker-wc-class/model/fold_4/basemodel0-class-XGB-4-2020-10-29-22-42-31/output/model.tar.gz',
]
#where are training data located locally
DataDir = '/home/kate/Research/Property/Data/'
#Temp local dir to save file before moving to S3
TmpDir = '/home/kate/Research/Property/Notebooks/SageMaker/tmp/'
#dataset file name with features to predict
dataset_name = 'property_water_claims_non_cat_fs'
#model featureset; the order matters because the columns are sent to the model without a header
featureset = [
    'roofcd_encd',
    'sqft',
    'usagetype_encd',
    'yearbuilt',
    'water_risk_3_blk',
    'landlordind',
    'multipolicyind',
]
#column names for predicted values
prediction_column_cv = 'sm_basemodel0_class_xgb_cv'
#Column names given to the transform output: the fold prediction, one SHAP value per
#feature, then the expected value. The SHAP names are derived from featureset so the two
#lists cannot drift apart.
#NOTE(review): assumes inference.py emits the SHAP values in featureset order - confirm.
transform_output_with_shapvalues = (
    ['sm_basemodel0_class_xgb_fold']
    + ['%s_shap_value'%feature for feature in featureset]
    + ['expected_value']
)
#other columns to join from the original dataset
other_columns = [
    'modeldata_id',
    'cal_year',
]
#instance type to be created for transformation jobs.
instance_type = 'ml.c5.xlarge'
#Transformation jobs
#Script used as the model entry point
transformation_job_entry_point = 'inference.py'
#timeout (seconds) for waiting Shap Values; passed to the model server as SAGEMAKER_MODEL_SERVER_TIMEOUT
transformation_job_timeout = 3600
transformation_input_folder = 'input'
transformation_output_folder = 'output'
#Model is created for each fold; model_name is just a template, the fold number will be added
model_name = 'property-wcf-class-basemodel0'
#S3 bucket and folders to save files
bucket = 'sagemaker-wc-class'
#prefix of the per-fold output folders; the fold number is appended to it
s3_batch_output = 's3://%s/%s/%s_fold_'%(bucket,transformation_output_folder,model_name)
#single input file shared by all folds
s3_batch_input = 's3://%s/%s/%s.csv'%(bucket,transformation_input_folder,dataset_name)

In [None]:
%%time
#packages used in the notebook
import time
import sys
import boto3
import re
import sagemaker
from sagemaker.session import s3_input
from sagemaker.xgboost.model import XGBoostModel
import pandas as pd
import numpy as np
import s3fs

In [None]:
%%time
#AWS handles used by the rest of the notebook; run this cell before any cell that talks to AWS.
#The role ARN has to be supplied explicitly when the notebook runs on a local machine.
#NOTE(review): the value below looks like an incomplete placeholder - fill in before running.
role = 'arn:aws:iam::'
#One boto3 session (default credentials/config) provides both the region and the SageMaker client
aws_session = boto3.Session()
region, smclient = aws_session.region_name, aws_session.client('sagemaker')
#S3 file system object, used for the exists check before the upload
s3 = s3fs.S3FileSystem()

In [None]:
%%time
#data
#Read the local dataset; lines that cannot be parsed are skipped with a warning instead of
#aborting the read. pandas 1.3 replaced error_bad_lines=False with on_bad_lines and pandas 2.0
#removed the old flag, so the new spelling is tried first.
dataset_path = '%s%s.csv'%(DataDir,dataset_name)
try:
    dataset = pd.read_csv(dataset_path, on_bad_lines='warn', index_col=False)
except TypeError:
    #older pandas does not know the on_bad_lines keyword
    dataset = pd.read_csv(dataset_path, error_bad_lines=False, index_col=False)

In [None]:
%%time
#Upload the model input to S3 unless an object with that key is already there.
#Only the feature columns are written, without header and without index.
input_already_uploaded = s3.exists(s3_batch_input)
if input_already_uploaded:
    print('%s.csv exists in S3'%dataset_name)
else:
    print('%s.csv does not exist in S3. Loading...'%dataset_name)
    feature_frame = dataset[featureset]
    feature_frame.to_csv(s3_batch_input, header=False, index=False)

In [None]:
%%time
#Models to be used in prediction
#based on model files provided in models_data: one XGBoostModel per fold, named
#<model_name>-<fold number>
models = list()
for i, m in enumerate(models_data):
    name = '%s-%s'%(model_name, i)
    #Try to delete a SageMaker model with the same name before defining a new one based on
    #the model file. Only errors reported by the service are handled; anything else
    #(including a keyboard interrupt) propagates.
    try:
        smclient.delete_model(ModelName=name)
        print('%s model was deleted'%name)
    except smclient.exceptions.ClientError as err:
        #Usually the model simply does not exist yet. The service message is shown so that
        #a permission or credential problem is not mistaken for that case.
        print('%s model does not exist or could not be deleted: %s'%(name, err))
    xgb_inference_model = XGBoostModel(
        name=name,
        model_data=m,
        role=role,
        entry_point=transformation_job_entry_point,
        framework_version="1.0-1",
    )
    models.append(xgb_inference_model)
    print('%s model was created'%name)

In [None]:
%time
tranform_jobs = list()
tranformers = list()
i = 0
for m in models:       
#Create transform job
    s3_batch_output_fold=s3_batch_output+'%s'%i
    transformer =  m.transformer(
                                              instance_count=1, 
                                              instance_type=instance_type,
                                              output_path=s3_batch_output_fold,
                                              accept='text/csv',
                                              strategy='MultiRecord',
                                              assemble_with='Line'
                                             ,env = {'SAGEMAKER_MODEL_SERVER_TIMEOUT' : str(transformation_job_timeout) }
                                             )
    tranformers.append(transformer)
    transformer.transform(data=s3_batch_input, content_type='text/csv',split_type='Line')
    job_name = transformer.latest_transform_job.name
    tranform_jobs.append(job_name)
    print('Job %s started'%job_name)
    i = i + 1

In [None]:
%time
#If there are not complete training jobs in 60 minutes, it's better to look into logs. Took 53 min 1 job for ~2M rows
check_every_sec=60
print_every_n_output=6
t = 0
n = 0
minutes_to_wait=60*60/check_every_sec
while True:
    statuses = list()
    n = n + 1
    for e,j in zip(tranformers,tranform_jobs):
        status=e.sagemaker_session.describe_transform_job(j)['TransformJobStatus']
        if n==print_every_n_output:
            print('Transform job %s status: %s'%(j,status))
        statuses.append(status)
    if 'InProgress' in statuses:
        if n==print_every_n_output:
            print('Continue waiting...')
            n = 0
    else:
        if set(statuses)=={'Completed'}:
            print('All Transform Jobs are Completed')
        else:
            print('Something went wrong.')
        break 
    t = t+1
    if t>minutes_to_wait:
        print('Something went wrong. Transform jobs are still running.')
        break
    time.sleep(check_every_sec)

In [None]:
%time
i = 0
kfold=len(models)
for m in models:   
    print('Reading predicted data fold: {}  of  {} : '.format(i+1, kfold))
    s3_batch_output_fold=s3_batch_output+'%s/%s.csv.out'%(i,dataset_name)
    SageMakerPrediction_dataset = pd.read_csv(s3_batch_output_fold, names=transform_output_with_shapvalues, error_bad_lines=False, index_col=False) 
    SageMakerPrediction_dataset['ModelName']=model_name
    SageMakerPrediction_dataset['fold']=i
    #other columns from the original dataset
    for f in featureset:
        SageMakerPrediction_dataset[f]=dataset[f]
    SageMakerPrediction_dataset = SageMakerPrediction_dataset[['ModelName','fold']+featureset+transform_output_with_shapvalues]
    SageMakerPrediction_dataset.to_csv('%s%s_%s_%s_sage_maker_shap_values.csv'%(DataDir,dataset_name,model_name,i),header=True,index=False)
    i = i+1