### AWS Sagemaker XGBoost Partial Dependency

Purpose: build partial dependency data for all configured features and models. 
Main Steps:
1. Build separate datasets for each feature as a combination of each unique feature value with all other available feature combinations. Processing job is used.
2. Create Sagemaker Models
3. Run batch transform jobs to make inference from all files created at step 1
4. Average the results of the batch transform jobs per model, feature and feature value.

In [1]:
#Local working folder and the Excel workbook that holds the experiment configuration
temp_folder='/home/kate/Research/Property/Notebooks/Experiments/tmp/'
Experiments_file='/home/kate/Research/Property/Notebooks/Experiments/Logs/Set1-Classification.xlsx'
AllExperiments_tab='Experiments'
Experiment_name='BaseFeaturesPD'
#Experiment configuration:
#1. Features we want to produce partial dependency for.
#Each line of the tab contains a feature name and a type: Categorical or Continuous.
#Categorical features use all of their unique values as is. Continuous features use only
#sorted(np.linspace(np.percentile(dataset_temp[f], 0.1),np.percentile(dataset_temp[f], 99.5),50))
Experiment_Features_tab='%s Features'%Experiment_name
#2. The full set of features the tested model is based on.
#Each line of the tab contains the model name and the set of features used to build a dataset for SageMaker
Experiment_tab='%s Models'%Experiment_name
#3. ModelFiles: each line is a model name (Model) and the full model file name (ModelData - model.tar.gz) in an S3 bucket
Experiment_ModelFiles_tab='%s ModelFiles'%Experiment_name

#SageMaker trial names, one per pipeline stage
Trial_name_preprocessing='%s-PreparingData'%Experiment_name
Trial_name_inference='%s-Inference'%Experiment_name
Trial_name_postprocessing='%s-PostProcessing'%Experiment_name



#S3 bucket and key prefixes for the source data, intermediate files, final results and configuration
bucket='kdproperty'
path_to_data='Data'
path_to_pd_input_data='Data/Experiments/%s/pd_input_data'%Experiment_name
path_to_pd_output_data='Data/Experiments/%s/pd_output_data'%Experiment_name
path_to_pd_final_data='Data/Experiments/%s/pd_final_data'%Experiment_name
path_to_configuration='Config'

#The data files produced for continuous features can be very large. To be able to run more than 1 instance
#for batch transform jobs and speed up the process, each continuous feature file is split into split_to_N_parts parts
split_to_N_parts=5

instance_type_preprocessing='ml.t3.2xlarge'
instance_count_preprocessing=1

instance_type_inference='ml.m5.xlarge'
#Can be the number of features for pd: one data file per feature and a corresponding instance for inference.
#If there is a continuous feature with a lot of possible values, its huge file is split into N parts,
#then the instance count is: number of categorical features + split_to_N_parts * number of continuous features
instance_count_inference=16 


instance_type_postprocessing='ml.t3.2xlarge'
instance_count_postprocessing=1

In [2]:
import boto3
import os
import sys
import time
import pandas as pd
import numpy as np

import re

#to read data from S3 with pandas
import s3fs

import matplotlib.pyplot as plt

import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

from sagemaker.xgboost.model import XGBoostModel


In [3]:
#AWS region of the default boto3 session
region = boto3.session.Session().region_name
#NOTE(review): the execution role ARN (including the account id) is hardcoded here, while
#get_execution_role is imported above but not used - confirm whether that is intentional
role = 'arn:aws:iam::757107622481:role/service-role/AmazonSageMaker-ExecutionRole-20200819T131882'
#SageMaker session that uses the experiment bucket as its default bucket
sagemaker_session = sagemaker.session.Session(default_bucket=bucket)
#S3 filesystem handle from s3fs
s3 = s3fs.S3FileSystem()

In [4]:
#sys.path.append('/home/kate/Research/YearBuilt/Notebooks/Experiments')
import ExperimentsUtils as eu

1. Reading experiment configuration from an excel file

In [5]:
#List of all experiments; the path is passed directly so pandas opens and closes the workbook itself
#(the previous open(...) call left the file handle unclosed)
experiments = pd.read_excel(Experiments_file, sheet_name=AllExperiments_tab)

1.1 Target variable and data file name

In [6]:
#Row(s) of the experiments list that describe the current experiment
experiment_row = experiments[experiments['Experiment']==Experiment_name]

#Target variable of the experiment models
target = experiment_row['Target'].values[0]
print('Target of models in %s experiment is %s'%(Experiment_name,target))

#File name of the dataset used in the experiment
data_file = experiment_row['Dataset'].values[0]
print('Datafile used in %s experiment is %s'%(Experiment_name,data_file))

Target of models in BaseFeaturesPD experiment is hasclaim
Datafile used in BaseFeaturesPD experiment is property_water_claims_non_cat_fs_v5.csv


1.2 Features to produce partial dependency

In [7]:
#Features (and their types) to produce partial dependency for; the path is passed directly
#so pandas opens and closes the workbook itself instead of leaving an open(...) handle unclosed
features_for_pd = pd.read_excel(Experiments_file, sheet_name=Experiment_Features_tab)
features_for_pd

Unnamed: 0,Feature,Type
0,poolind,Categorical
1,usagetype_encd,Categorical
2,landlordind,Categorical
3,replacementcostdwellingind,Categorical
4,pipe_froze_3_blk,Categorical
5,customer_cnt_active_policies,Categorical
6,cova_limit,Categorical
7,plumb_leak_3_blk,Categorical


1.3 Model features. The process will use this info to create a data file for each feature

In [8]:
#Complete feature set of every model; the path is passed directly so pandas opens and closes
#the workbook itself instead of leaving an open(...) handle unclosed
model_all_features = pd.read_excel(Experiments_file, sheet_name=Experiment_tab)
model_all_features

Unnamed: 0,Model,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12
0,BaseModel,cal_year-yearbuilt,cova_deductible,sqft,customer_cnt_active_policies,usagetype_encd,replacementcostdwellingind,pipe_froze_3_blk,landlordind,water_risk_3_blk,poolind,cova_limit,plumb_leak_3_blk


1.4 Model files (usually model.tar.gz produced from training)
Later SageMaker Models will be created  based on this info. 

In [9]:
#Model name -> model artifact location; the path is passed directly so pandas opens and closes
#the workbook itself instead of leaving an open(...) handle unclosed
model_files = pd.read_excel(Experiments_file, sheet_name=Experiment_ModelFiles_tab)
model_files

Unnamed: 0,Model,ModelData
0,BaseModel,s3://kdproperty/Models/Experiments/FeatureSet/...


1.5.Verification if we have the same set of features and models in both configurations

In [10]:
#Model names listed in the feature set configuration and in the model files configuration
models_from_model_features = model_all_features['Model'].tolist()
models_from_models_files = model_files['Model'].tolist()

#Every model that has a configured feature set must also have a model file
if any(x not in models_from_models_files for x in models_from_model_features):
    raise Exception('Different set of models in featuresets and files!')

In [11]:
#Features requested for partial dependency
models_features_for_pd = features_for_pd['Feature'].tolist()

for index, row in model_all_features.iterrows():
    #positions 1..50 of the row hold the model feature names; empty cells are read as NaN and dropped
    model_complete_featureset = [x for x in row[1:51].tolist() if str(x) != 'nan']
    #every requested feature must be part of the model feature set
    if any(x not in model_complete_featureset for x in models_features_for_pd):
        raise Exception('Different set of features in models all features and features for partial dependency!')

2.Saving into S3 models configurations (sets of features) to be used in data preprocessing

In [12]:
#Name of the models configuration file and its local path in the temp folder
Model_Config_file='%s.csv'%Experiment_name
Models_Config_path = os.path.join(temp_folder, Model_Config_file) 

#Save the models feature sets as csv with a header row and without the index column
model_all_features.to_csv(Models_Config_path, header=True, index=False)


#Upload the configuration file to the bucket under the configuration prefix;
#the processing jobs below read it from s3://<bucket>/<path_to_configuration>/<Model_Config_file>
input_code = sagemaker_session.upload_data(
        Models_Config_path,
        bucket=bucket,
        key_prefix=path_to_configuration
    )

3.Creating experiments and trials in SageMaker

In [13]:
#The helpers come from the local ExperimentsUtils module; their implementation is not visible in this notebook.
#Presumably cleanup_experiment removes a previous experiment with the same name before it is re-created - TODO confirm
eu.cleanup_experiment(Experiment_name)
eu.create_experiment(Experiment_name)
#One trial per pipeline stage: data preparation, inference and post-processing
eu.create_trial(Experiment_name,Trial_name_preprocessing)
eu.create_trial(Experiment_name,Trial_name_inference)
eu.create_trial(Experiment_name,Trial_name_postprocessing)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials

4. Preparing datasets for calculating partial dependency
The process loops through each model in the configuration file (saved into S3 above) and each feature. It creates a combination of each unique feature value with the rest of the feature columns available in the dataset. Depending on the feature type (Categorical or Continuous), the full set of unique feature values is used for categorical features, or a subset (50 evenly spaced values between the 0.1th and 99.5th percentiles) for continuous features. Only one file is created for each categorical feature, while the files for continuous features can be split into parts.

In [14]:
%%writefile preprocessingDataForPartialDependency.py

#The code creates a separate dataset for each feature with all possible combination of feature values and the rest of the data
#dataset for SageMaker are the same structure: no headers, the first column is a target and the rest are features


import argparse
import os
import pandas as pd
import numpy as np


if __name__=='__main__':
    #Entry point of the processing job.
    #For every model in the config file and every feature requested for partial dependency it writes
    #a dataset where the feature column is replaced by each grid value in turn while all the other
    #model columns keep their original values. Files are written without header and index to
    #/opt/ml/processing/output/<model>/<feature>.csv (categorical) or <feature>_<part>.csv (continuous).
    #
    #Arguments:
    #  --data_file         file name of the dataset inside /opt/ml/processing/input
    #  --config_file       file name of the models configuration inside /opt/ml/processing/config
    #  --featureset        comma separated feature names to build partial dependency data for
    #  --featuretypes      comma separated feature types (Continuous or anything else = categorical)
    #  --split_to_N_parts  number of files each continuous feature dataset is split into

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', type=str)
    parser.add_argument('--config_file', type=str)
    parser.add_argument('--featureset', type=str)
    parser.add_argument('--featuretypes', type=str)
    parser.add_argument('--split_to_N_parts', type=int, default=1)
    args, _ = parser.parse_known_args()
    print('Received arguments {}'.format(args))

    featureset=args.featureset.split(',')
    featuretypes=args.featuretypes.split(',')
    #zip() below would silently ignore the extra entries, so fail early on a mismatch
    if len(featureset)!=len(featuretypes):
        raise ValueError('featureset has %s entries but featuretypes has %s'%(len(featureset),len(featuretypes)))
    split_to_N_parts=args.split_to_N_parts
    input_data_path = os.path.join('/opt/ml/processing/input', args.data_file)
    config_data_path = os.path.join('/opt/ml/processing/config', args.config_file)

    print('Reading input data from {}'.format(input_data_path))
    dataset = pd.read_csv(input_data_path, error_bad_lines=False, index_col=False)

    print('Reading config data from {}'.format(config_data_path))
    models = pd.read_csv(config_data_path, error_bad_lines=False, index_col=False)

    #iterating thru config file with models and featureset
    for index, row in models.iterrows():
        model=row['Model']
        print (index, ': Creating featuresets for model %s'%model)
        #positions 1..50 of the row hold the model feature names; empty cells are read as NaN and dropped
        model_complete_featureset=[x for x in row[1:51].tolist() if str(x) != 'nan']
        #specific folder for each model data
        model_output_dir='/opt/ml/processing/output/%s'%model
        os.makedirs(model_output_dir, exist_ok=True)

        #model input columns in the configured order; a feature can be an expression (evaluated with DataFrame.eval).
        #The frame does not depend on the feature being processed, so it is built once per model
        model_dataset = pd.DataFrame()
        for f in model_complete_featureset:
            model_dataset[f]=dataset.eval(f)

        #iterating thru features for pd
        for feature,ftype in zip(featureset,featuretypes):
            print(feature,ftype)
            if ftype=='Continuous':
                #continuous: 50 evenly spaced values between the 0.1th and the 99.5th percentile
                grid = sorted(np.linspace(np.percentile(model_dataset[feature], 0.1),
                                          np.percentile(model_dataset[feature], 99.5),
                                          50))
            else:
                #categorical: every unique value
                grid = sorted(model_dataset[feature].unique())

            #one copy of the model data per grid value, concatenated once
            #(appending inside the loop copied the accumulated frame on every iteration)
            frames = [model_dataset.assign(**{feature: val}) for val in grid]
            dataset_feature = pd.concat(frames) if frames else pd.DataFrame()

            #save in parts if large dataset
            if ftype=='Continuous':
                parts = np.array_split(dataset_feature, split_to_N_parts)
                for i,p in enumerate(parts):
                    output_data_path = os.path.join(model_output_dir, '%s_%s.csv'%(feature,i))
                    p.to_csv(output_data_path,header=False,index=False)
            else:
                output_data_path = os.path.join(model_output_dir, '%s.csv'%feature)
                dataset_feature.to_csv(output_data_path,header=False,index=False)
        

Overwriting preprocessingDataForPartialDependency.py


In [15]:
processors=list()

#Feature names and types are passed to the script as comma separated command line arguments
featureset=features_for_pd['Feature'].tolist()
featuretypes=features_for_pd['Type'].tolist()

data_processor = SKLearnProcessor(framework_version='0.20.0',
                                     role=role,
                                     instance_type=instance_type_preprocessing,
                                     instance_count=instance_count_preprocessing)
    
#Inputs: the dataset and the models configuration file. Output: everything the script writes to
#/opt/ml/processing/output is uploaded to the pd_input_data prefix. wait=True blocks until the job ends.
data_processor.run(code='preprocessingDataForPartialDependency.py',
                        inputs= [ProcessingInput(input_name='data',source='s3://%s/%s/%s'%(bucket,path_to_data,data_file),destination='/opt/ml/processing/input'),
                                ProcessingInput(input_name='config',source='s3://%s/%s/%s'%(bucket,path_to_configuration,Model_Config_file),destination='/opt/ml/processing/config'),
                                ],
                        outputs=[
                                ProcessingOutput(output_name='output', source='/opt/ml/processing/output', destination='s3://%s/%s/'%(bucket,path_to_pd_input_data)),                                                          
                                ],
                        arguments=['--data_file',data_file,
                                '--config_file',Model_Config_file,
                                 '--featureset', ','.join(featureset).replace(' ',''),
                                 '--featuretypes', ','.join(featuretypes).replace(' ',''),
                                 '--split_to_N_parts',str(split_to_N_parts)],
                        experiment_config = {
        'ExperimentName': Experiment_name ,
        'TrialName' : Trial_name_preprocessing,
        'TrialComponentDisplayName' : Trial_name_preprocessing},
                        wait=True
                        )
processors.append(data_processor)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2021-05-31-01-28-23-447



Job Name:  sagemaker-scikit-learn-2021-05-31-01-28-23-447
Inputs:  [{'InputName': 'data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Data/property_water_claims_non_cat_fs_v5.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Config/BaseFeaturesPD.csv', 'LocalPath': '/opt/ml/processing/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-757107622481/sagemaker-scikit-learn-2021-05-31-01-28-23-447/input/code/preprocessingDataForPartialDependency.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionT

In [16]:
#Do not continue if the input data for the models was not created:
#look up the name of the most recent processing job and check its final status
job_name = data_processor.jobs[-1].describe()['ProcessingJobName']
preprocessing_succeeded = sagemaker_session.was_processing_job_successful(job_name)
if not preprocessing_succeeded:
    raise Exception('Preprocessing job Failed!')

5. Running inference jobs to produce partial dependency (prediction)

5.1. Script for inference. The file must have name inference.py!!!!

In [17]:
%%writefile inference.py 
import json
import os
import pickle as pkl

import numpy as np

import sagemaker_xgboost_container.encoder as xgb_encoders


def model_fn(model_dir):
    """
    Deserialize and return the fitted model stored in ``model_dir``.

    Args:
        model_dir: directory that contains the pickled model file named "xgboost-model".

    Returns:
        The object unpickled from the model file.

    Raises:
        OSError: if the model file cannot be opened.
    """
    model_file = "xgboost-model"
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted model artifacts.
    # The context manager closes the file handle, which the bare open() call never did.
    with open(os.path.join(model_dir, model_file), "rb") as model_handle:
        booster = pkl.load(model_handle)
    return booster


def input_fn(request_body, request_content_type):
    """
    Convert the request payload into the object passed on to `predict_fn`.

    Only "text/csv" payloads are accepted: leading and trailing newline characters are
    removed and the remaining text is handed to `xgb_encoders.csv_to_dmatrix`.

    Raises:
        ValueError: for any other content type.
    """
    if request_content_type != "text/csv":
        raise ValueError(
            "Content type {} is not supported.".format(request_content_type)
        )
    csv_payload = request_body.strip('\n')
    return xgb_encoders.csv_to_dmatrix(csv_payload)


def predict_fn(input_data, model):
    """
    Run the model on the object produced by `input_fn`.

    Args:
        input_data: return value of `input_fn`.
        model: return value of `model_fn`; must provide a `predict` method.

    Returns:
        Whatever `model.predict(input_data)` returns.
    """
    return model.predict(input_data)

Writing inference.py


5.2. Creating models in SageMaker to be used in inference (prediction) based on model files provided in models_ModelFiles

In [18]:
#SageMaker API client used to delete stale models. It was never defined before, so the
#delete call below always failed with a NameError that the bare except silently swallowed
smclient = boto3.client('sagemaker')

models = list()
model_names = list()
for index, row in model_files.iterrows():
    #Try to delete the model if it exists and create a new model based on a model file.
    #Underscores in the configured name are replaced with dashes for the SageMaker model name
    name=row['Model'].replace('_','-')
    model_data=row['ModelData']
    print(name,model_data)
    try:
        smclient.delete_model(ModelName=name)
        print('%s model was deleted'%name)
    except smclient.exceptions.ClientError as error:
        #best effort: a missing model is expected on the first run; anything else is reported, not hidden
        #NOTE(review): assumes a missing model is reported with the ValidationException code - confirm
        if error.response.get('Error', {}).get('Code')=='ValidationException':
            print('%s model does not exist'%name)
        else:
            print('%s model could not be deleted: %s'%(name,error))
    xgb_inference_model = XGBoostModel(
        name=name,
        model_data=model_data,
        role=role,
        entry_point='inference.py',
        framework_version="1.0-1",
    )
    models.append(xgb_inference_model)
    model_names.append(name)
    print('%s model was created'%name)

BaseModel s3://kdproperty/Models/Experiments/FeatureSet/BaseModel-0-2021-05-30-05-32-15/output/model.tar.gz
BaseModel model does not exist
BaseModel model was created


5.3. Running transform jobs using inference.py script and models created above

In [19]:
#Start one batch transform job per model without waiting for completion
tranform_jobs = list()
tranformers = list()
#models/model_names were built from model_files row by row, so the three sequences are aligned.
#The preprocessing job writes its files under the configured model name, while the SageMaker
#model name has '_' replaced with '-'; the input prefix therefore has to use the configured name,
#otherwise models with '_' in their name would point to a prefix that does not exist
for m,model_name,config_model_name in zip(models,model_names,model_files['Model']):
    s3_batch_input='s3://%s/%s/%s'%(bucket,path_to_pd_input_data,config_model_name)
    #the post-processing script reads predictions from the prefix named after the dashed model name
    s3_batch_output_model = 's3://%s/%s/%s'%(bucket,path_to_pd_output_data,model_name)
    print(model_name)
    transformer =  m.transformer(
                                              instance_count=instance_count_inference,
                                              instance_type=instance_type_inference,
                                              output_path=s3_batch_output_model,
                                              accept='text/csv',
                                              strategy='MultiRecord',
                                              assemble_with='Line'
                                             )
    tranformers.append(transformer)
    transformer.transform(data=s3_batch_input, content_type='text/csv',split_type='Line', wait=False,
    experiment_config = {
        'ExperimentName': Experiment_name ,
        'TrialName' : Trial_name_inference,
        'TrialComponentDisplayName' : '%s-%s'%(Trial_name_inference,model_name.replace('_','-')),})
    job_name = transformer.latest_transform_job.name
    tranform_jobs.append(job_name)
    print('Job %s started'%job_name)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


BaseModel


INFO:sagemaker:Creating model with name: BaseModel
INFO:sagemaker:Creating transform job with name: BaseModel-2021-05-31-02-10-12-096


Job BaseModel-2021-05-31-02-10-12-096 started


In [20]:
#Helper from the local ExperimentsUtils module (implementation not visible here); judging by the output below
#it polls the transform jobs until all of them are completed - TODO confirm the meaning of the timing arguments
eu.wait_transform_jobs(processors=tranformers,tranform_jobs=tranform_jobs,check_every_sec=10,print_every_n_output=20,wait_min=60)

Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
Transforming job BaseModel-2021-05-31-02-10-12-096 status: InProgress
Continue waiting...
All Transforming Jobs are Completed


6. Post processing partial dependency results (averaging by value)

In [21]:
%%writefile postprocessingPartialDependencyData.py

#The code joins InputData files for each feature and inference from each model and then average by each feature value


import argparse
import os
import pandas as pd
import numpy as np


if __name__=='__main__':
    #Entry point of the post-processing job.
    #For every feature and model it joins the inference input files with the predictions produced by
    #the batch transform jobs (row by row, same order), averages the prediction per feature value and
    #writes one file /opt/ml/processing/output/data.csv with the columns
    #feature, value, <model>_pd (one per model), value2 (original text of an encoded value when available).
    #
    #Arguments:
    #  --data_file         file name of the main dataset inside /opt/ml/processing/input
    #  --config_file       file name of the models configuration inside /opt/ml/processing/config
    #  --featureset        comma separated feature names partial dependency was built for
    #  --featuretypes      comma separated feature types (Continuous or anything else = categorical)
    #  --split_to_N_parts  number of files each continuous feature dataset was split into

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_file', type=str)
    parser.add_argument('--config_file', type=str)
    parser.add_argument('--featureset', type=str)
    parser.add_argument('--featuretypes', type=str)
    parser.add_argument('--split_to_N_parts', type=int, default=1)
    args, _ = parser.parse_known_args()
    print('Received arguments {}'.format(args))

    featureset=args.featureset.split(',')
    featuretypes=args.featuretypes.split(',')
    #zip() below would silently ignore the extra entries, so fail early on a mismatch
    if len(featureset)!=len(featuretypes):
        raise ValueError('featureset has %s entries but featuretypes has %s'%(len(featureset),len(featuretypes)))
    split_to_N_parts=args.split_to_N_parts
    input_data_path = os.path.join('/opt/ml/processing/input', args.data_file)
    config_data_path = os.path.join('/opt/ml/processing/config', args.config_file)

    output_data_path = os.path.join('/opt/ml/processing/output', 'data.csv')

    print('Reading input data from {}'.format(input_data_path))
    dataset = pd.read_csv(input_data_path, error_bad_lines=False, index_col=False)

    print('Reading config data from {}'.format(config_data_path))
    models = pd.read_csv(config_data_path, error_bad_lines=False, index_col=False)

    #average pd by each feature value, one frame per feature; concatenated once at the end
    feature_frames = []

    for f,ftype in zip(featureset,featuretypes):
        print('Processing %s'%f)

        #continuous data can be split to split_to_N_parts files, categorical data is always one file
        if ftype=='Continuous':
            file_stems=['%s_%s'%(f,j) for j in range(0,split_to_N_parts)]
        else:
            file_stems=[f]

        #text value for categorical encd columns,
        #assuming there is encoded (_encd ended) and original values in the dataset.
        #The mapping is built from the unique (original, code) pairs directly: joining both with '-' and
        #splitting on the first '-' broke for original values that contain '-' and failed on missing values
        code_to_text = {}
        original_column = f.replace('_encd','')
        if '_encd' in f and original_column in dataset.columns:
            unique_pairs = dataset[[original_column, f]].dropna(subset=[original_column]).drop_duplicates()
            code_to_text = dict(zip(unique_pairs[f].astype(str), unique_pairs[original_column].astype(str)))

        #iterating thru config file with models and featureset
        df_all_models=pd.DataFrame()
        for index, row in models.iterrows():
            model=row['Model']
            print (index, ': Creating featuresets for model %s'%model)
            #positions 1..50 of the row hold the model feature names; empty cells are read as NaN and dropped
            model_complete_featureset=[x for x in row[1:51].tolist() if str(x) != 'nan']
            input_dir='/opt/ml/processing/input/InputData/%s'%model
            #predictions are stored under the model name with '_' replaced by '-'
            predicted_dir='/opt/ml/processing/input/PartialDependency/%s'%model.replace('_','-')

            #inference input: files have no header, columns are the model features in the configured order
            feature_InputData_dataset = pd.concat(
                [pd.read_csv(os.path.join(input_dir, '%s.csv'%stem), names=model_complete_featureset,
                             error_bad_lines=False, index_col=False)
                 for stem in file_stems],
                ignore_index=True)

            print('Reading predicted data from model %s'%model)
            #predictions: one value per input line, in the same order as the input files
            pdf_dataset = pd.concat(
                [pd.read_csv(os.path.join(predicted_dir, '%s.csv.out'%stem), names=['pd'],
                             error_bad_lines=False, index_col=False)
                 for stem in file_stems],
                ignore_index=True)

            #the two frames are joined by position, so they must have the same number of rows
            if len(pdf_dataset)!=len(feature_InputData_dataset):
                raise ValueError('Model %s, feature %s: %s input rows but %s predictions'
                                 %(model,f,len(feature_InputData_dataset),len(pdf_dataset)))

            #model feature partial dependency columns name
            pd_column_name='%s_pd'%model
            feature_InputData_dataset[pd_column_name]= pdf_dataset['pd'].values

            #average
            fm_s = feature_InputData_dataset.groupby(f)[pd_column_name].mean()
            fm_pd=pd.DataFrame({'feature':f, 'value':fm_s.index, pd_column_name:fm_s.values},
                               columns=['feature','value',pd_column_name])

            #value2 is the text form of the value; encoded values are replaced by their original text
            fm_pd['value2']=fm_pd['value'].astype(str)
            if code_to_text:
                fm_pd['value2']=fm_pd['value2'].map(lambda v: code_to_text.get(v, v))

            if len(df_all_models)==0:
                df_all_models=fm_pd
            else:
                df_all_models = pd.merge(df_all_models,fm_pd, on=['feature','value','value2'], how='outer')

        feature_frames.append(df_all_models)

    #saving final output
    all_fm_pd = pd.concat(feature_frames) if feature_frames else pd.DataFrame()
    all_fm_pd.to_csv(output_data_path,header=True ,index=False)

Writing postprocessingPartialDependencyData.py


In [22]:
processors=list()

#Feature names and types are passed to the script as comma separated command line arguments
featureset=features_for_pd['Feature'].tolist()
featuretypes=features_for_pd['Type'].tolist()


data_processor = SKLearnProcessor(framework_version='0.20.0',
                                     role=role,
                                     instance_type=instance_type_postprocessing,
                                     instance_count=instance_count_postprocessing)

#Output: everything the script writes to /opt/ml/processing/output is uploaded to the pd_final_data prefix.
#wait=True blocks until the job ends.
data_processor.run(code='postprocessingPartialDependencyData.py',
                        inputs= [
                        #1.Main dataset
                        ProcessingInput(input_name='input',source='s3://%s/%s/%s'%(bucket,path_to_data,data_file),destination='/opt/ml/processing/input'),  
                        #2.Models config file
                        ProcessingInput(input_name='config',source='s3://%s/%s/%s'%(bucket,path_to_configuration,Model_Config_file),destination='/opt/ml/processing/config'),
                        #3.preprocessed features data for inference     
                        ProcessingInput(input_name='InputData', source='s3://%s/%s'%(bucket,path_to_pd_input_data), destination='/opt/ml/processing/input/InputData'),
                        #4.inference (predicted) data
                        ProcessingInput(input_name='PartialDependency', source='s3://%s/%s'%(bucket,path_to_pd_output_data), destination='/opt/ml/processing/input/PartialDependency')    
                                 ],
                        outputs=[ProcessingOutput(output_name='output', source='/opt/ml/processing/output', destination='s3://%s/%s'%(bucket,path_to_pd_final_data)),                                                          
                                ],
                        arguments=['--data_file',data_file,
                                '--config_file',Model_Config_file,
                                 '--featureset', ','.join(featureset).replace(' ',''),
                                 '--featuretypes', ','.join(featuretypes).replace(' ',''),
                                 '--split_to_N_parts',str(split_to_N_parts)],
                        experiment_config = {
        'ExperimentName': Experiment_name ,
        'TrialName' : Trial_name_postprocessing,
        'TrialComponentDisplayName' : Trial_name_postprocessing},
                        wait=True
                        )
processors.append(data_processor)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2021-05-31-02-39-26-167



Job Name:  sagemaker-scikit-learn-2021-05-31-02-39-26-167
Inputs:  [{'InputName': 'input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Data/property_water_claims_non_cat_fs_v5.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Config/BaseFeaturesPD.csv', 'LocalPath': '/opt/ml/processing/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'InputData', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://kdproperty/Data/Experiments/BaseFeaturesPD/pd_input_data', 'LocalPath': '/opt/ml/processing/input/InputData', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'PartialDependency', 'AppManaged':

In [23]:
#Stop the execution if the post-processing job did not finish successfully
#(the comment and the error message were copied from the preprocessing check and named the wrong stage)
job_name=data_processor.jobs[-1].describe()['ProcessingJobName']
if not(sagemaker_session.was_processing_job_successful(job_name)):
    raise Exception('Postprocessing job Failed!')

In [26]:
#Location of the final file in S3; data.csv is the name hardcoded in postprocessingPartialDependencyData.py
pd_output_file='s3://%s/%s/data.csv'%(bucket,path_to_pd_final_data)

7. Read data from S3 and visualize and save to a log
- header as hardcoded in postprocessingPartialDependencyData.py: feature name, feature value, models partial dependency value (as we configured in the experiment), feature original value if encoded
- the output file name (required in this case) is data.csv, hardcoded in postprocessingPartialDependencyData.py

In [27]:
#Read the averaged partial dependency data straight from S3 (the file has a header row)
pdf_dataset = pd.read_csv(pd_output_file,  error_bad_lines=False, index_col=False) 
pdf_dataset.head()

Unnamed: 0,feature,value,BaseModel_pd,value2
0,poolind,0,0.010496,0
1,poolind,1,0.010546,1
2,usagetype_encd,1,0.010181,UNOCCUPIED
3,usagetype_encd,2,0.010181,SECONDARY
4,usagetype_encd,3,0.010181,SEASONAL


In [29]:
#Saving into the Experiment log or load to redshift directly.
#SaveToExperimentLog comes from the local ExperimentsUtils module; presumably it writes the frame
#to the '<Experiment_name> Data' tab of the experiments workbook - TODO confirm
eu.SaveToExperimentLog(Experiments_file, '%s Data'%Experiment_name, pdf_dataset)

8. Averaging partial dependency by all models (assuming models are parts of a cv-folds)

In [30]:
#Sum the partial dependency columns of all configured models, starting from a zero column
pd_total = pd.Series(0, index=pdf_dataset.index)
for m in model_files['Model']:
    pd_total = pd_total + pdf_dataset['%s_pd'%m]

#The average over the models is stored in the 'pd' column
pdf_dataset['pd'] = pd_total/len(model_files)
pdf_dataset.head()

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Unnamed: 0,feature,value,BaseModel_pd,value2,pd
0,poolind,0,0.010496,0,0.010496
1,poolind,1,0.010546,1,0.010546
2,usagetype_encd,1,0.010181,UNOCCUPIED,0.010181
3,usagetype_encd,2,0.010181,SECONDARY,0.010181
4,usagetype_encd,3,0.010181,SEASONAL,0.010181


9. Visualization Partial Dependency with bokeh

In [36]:
import bokeh
import bokeh.io
bokeh.io.output_notebook()
from bokeh.plotting import  figure, show
from bokeh.models import HoverTool   
from bokeh.io import export_png
class HoverHelper():
    """Builds the tool list passed to the partial dependency figures."""

    def hovertool(self):
        """Return a new HoverTool showing the 'pd' and 'value2' data fields."""
        return HoverTool(tooltips=[('pd', '@pd'), ('value', '@value2')])

    def tools(self, standard_tools='pan,crosshair,wheel_zoom,zoom_in,zoom_out,undo,reset'):
        """Return a two-item list: a fresh hover tool, then the `standard_tools` string."""
        hover_tool = self.hovertool()
        return [hover_tool, standard_tools]


# Shared helper instance used by every chart cell below.
hover = HoverHelper()

In [65]:
# Partial dependency rows for the 'poolind' feature
pdf_dataset.loc[pdf_dataset['feature'].eq('poolind')]

Unnamed: 0,feature,value,BaseModel_pd,value2,pd
0,poolind,0,0.010496,0,0.010496
1,poolind,1,0.010546,1,0.010546


In [56]:
# Scatter chart of the averaged partial dependency ('pd') for one feature;
# the chart is also exported to temp_folder as '<feature>.png'.
f='poolind'
p = figure(plot_width=900,
           plot_height=400,
           tools=hover.tools(),
           x_axis_label='Values',
           y_axis_label='Partial Dependency',
           title='%s Partial Dependency'%f)
# Plot against 'value' like every other feature chart: 'value2' also carries
# decoded string labels for other features, and it is still shown in the hover tooltip.
p.circle(source=pdf_dataset[pdf_dataset['feature']==f], x='value', y='pd')
export_png(p, filename=temp_folder+f+'.png')
show(p)

In [44]:
# Partial dependency rows for the 'usagetype_encd' feature
pdf_dataset.loc[pdf_dataset['feature'].eq('usagetype_encd')]

Unnamed: 0,feature,value,BaseModel_pd,value2,pd
2,usagetype_encd,1,0.010181,UNOCCUPIED,0.010181
3,usagetype_encd,2,0.010181,SECONDARY,0.010181
4,usagetype_encd,3,0.010181,SEASONAL,0.010181
5,usagetype_encd,4,0.010165,VACANT,0.010165
6,usagetype_encd,5,0.010165,COC,0.010165
7,usagetype_encd,6,0.010165,RENTAL,0.010165
8,usagetype_encd,7,0.010728,PRIMARY,0.010728


In [57]:
# Scatter chart of the averaged partial dependency ('pd') against the feature
# value; the chart is also exported to temp_folder as '<feature>.png'.
f='usagetype_encd'
p = figure(
    title='%s Partial Dependency'%f,
    x_axis_label='Values',
    y_axis_label='Partial Dependency',
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
)
p.circle(x='value', y='pd', source=pdf_dataset.loc[pdf_dataset['feature'] == f])
export_png(p, filename=temp_folder + f + '.png')
show(p)

In [47]:
# Partial dependency rows for the 'landlordind' feature
pdf_dataset.loc[pdf_dataset['feature'].eq('landlordind')]

Unnamed: 0,feature,value,BaseModel_pd,value2,pd
9,landlordind,0,0.0105,0,0.0105
10,landlordind,1,0.0105,1,0.0105


In [58]:
# Scatter chart of the averaged partial dependency ('pd') against the feature
# value; the chart is also exported to temp_folder as '<feature>.png'.
f='landlordind'
p = figure(
    title='%s Partial Dependency'%f,
    x_axis_label='Values',
    y_axis_label='Partial Dependency',
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
)
p.circle(x='value', y='pd', source=pdf_dataset.loc[pdf_dataset['feature'] == f])
export_png(p, filename=temp_folder + f + '.png')
show(p)

In [49]:
# Partial dependency rows for the 'replacementcostdwellingind' feature
pdf_dataset.loc[pdf_dataset['feature'].eq('replacementcostdwellingind')]

Unnamed: 0,feature,value,BaseModel_pd,value2,pd
11,replacementcostdwellingind,0,0.0105,0,0.0105
12,replacementcostdwellingind,1,0.0105,1,0.0105


In [61]:
# Scatter chart of the averaged partial dependency ('pd') against the feature
# value; the chart is also exported to temp_folder as '<feature>.png'.
f='replacementcostdwellingind'
p = figure(
    title='%s Partial Dependency'%f,
    x_axis_label='Values',
    y_axis_label='Partial Dependency',
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
)
p.circle(x='value', y='pd', source=pdf_dataset.loc[pdf_dataset['feature'] == f])
export_png(p, filename=temp_folder + f + '.png')
show(p)

In [50]:
# Partial dependency rows for the 'pipe_froze_3_blk' feature
pdf_dataset.loc[pdf_dataset['feature'].eq('pipe_froze_3_blk')]

Unnamed: 0,feature,value,BaseModel_pd,value2,pd
13,pipe_froze_3_blk,0,0.010638,0,0.010638
14,pipe_froze_3_blk,1,0.010486,1,0.010486
15,pipe_froze_3_blk,2,0.010468,2,0.010468
16,pipe_froze_3_blk,3,0.010468,3,0.010468
17,pipe_froze_3_blk,4,0.010468,4,0.010468
18,pipe_froze_3_blk,5,0.010468,5,0.010468


In [62]:
# Scatter chart of the averaged partial dependency ('pd') against the feature
# value; the chart is also exported to temp_folder as '<feature>.png'.
f='pipe_froze_3_blk'
p = figure(
    title='%s Partial Dependency'%f,
    x_axis_label='Values',
    y_axis_label='Partial Dependency',
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
)
p.circle(x='value', y='pd', source=pdf_dataset.loc[pdf_dataset['feature'] == f])
export_png(p, filename=temp_folder + f + '.png')
show(p)

In [52]:
# Partial dependency rows for the 'customer_cnt_active_policies' feature
pdf_dataset.loc[pdf_dataset['feature'].eq('customer_cnt_active_policies')]

Unnamed: 0,feature,value,BaseModel_pd,value2,pd
19,customer_cnt_active_policies,1,0.010526,1,0.010526
20,customer_cnt_active_policies,2,0.010406,2,0.010406
21,customer_cnt_active_policies,3,0.010384,3,0.010384
22,customer_cnt_active_policies,4,0.010381,4,0.010381
23,customer_cnt_active_policies,5,0.010381,5,0.010381
...,...,...,...,...,...
81,customer_cnt_active_policies,125,0.010471,125,0.010471
82,customer_cnt_active_policies,132,0.010471,132,0.010471
83,customer_cnt_active_policies,133,0.010471,133,0.010471
84,customer_cnt_active_policies,135,0.010471,135,0.010471


In [63]:
# Scatter chart of the averaged partial dependency ('pd') against the feature
# value; the chart is also exported to temp_folder as '<feature>.png'.
f='customer_cnt_active_policies'
p = figure(
    title='%s Partial Dependency'%f,
    x_axis_label='Values',
    y_axis_label='Partial Dependency',
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
)
p.circle(x='value', y='pd', source=pdf_dataset.loc[pdf_dataset['feature'] == f])
export_png(p, filename=temp_folder + f + '.png')
show(p)

In [59]:
# Scatter chart of the averaged partial dependency ('pd') against the feature
# value; the chart is also exported to temp_folder as '<feature>.png'.
f='cova_limit'
p = figure(
    title='%s Partial Dependency'%f,
    x_axis_label='Values',
    y_axis_label='Partial Dependency',
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
)
p.circle(x='value', y='pd', source=pdf_dataset.loc[pdf_dataset['feature'] == f])
export_png(p, filename=temp_folder + f + '.png')
show(p)

In [60]:
# Partial dependency rows for the 'plumb_leak_3_blk' feature
pdf_dataset.loc[pdf_dataset['feature'].eq('plumb_leak_3_blk')]

Unnamed: 0,feature,value,BaseModel_pd,value2,pd
98,plumb_leak_3_blk,0,0.010483,0,0.010483
99,plumb_leak_3_blk,1,0.010483,1,0.010483
100,plumb_leak_3_blk,2,0.010483,2,0.010483
101,plumb_leak_3_blk,3,0.010483,3,0.010483
102,plumb_leak_3_blk,4,0.010492,4,0.010492
103,plumb_leak_3_blk,5,0.010534,5,0.010534


In [64]:
# Scatter chart of the averaged partial dependency ('pd') against the feature
# value; the chart is also exported to temp_folder as '<feature>.png'.
f='plumb_leak_3_blk'
p = figure(
    title='%s Partial Dependency'%f,
    x_axis_label='Values',
    y_axis_label='Partial Dependency',
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
)
p.circle(x='value', y='pd', source=pdf_dataset.loc[pdf_dataset['feature'] == f])
export_png(p, filename=temp_folder + f + '.png')
show(p)

In [66]:
# Expected PNG path for every row of features_for_pd: temp_folder + feature name + '.png',
# the same naming the chart cells above use when exporting.
lst_chart_filenames = [
    temp_folder + feature_name + '.png'
    for feature_name in features_for_pd['Feature']
]
# NOTE(review): len(pdf_dataset)+10 and 25 look like the position where the charts
# are placed in the log - confirm against eu.SaveChartToExperimentLog.
eu.SaveChartToExperimentLog(Experiments_file, '%s Data'%Experiment_name, len(pdf_dataset)+10, 25, lst_chart_filenames)