In [2]:
import os 
import boto3 
from sagemaker import Session  
from sagemaker import get_execution_role 

#Import libraries to handle data prep and feature engineering
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn import preprocessing

# Import libraries to train the model in SageMaker
from sagemaker.s3 import S3Uploader 
from sagemaker.inputs import TrainingInput 
from sagemaker.image_uris import retrieve 
from sagemaker.estimator import Estimator 

#import libraries for bias and explainability
from sagemaker import clarify 

In [4]:
# SageMaker session, its AWS region, the notebook's execution role, and an S3 client
session = Session()
region = session.boto_region_name
role = get_execution_role()
s3_client = boto3.client("s3")

# S3 bucket and key prefixes used for input data and Clarify outputs
bucket = "responsibleai"
data_prefix = "data"
bias_prefix = "bias_explain"

# Fully qualified S3 locations derived from the bucket and prefixes above
input_data_path = f"s3://{bucket}/{data_prefix}"
explainability_output_path = f"s3://{bucket}/{bias_prefix}/clarify-explainability"
bias_report_output_path = f"s3://{bucket}/{bias_prefix}/clarify-bias"

responsibleai


### Data Preparation & Feature Engineering

In [6]:
# Read the churn data set and discard every row that has a missing value.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like a
# saved index being read back as a regular column — confirm whether it is intended.
training_data = pd.read_csv(filepath_or_buffer="../data/churn.csv").dropna(axis=0, how="any")
training_data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
def number_encode_features(df):
    """Label-encode every string/object column of ``df`` as integers.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified; a copy is encoded.

    Returns:
        A tuple ``(result, encoders)`` where ``result`` is the encoded copy of
        ``df`` and ``encoders`` maps each encoded column name to the fitted
        ``LabelEncoder`` used for it. Columns that are neither object nor string
        dtype are left unchanged and have no entry in ``encoders``.
    """
    result = df.copy()
    encoders = {}

    for column in result.columns:
        values = result[column]
        # `np.object` was removed in NumPy 1.24, so use pandas' dtype predicates.
        # `is_string_dtype` additionally covers pandas' dedicated string dtype.
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            encoder = preprocessing.LabelEncoder()
            # Missing values become the literal category "None" before encoding.
            result[column] = encoder.fit_transform(values.fillna("None"))
            encoders[column] = encoder
    return result, encoders

In [None]:
# Encode categorical columns once, on the full data set, so train and test share
# the same integer codes. Fitting separate encoders per split can assign different
# codes to the same category whenever the two splits contain different value sets.
churn_encoded, _ = number_encode_features(training_data)

# A fixed seed keeps the train/test split reproducible across kernel restarts.
churn_train, churn_test = train_test_split(churn_encoded, test_size=0.2, random_state=42)

# Training file: label ("Exited") moved to the first column, written without a
# header row (the CSV layout the SageMaker built-in XGBoost algorithm expects).
churn_train = pd.concat([churn_train["Exited"], churn_train.drop(["Exited"], axis=1)], axis=1)
churn_train.to_csv("../data/train_churn.csv", index=False, header=False)

# Test file: features only; the label is kept separately in `churn_target`.
churn_features = churn_test.drop(["Exited"], axis=1)
churn_target = churn_test["Exited"]
churn_features.to_csv("../data/test_churn.csv", index=False, header=False)

# Upload both files under the data prefix. The original referenced an undefined
# name `prefix`; `input_data_path` is the s3://bucket/data_prefix location.
train_uri = S3Uploader.upload("../data/train_churn.csv", input_data_path)
train_input = TrainingInput(train_uri, content_type="csv")
test_uri = S3Uploader.upload("../data/test_churn.csv", input_data_path)

### Model Training

In [None]:
# Look up the XGBoost 1.2-1 container image URI for the session's region
container = retrieve(framework="xgboost", region=region, version="1.2-1")

# Single-instance estimator running that container, with the profiler disabled
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    disable_profiler=True,
    sagemaker_session=session,
)

# Hyperparameters for a binary classifier
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective="binary:logistic",
    num_round=800,
)

# Train on the uploaded training channel without streaming job logs
xgb.fit(inputs={"train": train_input}, logs=False)

In [None]:
# Register the trained estimator as a named SageMaker model so that the Clarify
# configuration can refer to it by `model_name`.
model_name = "churn-clarify-model"
model = xgb.create_model(name=model_name)
container_def = model.prepare_container_def()
session.create_model(name=model_name, role=role, container_defs=container_def)

### Data Bias

In [None]:
# Clarify processor shared by the bias and explainability jobs
clarify_processor = clarify.SageMakerClarifyProcessor(
    sagemaker_session=session,
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

In [None]:
# Data set description for the bias job: the uploaded training CSV, with column
# names taken from `churn_train` and "Exited" as the label column.
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    dataset_type="text/csv",
    headers=list(churn_train.columns),
    label="Exited",
)

# Model to query during the job; both requests and responses are CSV.
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    content_type="text/csv",
    accept_type="text/csv",
)

# Probability threshold handed to Clarify for the model's predicted label
predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.8)

# Bias is measured on the "Gender" facet for label value 1.
# NOTE(review): 0 is the label-encoded Gender value — presumably "Female", since
# LabelEncoder orders classes alphabetically; confirm against the encoder.
bias_config = clarify.BiasConfig(
    facet_name="Gender",
    facet_values_or_threshold=[0],
    label_values_or_threshold=[1],
)


In [None]:
# Launch the Clarify bias job with every pre- and post-training method enabled
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    post_training_methods="all",
    pre_training_methods="all",
)

### Model Explainability

In [None]:
# SHAP settings: the first row of the test features is the single baseline record,
# and per-record (local) SHAP values are saved alongside the aggregate.
shap_config = clarify.SHAPConfig(
    baseline=[churn_features.iloc[0].values.tolist()],
    agg_method="mean_abs",
    num_samples=15,
    save_local_shap_values=True,
)

# Same training CSV as the bias job, but results go to the explainability path.
explainability_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=explainability_output_path,
    dataset_type="text/csv",
    headers=list(churn_train.columns),
    label="Exited",
)

In [None]:
# Launch the Clarify explainability job using the SHAP configuration
clarify_processor.run_explainability(
    explainability_config=shap_config,
    model_config=model_config,
    data_config=explainability_data_config,
)