# Fairness and Explainability with SageMaker Clarify

This notebook takes approximately 30 minutes to run.


## 개요

- [Initialize SageMaker](#initialize-sagemaker)
- [Download data](#download-data)
- [Data Preprocessing](#data-preprocessing)
- [Train XGBoost Model](#train-xgboost-model)
- [Clarify 정의해서 Fairness와 Explainability 분석](#clarify-정의해서-fairness와-explainability-분석)

## Reference

- https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-clarify/fairness_and_explainability/fairness_and_explainability_outputs.html


### Initialize SageMaker


In [None]:
import sys
# Reinstall the SageMaker SDK at a pinned version (2.123.0) and upgrade pip and
# boto3. Every command goes through `sys.executable -m pip`, i.e. the pip that
# belongs to the interpreter running this kernel.
# NOTE(review): restarting the kernel after this cell is presumably required
# for the reinstalled packages to be picked up — confirm.
!{sys.executable} -m pip uninstall -y sagemaker
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install --upgrade boto3 --no-cache-dir
!{sys.executable} -m pip install --upgrade sagemaker==2.123.0 --no-cache-dir

In [None]:
from sagemaker import Session
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import os
import boto3
from datetime import datetime


# A single SageMaker session supplies the region and the default S3 bucket;
# every artifact of this notebook is stored under `default_prefix` in it.
session = Session()
region = session.boto_region_name
default_bucket = session.default_bucket()
default_prefix = "sagemaker/fairness_bias_clarify"

# Plain S3 client (used to download the dataset) and the notebook's IAM role.
s3_client = boto3.client("s3")
role = get_execution_role()

In [3]:
# Show the session object, region and execution role, one per line.
print(session, region, role, sep="\n")

<sagemaker.session.Session object at 0x7f04bb91d3c0>
us-east-2
arn:aws:iam::767397847434:role/service-role/AmazonSageMaker-ExecutionRole-20240111T201222


### Download data

Data Source: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/


In [None]:
# Column names for the Adult dataset files, in file order. They are passed as
# `names=` when the CSVs are read; "Target" is the column later used as the
# label for training and for the Clarify analyses.
adult_columns = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Target",
]
# Download the Adult train/test files from the regional SageMaker example
# bucket, skipping any file that is already present in the working directory.
# Both files share the same bucket and key prefix, so they are handled by one
# loop instead of two copy-pasted stanzas.
example_bucket = f"sagemaker-example-files-prod-{session.boto_region_name}"
for filename in ("adult.data", "adult.test"):
    if os.path.isfile(filename):
        print(f"{filename} already on disk.")
        continue
    s3_client.download_file(
        example_bucket,
        f"datasets/tabular/uci_adult/{filename}",
        filename,
    )
    print(f"{filename} saved!")

In [5]:
import os
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run


# Session used for the SageMaker Experiments runs, plus the execution role.
sagemaker_session = Session()
role = get_execution_role()

# Timestamp the experiment name (day-month-year-hour-minute-second) so that
# each execution of the notebook gets its own experiment.
run_timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
experiment_name = "fairness-bias-clarify-{}".format(run_timestamp)

In [6]:
# Parser options shared by both files: the separator regex swallows the
# whitespace around each comma, "?" is read as a missing value, and the column
# names come from `adult_columns` because the files are read without a header.
csv_options = dict(
    names=adult_columns,
    sep=r"\s*,\s*",
    engine="python",
    na_values="?",
)

# Rows containing any missing value are dropped from both splits.
training_data = pd.read_csv("adult.data", **csv_options).dropna()

# The first line of adult.test is skipped (skiprows=1).
testing_data = pd.read_csv("adult.test", skiprows=1, **csv_options).dropna()

training_data.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Data Preprocessing

Here we encode the training and test data.


In [7]:
# Data Preprocessing
# This code prepares the data rather than demonstrating SageMaker usage, so it is taken from the example as-is without further explanation.
from sklearn import preprocessing


def number_encode_features(df):
    """Label-encode every object-dtype column of ``df``.

    The input frame is left untouched; the work is done on a copy. In each
    object-dtype column, missing values are replaced by the string "None"
    and the column is then replaced by the integer codes produced by a
    freshly fitted ``LabelEncoder``.

    Returns:
        A tuple ``(encoded, fitted)`` where ``encoded`` is the transformed
        copy and ``fitted`` maps each encoded column name to its encoder.
    """
    encoded = df.copy()
    fitted = {}
    for column in encoded.columns:
        # Columns of any other dtype are passed through unchanged.
        if encoded.dtypes[column] != object:
            continue
        encoder = preprocessing.LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column].fillna("None"))
        fitted[column] = encoder
    return encoded, fitted


# Move the label to the first column, label-encode the categorical columns and
# write the training set without header or index.
training_data = pd.concat(
    [training_data["Target"], training_data.drop(["Target"], axis=1)], axis=1)
training_data, encoders = number_encode_features(training_data)
training_data.to_csv("train_data.csv", index=False, header=False)

# Encode the test split with the encoders fitted on the TRAINING split.
# Fitting fresh encoders on the test split would assign integer codes based on
# whichever categories happen to appear there, so the same category could get a
# different code than in the data the model is trained on (the first test row
# is later used as the SHAP baseline).
testing_data = testing_data.copy()
# NOTE(review): labels in the UCI adult.test file are expected to carry a
# trailing "." (e.g. "<=50K."); it is stripped so they match the training
# labels — confirm against the downloaded file. The dtype guard keeps the cell
# re-runnable once the column has already been encoded to integers.
if testing_data["Target"].dtype == object:
    testing_data["Target"] = testing_data["Target"].str.rstrip(".")
for column, encoder in encoders.items():
    # LabelEncoder.transform raises ValueError for a category that was never
    # seen in training, which surfaces a train/test mismatch instead of hiding it.
    testing_data[column] = encoder.transform(testing_data[column].fillna("None"))

# Features and label are split; only the features are written to disk.
test_features = testing_data.drop(["Target"], axis=1)
test_target = testing_data["Target"]
test_features.to_csv("test_features.csv", index=False, header=False)

In [8]:
from sagemaker.s3 import S3Uploader
from sagemaker.inputs import TrainingInput

# Both CSVs are uploaded under the same bucket/prefix.
s3_prefix_uri = "s3://{}/{}".format(default_bucket, default_prefix)

# Training data: keep the returned S3 URI and wrap it as a CSV training input.
train_uri = S3Uploader.upload("train_data.csv", s3_prefix_uri)
train_input = TrainingInput(train_uri, content_type="text/csv")

# Test features: only the S3 URI is kept.
test_uri = S3Uploader.upload("test_features.csv", s3_prefix_uri)

### Train XGBoost Model


In [10]:
# Build an XGBoost model with the generic Estimator class

from sagemaker.image_uris import retrieve
from sagemaker.estimator import Estimator

# Look up the container image URI of the "xgboost" framework, version 1.2-1,
# for `region`. No estimator is created in this cell.
container = retrieve("xgboost", region, version="1.2-1")
container  # displays the URI, e.g. '257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1' — the XGBoost SageMaker image is fetched from ECR

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


'257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1'

In [None]:
# Container image of the XGBoost 1.2-1 algorithm for the current region.
container = retrieve("xgboost", region, version="1.2-1")

# Generic Estimator around that image: one ml.m5.xlarge instance, with the
# profiler disabled.
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    sagemaker_session=session,
    disable_profiler=True,
)

# Hyperparameters for a binary classifier with logistic objective.
xgb_hyperparameters = {
    "objective": "binary:logistic",
    "num_round": 800,
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Train inside an experiment run that contains only the model training.
with Run(
    experiment_name=experiment_name,
    run_name="fairness-bias-clarify",
    sagemaker_session=sagemaker_session,
) as run:
    xgb.fit({"train": train_input}, logs=False)

In [None]:
# The Clarify fairness and explainability analyses below refer to the trained
# model by name, so it is registered as a SageMaker model once training is done.
model_timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
model_name = "clarify-bias-model-{}".format(model_timestamp)

model = xgb.create_model(name=model_name)
container_def = model.prepare_container_def()
session.create_model(model_name, role, container_def)

### Clarify 정의해서 Fairness와 Explainability 분석


In [15]:
# Define the Clarify processor used for the fairness and explainability analyses
from sagemaker import clarify

# Processor that both the bias and the explainability analyses are run with;
# it is configured with a single ml.m5.xlarge instance and the same role and
# session as the training job.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=session
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [16]:
# S3 location where the bias report is stored.
bias_report_output_path = "s3://{}/{}/clarify-bias".format(default_bucket, default_prefix)

# The uploaded training CSV has no header row, so the column names are taken
# from the in-memory frame it was written from.
train_headers = training_data.columns.to_list()

# DataConfig: defines which data is used to assess fairness.
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    headers=train_headers,
    label="Target",
    dataset_type="text/csv",
)

In [18]:
# ModelConfig: defines which model's fairness is assessed.
# The model is referenced by the name it was registered under; requests and
# responses are exchanged as CSV.
# NOTE(review): instance_type / instance_count presumably size the temporary
# inference endpoint Clarify creates for the analysis — confirm in the
# ModelConfig documentation.
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    accept_type="text/csv",
    content_type="text/csv",
)

In [19]:
# Defines the probability value used as the cut-off for treating a prediction
# as 0 or 1 (here 0.75).
predictions_config = clarify.ModelPredictedLabelConfig(
    probability_threshold=0.75)

In [20]:
# BiasConfig: bias can be assessed as pre-training bias (judged from the data
# alone, before training) and as post-training bias (measured from the trained
# model's inference results). This config names the columns and values those
# metrics are computed on: label value 1, facet column "Sex" with facet value 0,
# and "Age" as the group column.
# NOTE(review): 0 and 1 are the integer codes produced by the label encoding in
# the preprocessing step; which category each code stands for should be
# confirmed from the fitted encoders.

bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1],
    facet_name="Sex",
    facet_values_or_threshold=[0],
    group_name="Age"
)

In [None]:
# Compute the bias metrics for the model created above.
with Run(
    experiment_name=experiment_name,
    run_name="bias-only",  # create an experiment run with only the bias analysis on it
    sagemaker_session=sagemaker_session,
) as run:
    # "all" requests every available pre-training and post-training method.
    clarify_processor.run_bias(
        data_config=bias_data_config,
        bias_config=bias_config,
        model_config=model_config,
        model_predicted_label_config=predictions_config,
        pre_training_methods="all",
        post_training_methods="all",
    )

In [22]:
# Explainability
# S3 location where the explainability report is stored.
explainability_output_path = "s3://{}/{}/clarify-explainability".format(default_bucket, default_prefix)

# Same training CSV as the bias analysis, with its own output location.
explainability_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=explainability_output_path,
    headers=training_data.columns.to_list(),
    label="Target",
    dataset_type="text/csv",
)

# The first row of the encoded test features serves as the SHAP baseline.
shap_baseline = [test_features.iloc[0].values.tolist()]
shap_config = clarify.SHAPConfig(
    baseline=shap_baseline,
    num_samples=15,
    agg_method="mean_abs",
    save_local_shap_values=True,
)

In [None]:
# Run the SHAP explainability analysis for the registered model.
with Run(
    experiment_name=experiment_name,
    # create an experiment run with only the model explainability on it
    # NOTE(review): the run name below is spelled "explainabilit-only"; it is a
    # runtime identifier, so it is left unchanged here.
    run_name="explainabilit-only",
    sagemaker_session=sagemaker_session,
) as run:
    clarify_processor.run_explainability(
        data_config=explainability_data_config,
        model_config=model_config,
        explainability_config=shap_config,
    )