In [None]:
# Import libraries
import os
from modules.utils import load_object, load_dataset, save_object
from modules.explanations import sample_shap_explanations

In [None]:
# Directory paths (relative to the notebook's working directory)
models_dir = '../models'
datasets_dir = '../data/processed/train'
explanations_dir = '../shap-explanations'

# Iterables
# Every dataset is expected to have its own subdirectory under models_dir.
# os.listdir returns entries in arbitrary order and also lists stray files and
# hidden directories (e.g. .DS_Store, .ipynb_checkpoints), so keep only the
# visible directories and sort them for a reproducible processing order.
dataset_names = sorted(
    entry for entry in os.listdir(models_dir)
    if not entry.startswith('.') and os.path.isdir(os.path.join(models_dir, entry))
)
# Model type identifiers, as they appear in the pickled model file names
model_types = ['logreg', 'dt', 'rf', 'xgb', 'svc']
# Sampling strategies forwarded to sample_shap_explanations; the numeric entries
# are presumably sampling fractions -- TODO confirm against modules.explanations
strategies = ['global', 'sqrt', 'log', 0.1, 0.05]
# Datasets considered large enough to compare every sampling strategy
big_datasets = ['kaggle-credit-card-fraud', 'kaggle-patient-survival',
                'uci-android-permissions', 'uci-phishing-url', 'uci-secondary-mushroom']

In [None]:
# For each dataset, compute SHAP explanations for every model type and save them
# under <explanations_dir>/<dataset name>/.
for name in dataset_names:
    print(f'Computing SHAP explanations for {name}')
    explanations_subdir = os.path.join(explanations_dir, name)

    # File names use underscores where the directory names use hyphens
    file_prefix = name.replace('-', '_')

    # If the destination directory for explanations does not exist, create it.
    # exist_ok=True keeps this safe if the directory appears between the check
    # and the creation (e.g. a concurrent run of this notebook).
    if not os.path.exists(explanations_subdir):
        print(f'Creating directory for SHAP explanations at {explanations_subdir}')
        os.makedirs(explanations_subdir, exist_ok=True)

    # Load the training set for the dataset
    training_set_path = os.path.join(datasets_dir, f'{file_prefix}-train.csv.gz')
    print(f'Loading dataset at {training_set_path}')
    training_set = load_dataset(training_set_path)

    # The dataset size class does not change across model types
    is_big_dataset = name in big_datasets

    # Loop through the different model types
    for type_ in model_types:
        model_path = os.path.join(models_dir, name, f'{file_prefix}-{type_}-full.pickle')
        print(f'Loading model at {model_path}')
        model = load_object(model_path)

        # Build the list of (file-name label, extra keyword arguments) to run.
        if is_big_dataset:
            # If the dataset is large, we want to test different sampling strategies
            print(f'{name} is a big dataset, testing different subsampling strategies')
            jobs = [
                (strategy, {'random_state': 42, 'strategy': strategy, 'k': 5})
                for strategy in strategies
            ]
        else:
            # Otherwise, only compute the global SHAP explanations. No sampling
            # arguments are passed, so this relies on the defaults of
            # sample_shap_explanations -- presumably the global strategy; confirm.
            jobs = [('global', {})]

        for label, sampling_kwargs in jobs:
            explanation_path = os.path.join(
                explanations_subdir, f'{file_prefix}-{type_}-sampling-{label}'
            )
            shap_explanation = sample_shap_explanations(
                model=model,
                data=training_set,
                verbose=True,
                **sampling_kwargs,
            )
            save_object(shap_explanation, explanation_path, compression='lzma')