## Load libraries and set up directory paths and iterables

In [None]:
import os

from modules.explanations import aggregate_shap_batches, generate_shap_explanations
from modules.utils import load_dataset, load_object, save_object

In [None]:
# Locations of the trained models, the processed training sets, and the SHAP outputs
models_dir = '../models'
datasets_dir = '../data/processed/train'
explanations_dir = '../shap-explanations'

# Model-type identifiers used in the model and explanation file names
model_types = ['logreg', 'dt', 'rf', 'xgb', 'svc']

# One entry per item found in the models directory, in sorted order
dataset_names = os.listdir(models_dir)
dataset_names.sort()

## Compute global SHAP values

### Full SHAP calculations

In [None]:
# NOTE: Evaluating SHAP values takes a long time for certain datasets - trained individually on Google Colab
# Walk every dataset and produce one global explanation per model type
for name in dataset_names:
    print(f'Computing SHAP explanations for {name}')

    # File names use underscores where the dataset directory name uses hyphens
    file_stem = name.replace('-', '_')

    # Make sure the per-dataset output directory is there before anything is saved into it
    explanations_subdir = os.path.join(explanations_dir, name)
    if not os.path.exists(explanations_subdir):
        print(f'Creating directory for SHAP explanations at {explanations_subdir}')
        os.makedirs(explanations_subdir)

    # Read the training split that belongs to this dataset
    training_set_path = os.path.join(datasets_dir, file_stem + '-train.csv.gz')
    print(f'Loading dataset at {training_set_path}')
    training_set = load_dataset(training_set_path)

    # One saved explanation per model type
    for type_ in model_types:
        model_path = os.path.join(models_dir, name, file_stem + f'-{type_}-full.pickle')
        print(f'Loading model at {model_path}')
        model = load_object(model_path)

        # Hand the model's predict function and the training set to the SHAP helper,
        # then store the result next to the other explanations of this dataset
        shap_explanation = generate_shap_explanations(
            model_eval=model.predict,
            data=training_set,
            random_state=42,
        )
        explanation_path = os.path.join(explanations_subdir, file_stem + f"-{type_}-sampling-global")
        save_object(shap_explanation, explanation_path, compression='lzma')

### Batch SHAP version

Datasets and models explained using batching:
- Credit Card Fraud - Random Forest and XGBoost
- Patient Survival - Random Forest and XGBoost
- Android Permissions - Random Forest
- Phishing URL - Random Forest and XGBoost
- Secondary Mushroom - Random Forest and XGBoost

In [None]:
# NOTE: Evaluating SHAP values takes a long time for certain datasets - trained individually on Google Colab

# Number of training records explained per batch.
# NOTE(review): this cell previously read `batches` without ever assigning it (NameError on a
# fresh kernel). The value below is a placeholder - when resuming earlier runs, set it to the
# batch size those runs used so that the batch indices in the file names still line up.
batch_size = 1000

# Ability to resume batches: index of the first batch that still has to be computed.
# The offset is applied inside the loop instead of re-slicing a global list, so re-running
# this cell does not silently drop further batches.
old_batch_stop = 0

# For each dataset, compute explanations
for name in dataset_names:
    print(f'Computing SHAP explanations for {name}')
    explanations_subdir = os.path.join(explanations_dir, name)

    # File names use underscores where the dataset directory name uses hyphens
    file_stem = name.replace('-', '_')

    # If the destination directory for explanations does not exist, create it
    if not os.path.exists(explanations_subdir):
        print(f'Creating directory for SHAP explanations at {explanations_subdir}')
        os.makedirs(explanations_subdir)

    # Load the training set for the dataset
    training_set_path = os.path.join(datasets_dir, file_stem + '-train.csv.gz')
    print(f'Loading dataset at {training_set_path}')
    training_set = load_dataset(training_set_path)

    # Start index of every batch, derived from this dataset's own length
    # (an empty training set yields no batches and is skipped)
    n_records = len(training_set)
    batches = list(range(0, n_records, batch_size))

    # Loop through the different model types
    for type_ in model_types:
        model_path = os.path.join(models_dir, name, file_stem + f'-{type_}-full.pickle.xz')
        print(f'Loading model at {model_path}')
        model = load_object(model_path)

        # Batch i keeps its absolute index in the file name, also when resuming
        for i in range(old_batch_stop, len(batches)):
            start = batches[i]
            is_last = i == len(batches) - 1

            # The last batch is open-ended so it always reaches the end of the training set
            stop = None if is_last else batches[i + 1]
            print(f'Computing SHAP values for records {start} - {n_records if is_last else stop}')

            explanation_path = os.path.join(
                explanations_subdir, file_stem + f"-{type_}-sampling-global-batch{i}"
            )
            shap_explanation = generate_shap_explanations(
                model_eval=model.predict,
                data=training_set,
                random_state=42,
                batch=slice(start, stop),
            )
            save_object(shap_explanation, explanation_path, compression='lzma')

## Aggregate Batches

In [None]:
# Aggregating SHAP explanation batches for all datasets and model types
for name in dataset_names:
    # Directory holding this dataset's SHAP batch files
    batch_dir = os.path.join(explanations_dir, name)

    # Iterate over each model type to aggregate its SHAP batches
    for model_type in model_types:
        # Log the aggregation attempt for traceability
        print(f'Attempted to aggregate SHAP batches for {name} with model {model_type}\n')

        try:
            # Aggregate all SHAP batch files for the given dataset and model type
            aggregate_shap_batches(batch_dir, model_type)
        except Exception as error:
            # Best effort: keep going with the next combination, but report the real cause.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt/SystemExit
            # through, so the run can still be interrupted.
            print(f'Could not aggregate SHAP batches for {name} with model {model_type}: {error!r}\n')
        else:
            # Confirm successful aggregation
            print(f'\nSuccessfully aggregated SHAP batches for {name} with model {model_type}\n')