# Welcome to Bellatrex

After making sure that the needed packages are installed, we can dive into the `tutorial.py` code.

## Step 1: import libraries and set parameters

Import the required libraries and set the parameters for the grid search, the data folder paths, and the other configuration variables.

In [96]:
import numpy as np
import os
os.environ["OMP_NUM_THREADS"] = "1" # avoids memory leak UserWarning caused by KMeans
import pandas as pd

# Upper bound on the number of test instances that are explained further on.
# Lower it (e.g. to 10) for a quick trial run on only a few instances.
MAX_TEST_SIZE = 999

# Hyperparameter grid handed to Bellatrex through its `p_grid` argument.
p_grid = dict(
    n_trees=[0.6, 0.8, 1.0],
    n_dims=[2, 5, None],
    n_clusters=[1, 2, 3],
)

# ------------------------------------------------------------------------
# The example data is looked up in ./example-data, relative to the current
# working directory.
root_folder = os.getcwd()
data_folder = os.path.join(root_folder, "example-data")

# Learning task, chosen through the SETUP parameter: e.g. "bin", "surv" or
# "mtr". It selects the CSV files that are loaded and the forest that is
# trained; the accepted spellings are listed in the key lists below.
SETUP = "mtr"

# Levels of verbosity in this script:
#   - >= 1.0: print best params, their achieved fidelity,
#             and the scoring method used to compute such performance
#   - >= 2.0: print final tree idx cluster sizes
#             and store txt files with the extracted rule-paths
#   - >= 3.0: plot representation of the extracted trees (two plots)
#   - >= 4.0: plot trees with GUI (if PLOT_GUI == True)
#   - >= 4.0: plot trees without GUI (if PLOT_GUI == False)
#   - >= 5.0: print params and performance during GridSearch
VERBOSE = 2
PLOT_GUI = False

# A different forest and a different performance measure are used for each
# prediction scenario. The five scenarios implemented so far, with the
# spellings of SETUP that select each of them:
binary_key_list = [
    "bin",
    "binary",
]
survival_key_list = [
    "surv",
    "survival",
]
multi_label_key_list = [
    "multi",
    "multi-l",
    "multi-label",
    "mtc",
]
regression_key_list = [
    "regression",
    "regress",
    "regr",
]
mt_regression_key_list = [
    "multi-target",
    "multi-t",
    "mtr",
]

## Step 2: Load and preprocess Data

Load training and testing data from the `.csv` files, split them into features (X) and targets (y), and preprocess the data by formatting the target variables according to the prediction scenarios. Instantiate the appropriate `RandomForest` model.

In [97]:
from code_scripts.utilities import output_X_y

# Load the train and test CSV files that belong to the chosen learning task.
df_train = pd.read_csv(os.path.join(data_folder, SETUP + '_tutorial_train.csv'))
df_test = pd.read_csv(os.path.join(data_folder, SETUP + '_tutorial_test.csv'))

# Split every frame into features (X) and targets (y).
X_train, y_train = output_X_y(df_train, SETUP)
X_test, y_test = output_X_y(df_test, SETUP)

# Remove the "Unnamed: 0" column when present (presumably a stray index
# column written along with the CSV); errors="ignore" makes this a no-op
# when the column does not exist.
X_train = X_train.drop(columns="Unnamed: 0", errors="ignore")
X_test = X_test.drop(columns="Unnamed: 0", errors="ignore")

# Make sure there are no null values. An explicit check is used instead of
# `assert`, because assertions are skipped when Python runs with -O.
for split_name, features in (("train", X_train), ("test", X_test)):
    n_missing = int(features.isnull().sum().sum())
    if n_missing > 0:
        raise ValueError(
            f"X_{split_name} contains {n_missing} missing value(s); "
            "impute or drop them before fitting"
        )

# For quick testing, set a small MAX_TEST_SIZE.
X_test = X_test[:MAX_TEST_SIZE]
y_test = y_test[:MAX_TEST_SIZE]

# Number of target columns, meaningful only in multi-output tasks.
# A 1-D target has no second axis, so fall back to a single label there
# instead of failing with IndexError.
orig_n_labels = y_test.shape[1] if len(y_test.shape) > 1 else 1

In [98]:
from code_scripts.utilities import format_targets

from sksurv.ensemble import RandomSurvivalForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

# Bring the targets into the format expected for the chosen task.
y_train, y_test = format_targets(y_train, y_test, SETUP, VERBOSE)

### instantiate original R(S)F estimator
# SETUP is matched case-insensitively against the key lists.
setup_key = SETUP.lower()

if setup_key in survival_key_list:
    clf = RandomSurvivalForest(n_estimators=100, min_samples_split=10,
                               random_state=0)

elif setup_key in binary_key_list + multi_label_key_list:
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=5,
                                 random_state=0)

elif setup_key in regression_key_list + mt_regression_key_list:
    clf = RandomForestRegressor(n_estimators=100, min_samples_split=5,
                                random_state=0)

else:
    # Without this branch an unknown SETUP would leave `clf` undefined and
    # surface later as an unrelated NameError at `clf.fit`.
    accepted_keys = (survival_key_list + binary_key_list + multi_label_key_list
                     + regression_key_list + mt_regression_key_list)
    raise ValueError(
        f"Unknown SETUP {SETUP!r}; expected one of {accepted_keys}"
    )

clf.fit(X_train, y_train)


orig. n* labels: 3
new n* labels: 3


## Step 3: Instantiate and fit the Model

Once the Random Forest is instantiated, the `fit` method in Bellatrex trains the Random Forest and sets the parameters for Bellatrex.


In [99]:
from code_scripts.LocalMethod_class import Bellatrex

# Options of the explainer, gathered in one place. `p_grid` is the
# hyperparameter grid defined at the top of the tutorial.
bellatrex_options = {
    "p_grid": p_grid,
    "proj_method": "PCA",
    "dissim_method": "rules",
    "feature_represent": "weighted",
    "n_jobs": 1,
    "verbose": 1,
    "colormap": 'RdYlBu_r',
    "plot_GUI": PLOT_GUI,
}

# Wrap the forest `clf` and call .fit on the training data; the fitted
# explainer is kept in Bellatrex_fitted for the following steps.
Bellatrex_fitted = Bellatrex(clf, SETUP, **bellatrex_options).fit(X_train, y_train)


Model is already fitted, building explanation.


Store the predictions on the training data: they are useful for plotting the output distribution and for comparing the test data predictions against it.
NOTE that for this example we are storing the .npy array with a generic name of the form SETUP + '_tutorial_y_train_preds.npy'.
You might want to name it according to the dataset and fold under consideration.

In [100]:
# When the notebook is run top to bottom this is the first use of
# predict_helper, so it is imported here; otherwise the call below fails
# with NameError.
from code_scripts.utilities import predict_helper

# Predictions of the underlying forest on the training data.
y_train_pred = predict_helper(Bellatrex_fitted.clf, X_train)

# Stored next to the example data under a generic, SETUP-based file name.
np.save(os.path.join(data_folder, SETUP + '_tutorial_y_train_preds.npy'), y_train_pred)

(1037, 3)



## Step 4: Make predictions, output explanations

Loop through the test set, make predictions using the Bellatrex local method, and store the results.

In [38]:
from code_scripts.utilities import concatenate_helper, predict_helper, score_method

# Number of test instances that are explained below.
N = min(X_test.shape[0], MAX_TEST_SIZE)

# Predictions of the original R(S)F on the test set, kept for comparison with
# the ones from BELLATREX. The predict_helper and concatenate_helper functions
# are used to harmonise the output format.
y_ens_pred = predict_helper(clf, X_test)

# Final Bellatrex predictions are accumulated here, one test sample at a time.
# y_pred will be a (n_samples, n_outputs_) ndarray, or (n_samples,) for
# single output.
y_pred = np.empty((0, 0))

for i in range(N):
    # For every sample in the test set, call the .explain method. The
    # hyperparameters were given in the .fit and are tuned for every
    # instance inside .explain, which outputs:
    #   - the local prediction
    #   - information about the Bellatrex instance: optimal parameters,
    #     final extracted trees/rules, their weight in the prediction, etc.
    #
    # FILE_OUT is the third argument of .explain. It is left to None here;
    # presumably a .txt path can be given instead to store the explanation
    # of sample i (check Bellatrex.explain before relying on this).
    FILE_OUT = None

    y_local_pred, sample_info = Bellatrex_fitted.explain(X_test, i, FILE_OUT)

    # Append the prediction for this sample to y_pred.
    y_pred = concatenate_helper(y_pred, y_local_pred, axis=0)


best params: {'n_clusters': 3, 'n_dims': 2, 'n_trees': 100}
Achieved fidelity: 0.9253
(Tuned according to L2)
Bellatrex prediction: 0.470
Black box prediction: 0.545
######################################################
best params: {'n_clusters': 1, 'n_dims': 2, 'n_trees': 60}
Achieved fidelity: 0.8660
(Tuned according to L2)
Bellatrex prediction: 0.000
Black box prediction: 0.134
######################################################
best params: {'n_clusters': 1, 'n_dims': 2, 'n_trees': 60}
Achieved fidelity: 0.9925
(Tuned according to L2)
Bellatrex prediction: 0.000
Black box prediction: 0.007
######################################################
best params: {'n_clusters': 2, 'n_dims': 2, 'n_trees': 80}
Achieved fidelity: 0.9968
(Tuned according to L2)
Bellatrex prediction: 0.081
Black box prediction: 0.084
######################################################
best params: {'n_clusters': 1, 'n_dims': 2, 'n_trees': 60}
Achieved fidelity: 1.0000
(Tuned according to L2)
Bellatrex 

Predictions from Bellatrex and the corresponding black-box model are stored. Here we compare their performance on (a small subset of) the test set.

In [32]:
# Restrict the ground truth and the black-box predictions to the first N
# test samples, i.e. the ones for which y_pred holds a Bellatrex prediction.
y_test, y_ens_pred = y_test[:N], y_ens_pred[:N]

# Scores of the Bellatrex predictions, one line per metric.
bellatrex_scores = score_method(y_test, y_pred, SETUP)
for key, value in bellatrex_scores.items():
    print(f'Bellatrex performance: {key}: {value:.4f}')

# Scores of the original black-box forest on the same samples.
black_box_scores = score_method(y_test, y_ens_pred, SETUP)
for key, value in black_box_scores.items():
    print(f'Black-box performance: {key}: {value:.4f}')

Bellatrex performance: AUROC: 0.6328
Black-box performance: AUROC: 0.7344


In [33]:
# Show the black-box predictions first, then the Bellatrex ones, each on its
# own line(s) for a side-by-side look.
print(y_ens_pred, y_pred, sep="\n")

[0.5446627  0.13403571 0.0075     0.08445362 0.         0.16896032
 0.18791667 0.08445362 0.         0.08733333 0.044      0.20542857
 0.03683333 0.04052381 0.20207143 0.27615526 0.11330952 0.11646429
 0.22027778 0.03166667]
[0.47       0.         0.         0.08125    0.         0.17022536
 0.125      0.08125    0.         0.16666667 0.04833333 0.09583333
 0.         0.         0.16666667 0.27166667 0.         0.11666667
 0.20625    0.        ]
