# Welcome to Bellatrex

After making sure that the needed packages are installed, we can dive into the `tutorial.py` code.

## Step 1: import libraries and set parameters

Import the required libraries and set the parameters for the grid search, the data folder paths, and the other configuration variables.

In [None]:
import numpy as np
import os
os.environ["OMP_NUM_THREADS"] = "1" # avoids memory leak UserWarning caused by KMeans
import pandas as pd

from sksurv.ensemble import RandomSurvivalForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from utilities import score_method, output_X_y
from utilities import format_targets, format_RF_preds
#from plot_tree_patch import plot_tree_patched

from LocalMethod_class import Bellatrex

# Upper bound on the number of test rows that get explained; keep it small
# for a quick run. The test set is later cut with a [:MAX_TEST_SIZE] slice,
# so any value at or above the number of available rows keeps them all.
MAX_TEST_SIZE = 10

# Hyperparameter grid handed to Bellatrex through its `p_grid` argument.
p_grid = dict(
    n_trees=[0.2, 0.5, 0.8],
    n_dims=[2, 5, None],
    n_clusters=[1, 2, 3],
)

##########################################################################
# The tutorial datasets are looked up in a "datasets" directory located
# under the current working directory.
root_folder = os.getcwd()
data_folder = os.path.join(root_folder, "datasets")

# Choose the learning task with SETUP, e.g. "bin" or "mtr". The value is
# matched (case-insensitively) against the key lists defined further down.
SETUP = "mtr"

# Levels of verbosity in this script:
#   - >= 1.0: print best params, their achieved fidelity,
#             and the scoring method used to compute such performance
#   - >= 2.0: print final tree idx cluster sizes
#   - >= 3.0: plot representation of the extracted trees (two plots)
#   - >= 4.0: plot trees with GUI (if PLOT_GUI == True)
#   - >= 4.0: plot trees without GUI (if PLOT_GUI == False)
#   - >= 5.0: print params and performance during GridSearch
VERBOSE = 3
PLOT_GUI = False

# Different random forests (and performance measures) are used depending on
# the prediction scenario. Five scenarios are covered so far; each list
# holds the SETUP spellings accepted for one of them.
binary_key_list = ["bin", "binary"]
survival_key_list = ["surv", "survival"]
multi_label_key_list = ["multi", "multi-l", "multi-label", "mtc"]
regression_key_list = ["regression", "regress", "regr"]
mt_regression_key_list = ["multi-target", "multi-t", "mtr"]

## Step 2: Load and preprocess Data

Load training and testing data from the `.csv` files, split them into features (X) and targets (y), and preprocess the data by formatting the target variables according to the prediction scenarios. Instantiate the appropriate `RandomForest` model.

In [None]:
# Read the tutorial train/test tables for the chosen SETUP from data_folder.
df_train = pd.read_csv(os.path.join(data_folder, SETUP + '_tutorial_train.csv'))
df_test = pd.read_csv(os.path.join(data_folder, SETUP + '_tutorial_test.csv'))

# Split each table into covariates (X) and targets (y).
X_train, y_train = output_X_y(df_train, SETUP)
X_test, y_test = output_X_y(df_test, SETUP)

# Discard the "Unnamed: 0" column when it is present (errors="ignore" makes
# this a no-op otherwise).
X_train = X_train.drop("Unnamed: 0", axis=1, errors="ignore", inplace=False)
X_test = X_test.drop("Unnamed: 0", axis=1, errors="ignore", inplace=False)

# Make sure there are no null values. An explicit raise is used rather than
# `assert`, so the check is not stripped when Python runs with -O.
if X_train.isnull().any().any():
    raise ValueError("X_train contains null values")
if X_test.isnull().any().any():
    raise ValueError("X_test contains null values")

# for quick testing, set a small MAX_TEST_SIZE
X_test = X_test[:MAX_TEST_SIZE]
y_test = y_test[:MAX_TEST_SIZE]

# Number of target columns; meaningful only in multi-output scenarios.
# Falls back to 1 for a one-dimensional target instead of raising IndexError.
orig_n_labels = y_test.shape[1] if len(y_test.shape) > 1 else 1

Set the target variable to the correct format depending on the prediction scenario. E.g., set a `np.recarray` for survival data, or normalise the data in case of single- and multi-target regression.

In [None]:
# Bring the targets into the format required by the chosen scenario.
y_train, y_test = format_targets(y_train, y_test, SETUP, VERBOSE)


### instantiate original R(S)F estimator
# The estimator class is picked from the SETUP key (case-insensitive).
if SETUP.lower() in survival_key_list:
    rf = RandomSurvivalForest(n_estimators=100, min_samples_split=10,
                              random_state=0)

elif SETUP.lower() in binary_key_list + multi_label_key_list:
    rf = RandomForestClassifier(n_estimators=100, min_samples_split=5,
                                random_state=0)

elif SETUP.lower() in regression_key_list + mt_regression_key_list:
    rf = RandomForestRegressor(n_estimators=100, min_samples_split=5,
                               random_state=0)

else:
    # Without this branch an unrecognised SETUP would leave `rf` undefined
    # (or, in a notebook, silently reuse a stale `rf` from an earlier run).
    raise ValueError(
        "Unrecognised SETUP value " + repr(SETUP) + "; expected one of: "
        + ", ".join(survival_key_list + binary_key_list
                    + multi_label_key_list + regression_key_list
                    + mt_regression_key_list)
    )

## Step 3: Instantiate and fit the Model

Once the Random Forest is instantiated, the `fit` method in Bellatrex trains the Random Forest and sets the parameters for Bellatrex.


In [None]:
# Wrap the forest in Bellatrex and fit it on the training data. The
# hyperparameter grid and the remaining options are supplied here.
Bellatrex_fitted = (
    Bellatrex(
        rf,
        SETUP,
        p_grid=p_grid,
        proj_method="PCA",
        dissim_method="rules",
        feature_represent="weighted",
        n_jobs=1,
        verbose=VERBOSE,
        plot_GUI=PLOT_GUI,
    )
    .fit(X_train, y_train)
)

# Number of test samples to explain: never more than MAX_TEST_SIZE.
N = min(X_test.shape[0], MAX_TEST_SIZE)

# Per-sample predictions of the local method are collected in this list.
y_pred = []

# Extra per-instance information (e.g. the optimal hyperparameters).
stored_info = []


## Step 4: Make predictions, output explanations

Loop through the test set, make predictions using the Bellatrex local method, and store the results.

In [None]:
for i in range(N):  # for every sample in the test set: call .predict
    # The hyperparameters were given in the .fit; now they are actively used
    # and tuned for every instance (tuning happens inside .predict).
    #
    # The .predict call outputs:
    #   - the local prediction
    #   - information about the Bellatrex instance: optimal parameters,
    #     final extracted trees/rules, their weight in the prediction, etc.
    y_local_pred, sample_info = Bellatrex_fitted.predict(X_test, i)

    # keep every test sample prediction for debugging and analysis
    y_pred.append(y_local_pred)
    # keep the per-instance information too, so that stored_info is filled
    # in as intended rather than being left empty
    stored_info.append(sample_info)


# predictions of the original R(S)F on the same test samples
y_ens_pred = format_RF_preds(rf, X_test, SETUP)

# sometimes y_pred is not an array as expected; here we guarantee it is
y_pred = np.array(y_pred)
if SETUP.lower() not in multi_label_key_list + mt_regression_key_list:
    y_pred = y_pred.ravel()  # single-output scenarios: force a flat array

# in case of quick testing with few samples
y_test = y_test[:MAX_TEST_SIZE]
y_ens_pred = y_ens_pred[:MAX_TEST_SIZE]