In [1]:
!pip show numpy

Name: numpy
Version: 1.20.0
Summary: NumPy is the fundamental package for array computing with Python.
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: 
License: BSD
Location: /home/mythreyi/.local/lib/python3.9/site-packages
Requires: 
Required-by: acv-exp, altair, anchor-exp, blis, catboost, h5py, hyperopt, imageio, imodels, Keras-Preprocessing, lightgbm, lime, matplotlib, mlxtend, numba, opt-einsum, pandas, pyagrum, pyarrow, pydeck, PyWavelets, scikit-image, scikit-learn, scipy, seaborn, shap, spacy, streamlit, tensorboard, tensorflow, thinc, tifffile, xgboost


In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, classification_report, roc_auc_score, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import KFold, cross_val_score
import sklearn

import sys
import os
import joblib

import warnings
warnings.filterwarnings('ignore')

from acv_explainers import ACXplainer

import random

from tqdm import tqdm_notebook

from hyperopt import fmin, tpe, hp, Trials, rand, early_stop
from hyperopt.pyll import scope

from DatasetManager import DatasetManager
import BucketFactory

In [3]:
# path to project folder
# please change to your own
PATH = os.getcwd()

dataset = "bpic2011"
bucket_method = "prefix"
encoding = "index"
cls_method = "nb"

method_name = bucket_method+"_"+encoding

random_state = 22
exp_iter = 10

In [4]:
method_folder = os.path.join(PATH, dataset, cls_method, method_name)
dataset_folder = os.path.join(PATH, dataset, "datasets")

print(method_folder)
print(dataset_folder)

/home/mythreyi/full_stability/bpic2011/nb/prefix_index
/home/mythreyi/full_stability/bpic2011/datasets


In [5]:
dataset_ref_to_datasets = {
    "bpic2012" : ["bpic2012_accepted"],
    "sepsis_cases": ["sepsis_cases_1"],
    "production" : ["production"],
    "bpic2011" : ["bpic2011_f1"],
    "hospital" : ["hospital_billing_2"],
    "traffic" : ["traffic_fines_1"]
}

datasets = [dataset] if dataset not in dataset_ref_to_datasets else dataset_ref_to_datasets[dataset]

num_buckets = len([name for name in os.listdir(os.path.join(PATH,'%s/%s/%s/pipelines'% (dataset, cls_method, method_name)))])

# for dataset_name in datasets:
#     dataset_manager = DatasetManager(dataset_name)
    
#     min_prefix_length = 1
#     max_prefix_length = num_buckets

#     dt_train_prefixes = pd.read_csv(os.path.join(dataset_folder, "train_prefixes.csv"))
#     dt_train_prefixes = dataset_manager.generate_prefix_data(dt_train_prefixes, min_prefix_length, max_prefix_length)

#     dt_val_prefixes = pd.read_csv(os.path.join(dataset_folder, "val_prefixes.csv"))
#     dt_val_prefixes = dataset_manager.generate_prefix_data(dt_val_prefixes, min_prefix_length, max_prefix_length)
    
#     dt_test_prefixes = pd.read_csv(os.path.join(dataset_folder, "test_prefixes.csv"))
#     dt_test_prefixes = dataset_manager.generate_prefix_data(dt_test_prefixes, min_prefix_length, max_prefix_length)
    
#     if bucket_method == "state":
#         bucket_encoding = "last"
#     else:
#         bucket_encoding = "agg"
    
#     bucketer_args = {'encoding_method':bucket_encoding,
#                      'case_id_col':dataset_manager.case_id_col, 
#                      'cat_cols':[dataset_manager.activity_col], 
#                      'num_cols':[], 
#                      'random_state':random_state}
#     bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)

#     bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
#     bucket_assignments_val = bucketer.predict(dt_val_prefixes)
#     bucket_assignments_test = bucketer.predict(dt_test_prefixes)

In [6]:
for bucket in tqdm_notebook(range(num_buckets)):
    bucketID = bucket+1
    print ('Bucket', bucketID)

    #import everything needed to sort and predict
    print("importing ddata and models...")
    pipeline_path = os.path.join(method_folder, "pipelines/pipeline_bucket_%s.joblib" % 
                                 (bucketID))
    pipeline = joblib.load(pipeline_path)
    feature_combiner = pipeline['encoder']
    if 'scaler' in pipeline.named_steps:
        scaler = pipeline['scaler']
    else:
        scaler = None
    cls = pipeline['cls']
    
    X_train = pd.read_csv(os.path.join(method_folder, "train_data", "train_data_bucket_%s.csv" % (bucketID)))
    if scaler!= None:
        X_train = scaler.transform(X_train)
    Y_train = pd.read_csv(os.path.join(method_folder, "train_data", "y_train_bucket_%s.csv" % (bucketID)))
    
    test_x = pd.read_csv(os.path.join(method_folder, "samples", "test_sample_bucket_%s.csv" % (bucketID)))
    if scaler!=None:
        test_x = scaler.transform(test_x)
    #print(feature_combiner, scaler, cls)
    
#     if scaler!=None:
#         X_train = scaler.transform(train_data)

#     relevant_train_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucketID]
#     dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_train_cases_bucket)

#     X_train = feature_combiner.transform(dt_train_bucket)
#     if scaler!=None:
#         X_train = scaler.transform(X_train)
        
#     relevant_val_cases_bucket = dataset_manager.get_indexes(dt_val_prefixes)[bucket_assignments_val == bucketID]
#     dt_val_bucket = dataset_manager.get_relevant_data_by_indexes(dt_val_prefixes, relevant_val_cases_bucket)

#     X_val = feature_combiner.transform(dt_val_bucket)
#     if scaler!=None:
#         X_val = scaler.transform(X_val)
    
#     relevant_test_cases_bucket = dataset_manager.get_indexes(dt_test_prefixes)[bucket_assignments_test == bucketID]
#     dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_cases_bucket)

#     test_x = feature_combiner.transform(dt_test_bucket)
#     if scaler!=None:
#         test_x = scaler.transform(test_x)
    
    print("getting og model predictions...")
    Y_pred = cls.predict(X_train)
    test_pred = cls.predict(test_x)
    
#     full_train_x = np.vstack((X_train, X_val))
#     full_train_y = np.hstack((Y_pred, Y_val))
    
    #Set up hyperparameter optimisation
    print("setting up hyperparameters...")
    kf = KFold(n_splits=5, shuffle = True, random_state=random_state)

    space = {"n_estimators": scope.int(hp.quniform('n_estimators', 1, 20, q=1)),
            "max_depth": scope.int(hp.quniform('max_depth', 1, 20, q=1)),
            "sample_fraction": (hp.quniform('sample_fraction', 0.0001, 1, q=0.4))}

    trials = Trials()
    
    def acv_classifier_optimisation(args, random_state = random_state, cv = kf, X = X_train, y = Y_pred):
        score = []
        iteration = 0
        for train_index, test_index in kf.split(X):
#             iteration += 1
#             print("Testing fold no.", iteration)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            estimator = ACXplainer(classifier = True, n_estimators = args["n_estimators"], 
                                   max_depth = args['max_depth'], sample_fraction = args["sample_fraction"])
            estimator.fit(X_train, y_train)

            score.append(f1_score(y_test, estimator.predict(X_test)))
        
        score = np.mean(score)

        return -score
    
    print("testing hyperparameters")
    best = fmin(acv_classifier_optimisation, verbose=1, space = space, algo=rand.suggest, max_evals = 50, trials=trials, 
                rstate=np.random.default_rng(random_state), early_stop_fn=early_stop.no_progress_loss(3))
    print("training surrogate model")
    explainer = ACXplainer(classifier = True, n_estimators = int(best['n_estimators']), 
                           max_depth = int(best['max_depth']), sample_fraction = best['sample_fraction'])
    explainer.fit(X_train, Y_pred)
    
    print("Training Score:", f1_score(cls.predict(X_train), explainer.predict(X_train)))
    print("Testing Score:", f1_score(cls.predict(test_x), explainer.predict(test_x)))
    
    joblib.dump(explainer, method_folder+"/acv_surrogate/acv_explainer_bucket_%s.joblib"%(bucketID))

  0%|          | 0/14 [00:00<?, ?it/s]

Bucket 1
importing ddata and models...
getting og model predictions...
setting up hyperparameters...
testing hyperparameters

  0%|                                    | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏        | 1/50 [00:01<01:02,  1.27s/trial, best loss: -0.6827121599197364][A
  4%|▎        | 2/50 [00:03<01:18,  1.63s/trial, best loss: -0.7271385131622157][A
  6%|▌        | 3/50 [00:04<01:08,  1.45s/trial, best loss: -0.7271385131622157][A
  8%|▋        | 4/50 [00:05<01:03,  1.38s/trial, best loss: -0.7271385131622157][A
 10%|▉        | 5/50 [00:07<01:03,  1.42s/trial, best loss: -0.7271385131622157][A
training surrogate model
Training Score: 0.8012232415902141
Testing Score: 0.6
Bucket 2
importing ddata and models...
getting og model predictions...
setting up hyperparameters...
testing hyperparameters

  0%|                                    | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏        | 1/50 [00:01<01:04,  1.32s/trial, best loss: -0.7066421264250058]

  0%|                                    | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏        | 1/50 [00:01<01:16,  1.55s/trial, best loss: -0.7727021120671352][A
  4%|▎        | 2/50 [00:03<01:25,  1.79s/trial, best loss: -0.7888187545886133][A
  6%|▌        | 3/50 [00:04<01:15,  1.61s/trial, best loss: -0.7888187545886133][A
  8%|▋        | 4/50 [00:06<01:08,  1.49s/trial, best loss: -0.7888187545886133][A
 10%|▉        | 5/50 [00:07<01:09,  1.54s/trial, best loss: -0.7888187545886133][A
training surrogate model
Training Score: 0.8851351351351351
Testing Score: 0.8333333333333333
Bucket 13
importing ddata and models...
getting og model predictions...
setting up hyperparameters...
testing hyperparameters

  0%|                                    | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏        | 1/50 [00:01<01:22,  1.69s/trial, best loss: -0.7504847608334673][A
  4%|▎        | 2/50 [00:03<01:21,  1.70s/trial, best loss: -0.7760899093529622][A
  6%|▌        | 3/50 [0

In [7]:
vars(explainer)

{'classifier': 1,
 'n_estimators': 18,
 'verbose': False,
 'mtry': 0,
 'importance': 'impurity',
 'min_node_size': 0,
 'max_depth': 12,
 'replace': True,
 'sample_fraction': 0.4,
 'keep_inbag': False,
 'inbag': None,
 'split_rule': 'gini',
 'num_random_splits': 1,
 'check_is_explain': False,
 'ACXplainer': None,
 'seed': 2021,
 'rules': None,
 'rules_output': None,
 'rules_s_star': None,
 'rules_coverage': None,
 'rules_acc': None,
 'rules_var': None,
 'd': 2245,
 'check_is_globalrule': False,
 'rules_output_proba': None,
 'rules_ori': None,
 'rules_s_star_ori': None,
 'model': RangerForestClassifier(enable_tree_details=True, importance='impurity',
                        max_depth=12, n_estimators=18, sample_fraction=0.4,
                        seed=2021)}

In [8]:
# explainer = joblib.load("bpic2012/nb/single_agg/acv_surrogate/acv_explainer_bucket_1.joblib")
# vars(explainer)

In [9]:
# relevant_test_cases_bucket = dataset_manager.get_indexes(dt_test_prefixes)[bucket_assignments_test == 16]
# dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_cases_bucket)

# test_x = feature_combiner.transform(dt_test_bucket)
# if scaler!=None:
#     test_x = scaler.transform(test_x)
test_x.shape

(8, 2245)

In [10]:
sklearn.metrics.confusion_matrix(explainer.predict(test_x), cls.predict(test_x))

array([[1, 4],
       [0, 3]])

In [11]:
sklearn.metrics.confusion_matrix(explainer.predict(full_train_x), cls.predict(full_train_x))

NameError: name 'full_train_x' is not defined

In [None]:
true = np.hstack((dataset_manager.get_label_numeric(dt_train_bucket), dataset_manager.get_label_numeric(dt_val_bucket)))
sklearn.metrics.confusion_matrix(true, cls.predict(full_train_x))
f1_score(true, cls.predict(full_train_x))

In [None]:
sklearn.metrics.confusion_matrix(dataset_manager.get_label_numeric(dt_test_bucket), cls.predict(test_x))
f1_score(dataset_manager.get_label_numeric(dt_test_bucket), cls.predict(test_x))