In [9]:
# imports
import pickle
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold, train_test_split
import lime
import lime.lime_tabular
import shap
from sklearn.inspection import permutation_importance
from typing import Tuple, Union, Optional, Callable
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
from importlib import reload
import constants
import data_utils
import models
import training_and_selection
import plot_accuracies
import explanation_utils

In [5]:
reload(constants)
reload(models)
from constants import (
    SEED,
    NUM_SPLITS,
    INITIAL_CUTOFF,
    TOP,
    TARGET,
    MODEL_ACCURACIES_PATH,
    FILTERED_MODEL_ACCURACIES_PATH,
    TIME_LIMIT,
    TIME_LIMIT_CROSS_VALIDATION,
    RASHOMON_SETS_PATH,
    INITIAL_ACCURACIES_PATH,
    CHECKPOINT_PATH,
)
from models import MODELS, HYPERPARAMETERS

In [6]:
reload(data_utils)
from data_utils import get_dataset

# Load the features and labels, then hold out 10% of the rows as a test split.
# The split uses SEED + 1 as its random state.
X, y = get_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=(SEED+1))
# Count of labels equal to 1 divided by the count of labels equal to 0,
# computed over the full label vector (not only the training split).
# NOTE(review): this is a class ratio, not a majority-class accuracy — confirm
# that whatever consumes BASELINE expects a ratio.
BASELINE = np.sum(y == 1) / np.sum(y == 0)

  dataset = pd.read_csv("SpeedDating.csv", index_col=0)


In [7]:
reload(constants)
reload(training_and_selection)
from training_and_selection import get_rashomon_sets

# Compute the per-model hyperparameter selections. The result is later indexed
# by model class name and iterated as a collection of keyword-argument dicts.
# NOTE(review): the cutoff / top / time-limit values and the result paths are
# literals here even though INITIAL_CUTOFF, TOP, TIME_LIMIT,
# TIME_LIMIT_CROSS_VALIDATION and INITIAL_ACCURACIES_PATH are imported from
# `constants` — confirm which source of truth is intended.
rashomon_sets_params = get_rashomon_sets(
    models=MODELS,
    hyperparameters=HYPERPARAMETERS,
    X=X,
    y=y,
    initial_cutoff=0.15,
    top=0.04,
    initial_time_limit=50,
    cross_validation_time_limit=60,
    initial_path='results/initial_grid_search.csv',
    cross_validation_path='results/cross_validation_results.csv',
)
# Persist the selection. The context manager guarantees the file is flushed and
# closed, instead of leaving an anonymous handle for the garbage collector.
with open(RASHOMON_SETS_PATH, 'wb') as rashomon_sets_file:
    pickle.dump(rashomon_sets_params, rashomon_sets_file)

SVMClassifier 321
TabRClassifier 293
XGBClassifier 289
SVMClassifier


720it [00:00, 307212.50it/s]


TabRClassifier


288it [00:00, 629473.45it/s]


XGBClassifier


288it [00:00, 832501.41it/s]
48it [00:00, 541200.52it/s]
43it [00:00, 512372.36it/s]
43it [00:00, 436695.09it/s]


In [None]:
import os
reload(training_and_selection)
reload(explanation_utils)
from typing import Any, Dict
from explanation_utils import EXPLANATION_FUNCS
from models import TabRClassifier
from training_and_selection import release_model_vram


# NOTE(review): this rebinds the CHECKPOINT_PATH name that was imported from
# `constants` earlier in the notebook — confirm the two values agree.
CHECKPOINT_PATH = 'checkpoints'

def get_model(model_class, kwargs, X_train, y_train):
    """Return a fitted ``model_class(**kwargs)``, reusing a pickled checkpoint when one exists.

    The checkpoint location is
    ``{CHECKPOINT_PATH}/{model_class.__name__}/{str(kwargs)}.pickle``.

    Args:
        model_class: Class to instantiate. Its ``__name__`` selects the
            checkpoint sub-directory and two special cases:
            ``'SVMClassifier'`` is constructed with ``probability=True``, and
            ``'TabRClassifier'`` instances are never written to disk.
        kwargs: Keyword arguments for the constructor; ``str(kwargs)`` is also
            used as the checkpoint file name.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.

    Returns:
        The unpickled model if a checkpoint file exists, otherwise a newly
        fitted model.
    """
    save_path = f'{CHECKPOINT_PATH}/{model_class.__name__}/{str(kwargs)}.pickle'
    if os.path.exists(save_path):
        # NOTE: unpickling executes arbitrary code; only keep trusted files
        # under CHECKPOINT_PATH.
        with open(save_path, 'rb') as file:
            return pickle.load(file)

    if model_class.__name__ == 'SVMClassifier':
        model = model_class(probability=True, **kwargs)
    else:
        model = model_class(**kwargs)
    model.fit(X_train, y_train)

    # TabRClassifier models are deliberately not checkpointed
    # (presumably because they hold device-resident state — TODO confirm).
    if model_class.__name__ != 'TabRClassifier':
        # The per-model directory is not created anywhere else, so make sure it
        # exists before opening the checkpoint for writing.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as file:
            pickle.dump(model, file)
    return model




def run_all_explanations(
        models,
        rashomon_sets_params,
        explanation_funcs: Dict[str, Callable],
        X_train,
        y_train,
        X_test,
        y_test,
        plot=False
    ):
    """Run every explanation function on every configured model.

    Args:
        models: Iterable of model classes.
        rashomon_sets_params: Mapping from ``model_class.__name__`` to an
            iterable of constructor keyword-argument dicts.
        explanation_funcs: Mapping from explanation name to a callable invoked
            as ``func(model, X_test, y_test, plot=plot)``.
        X_train: Training features forwarded to ``get_model``.
        y_train: Training labels forwarded to ``get_model``.
        X_test: Features handed to each explanation function.
        y_test: Labels handed to each explanation function.
        plot: Forwarded unchanged to each explanation function.

    Returns:
        ``{explanation_name: {(model_class_name, str(kwargs)): result}}``.
    """
    explanations = {name: {} for name in explanation_funcs}

    for model_class in models:
        for kwargs in tqdm(rashomon_sets_params[model_class.__name__]):
            model = get_model(model_class, kwargs, X_train, y_train)
            model_idx = (model_class.__name__, str(kwargs))
            try:
                for name, explain_func in explanation_funcs.items():
                    explanations[name][model_idx] = explain_func(model, X_test, y_test, plot=plot)
            finally:
                # Hand the model back even if an explanation raised, so a
                # failure does not skip the release step for this model.
                release_model_vram(model)
    return explanations

# Obtain each model listed in rashomon_sets_params (loaded or freshly fitted by
# get_model) and apply every function in EXPLANATION_FUNCS to the test split.
explanations = run_all_explanations(MODELS, rashomon_sets_params, EXPLANATION_FUNCS, X_train, y_train, X_test, y_test, plot=True)

In [None]:
reload(constants)
reload(explanation_utils)
# NOTE(review): this import rebinds `run_all_explanations`, replacing the
# function defined in the notebook for the rest of the kernel session.
from explanation_utils import run_all_explanations

# Only the model classes from index 2 onward are passed here. Unlike the
# notebook-local definition, this call supplies no `explanation_funcs`
# argument — presumably the module version has a different signature;
# TODO confirm.
explanations = run_all_explanations(MODELS[2:], rashomon_sets_params, X_train, y_train, X_test, y_test, plot=True)


In [15]:
from constants import EXPLANATIONS_PATH


# Load explanations from EXPLANATIONS_PATH (presumably written by an earlier
# run — the code that saves them is not in this notebook section).
# NOTE: unpickling executes arbitrary code, so this must be a trusted file.
with open(EXPLANATIONS_PATH, 'rb') as file:
    explanations = pickle.load(file)

# Bare expression: the notebook renders the whole nested dict as cell output.
explanations

{'lime': {('XGBClassifier',
   "{'n_estimators': 400, 'max_depth': 10, 'min_child_weight': 4, 'eta': 0.1, 'subsample': 0.8}"): <lime.explanation.Explanation at 0x77e2efe81d30>,
  ('XGBClassifier',
   "{'n_estimators': 400, 'max_depth': 10, 'min_child_weight': 1, 'eta': 0.1, 'subsample': 0.5}"): <lime.explanation.Explanation at 0x77e2eeafdbe0>,
  ('XGBClassifier',
   "{'n_estimators': 600, 'max_depth': 12, 'min_child_weight': 4, 'eta': 0.1, 'subsample': 0.8}"): <lime.explanation.Explanation at 0x77e2f401dc10>},
 'shap': {('XGBClassifier',
   "{'n_estimators': 400, 'max_depth': 10, 'min_child_weight': 4, 'eta': 0.1, 'subsample': 0.8}"): .values =
  array([[-0.02153142,  0.        , -0.01828565,  0.01641031,  0.00485796,
           0.03353878, -0.00486893,  0.00903734,  0.02020107,  0.03410347,
           0.00866633, -0.02444173,  0.01298384,  0.01563851,  0.02166271,
          -0.0069948 ,  0.        ,  0.04478279,  0.02827162, -0.02204132,
           0.0049245 ,  0.01453703,  0.01131401