In [2]:
from xgboost import XGBClassifier
import warnings
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from tabpfn_new.scripts.transformer_prediction_interface import TabPFNClassifier, MedPFNClassifier
from tabpfn_new.scripts.model_builder import load_model
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from data_prep_utils import *
from evaluate import *
#from load_models import *
import matplotlib.pyplot as plt
import torch
import openml
import time
#pd.set_option('display.expand_frame_repr', False)

In [3]:
# Load the microbiome table, filter its features, then shuffle samples and
# labels together with a fixed seed. The helpers come from the star-imports
# at the top of the notebook.
path = "datasets/data_all.csv"
features_raw, labels_raw = get_microbiome(path)
features_filtered = remove_zero_features(features_raw)
all_data, labels = unison_shuffled_copies(
    features_filtered,
    labels_raw,
    seed=42,
)

### Feature importance - Leave-one-covariate-out (LOCO)

In [5]:
# Leave-one-covariate-out (LOCO) feature importance.
#
# A baseline cross-validation score is computed on all features; then, for
# every feature column, the column is removed, the cross-validation is
# repeated, and the drop relative to the baseline is recorded as that
# feature's importance.

# --- configuration ----------------------------------------------------------
save = True  # when False the importances are computed but not written to disk
sampling = None
cv = 3
best_delete = 0
strat_split = True
max_samples = 1024
no_pre_process = False
multi_decoder = "permutation"
N_ens = 3
seed = 42
overwrite = True
metrics = ["roc_auc", "f1"]
dir_path = os.path.abspath(os.getcwd())
run_name = "medium_mlp_var_balance_05weight_anova"
path = dir_path + f"/logs/trainrun_{run_name}"
filename = "model"

# --- model and feature selector ---------------------------------------------
model = MedPFNClassifier(
    base_path=path,
    filename=filename,
    device='cpu',
    N_ensemble_configurations=N_ens,
    multiclass_decoder=multi_decoder,
    no_preprocess_mode=no_pre_process,
)

reducer = AnovaSelect()

# --- baseline on the full feature set ---------------------------------------
# results[0] / results[1] are used below as the scores for metrics[0] /
# metrics[1] — assumes cross_validate_sample returns them in that order
# (TODO confirm against evaluate.py).
results, _ = cross_validate_sample(
    model, all_data, labels, metrics, strat_split, cv, sampling,
    reducer, max_samples, seed=seed, overwrite=overwrite,
    n_best_delete=best_delete,
)

# --- one cross-validation per removed feature -------------------------------
# NOTE: nothing is written to disk until the whole loop has finished, so an
# interrupted run saves no results.
rocs = []  # raw score (index 0) with feature f removed
f1s = []   # raw score (index 1) with feature f removed
for f in range(all_data.shape[1]):
    if f % 100 == 0:
        print("Currently at feature: ", f)
    loco_data = np.delete(all_data, f, axis=1)
    loco_results, _ = cross_validate_sample(
        model, loco_data, labels, metrics, strat_split, cv, sampling,
        reducer, max_samples, seed=seed, overwrite=overwrite,
        n_best_delete=best_delete,
    )
    # Clears gradients on one sub-module of the wrapped network; the reason is
    # not visible in this notebook — presumably to release memory between runs.
    model.pred_model.model[2].zero_grad()
    rocs.append(loco_results[0])
    f1s.append(loco_results[1])

# --- importance = baseline score minus leave-one-out score ------------------
# Kept under separate names so that `rocs` / `f1s` always hold the raw
# per-feature scores, whether or not the loop above ran to completion.
roc_importance = results[0] - np.array(rocs)
f1_importance = results[1] - np.array(f1s)

red_name = "fi-loco"
directory = f"results/{red_name}"
if save:
    os.makedirs(directory, exist_ok=True)
    np.save(f"{directory}/rocs.npy", roc_importance)
    np.save(f"{directory}/f1s.npy", f1_importance)

Currently at feature:  0
Currently at feature:  100
Currently at feature:  200
Currently at feature:  300
Currently at feature:  400
Currently at feature:  500
Currently at feature:  600



KeyboardInterrupt



In [None]:
# Plot the LOCO importances (baseline ROC AUC minus leave-one-out ROC AUC).
#
# The values are read back from the file written by the LOCO cell rather than
# recomputed from `results` and `rocs`, so the plot does not depend on what
# those kernel variables currently hold. Requires results/fi-loco/rocs.npy
# to exist.
roc_importance = np.load("results/fi-loco/rocs.npy")

fig, ax = plt.subplots()
ax.hist(roc_importance)
ax.set(
    title="LOCO feature importance (ROC AUC)",
    xlabel="baseline score - leave-one-out score",
    ylabel="number of features",
)
plt.show()

fig, ax = plt.subplots()
ax.scatter(np.arange(len(roc_importance)), roc_importance)
ax.set(
    title="LOCO feature importance per feature (ROC AUC)",
    xlabel="feature index",
    ylabel="baseline score - leave-one-out score",
)
plt.show()

### Feature effect - Individual conditional expectation (ICE)

In [None]:
# Individual conditional expectation (ICE) curves.
#
# For each selected feature, every sample's value of that feature is forced to
# a grid value v while the remaining features are rescaled so that each row
# sums to 1; the model is refit and the class-1 score of every test sample is
# recorded. One array of shape (n_grid_values, n_test_samples) is saved per
# feature.
features = [0]
step = 0.1  # not used in this cell
num = 1     # number of grid points; with num=1 only the feature's minimum is evaluated

red_name = "ice"
directory = f"results/{red_name}"
os.makedirs(directory, exist_ok=True)

# Defined up front so the name exists even when `features` is empty.
c1_props_list = []
for f in features:
    # Start a fresh list per feature so that feature{f}.npy only ever contains
    # the curves belonging to feature f.
    c1_props_list = []
    fvalues = np.linspace(np.min(all_data[:, f]), np.max(all_data[:, f]), num=num)
    for v in fvalues:
        # The split is identical on every iteration (fixed random_state); it is
        # redone here because the arrays are modified in place below.
        X_train, X_test, y_train, y_test = train_test_split(
            all_data, labels, test_size=0.3, stratify=labels, random_state=42
        )
        # NOTE(review): if remove_same_features_traintest drops columns, index
        # f may no longer address the same feature as in all_data — confirm.
        X_train, X_test = remove_same_features_traintest(X_train, X_test)

        # Zero the feature, rescale the remaining mass of each row to (1 - v),
        # then set the feature to v.
        # NOTE(review): a row whose other features are all zero has sum 0 here
        # and produces inf/nan — confirm such rows cannot occur.
        X_train[:, f] = 0
        X_test[:, f] = 0
        X_train = X_train * (1 / np.sum(X_train, axis=1, keepdims=True)) * (1 - v)
        X_test = X_test * (1 / np.sum(X_test, axis=1, keepdims=True)) * (1 - v)
        X_train[:, f] = v
        X_test[:, f] = v

        reducer.fit(X_train, y_train)
        X_train, X_test = reducer.transform(X_train), reducer.transform(X_test)
        model.fit(X_train, y_train, overwrite_warning=True)
        preds = model.predict_proba(X_test)

        # (p1 - p0 + 1) / 2: equals p1 whenever the two columns sum to one.
        c1_probs = (preds[:, 1] - preds[:, 0] + 1) * 0.5
        c1_props_list.append(c1_probs)

        # Saved after every grid value so partial progress survives an interrupt.
        ice_curves = np.array(c1_props_list)
        np.save(f"{directory}/feature{f}.npy", ice_curves)

In [6]:
# Report the dimensions of the ICE scores collected so far.
print(np.shape(c1_props_list))

(0,)


In [None]:
# Plot stored ICE curves: one line per test sample, at most 100 of them.
step = 0.1
fvalues = np.arange(0,1+step,step)
# NOTE(review): this reads "test.npy", whereas the ICE cell writes
# "feature{f}.npy" — confirm which file is intended. The number of rows in the
# file must equal len(fvalues) for the plot calls below to succeed.
ice_curves = np.load("results/ice/test.npy")

# Cap at the number of curves actually stored so that fewer than 100 samples
# does not raise an IndexError.
n_curves = min(100, ice_curves.shape[1])
for i in range(n_curves):
    plt.plot(fvalues, ice_curves[:,i])
plt.xlabel("feature value")
plt.ylabel("class-1 score")
plt.title("ICE curves")
plt.show()