In [8]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import shap
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time

def benchmark_dict_print(output_dict):
    for key in output_dict:
        #samples
        print(key+": ")
        for key_inner in output_dict[key]:
            #features
            print(key_inner+": ")
            
            #per informative
            time_str = ""
            
            i = 0
            for avg_time in output_dict[key][key_inner]["Average time"]:
                
                i = i+1
                
                if i ==len(output_dict[key][key_inner]["Average time"]):
                    time_str = time_str + str(avg_time)+"s | "
                else:
                    time_str = time_str + str(avg_time)+"s - "
                  
            key_strs = ""
            for key_inner_inner in output_dict[key][key_inner]: 
                if key_inner_inner != 'Average time':
                    key_strs = key_strs + str(output_dict[key][key_inner][key_inner_inner]["found_informative_features"]) + " ("+str(output_dict[key][key_inner][key_inner_inner]["outputted_noise_features"])+") / "+str(output_dict[key][key_inner][key_inner_inner]["informative_features"]) + " | "
                
            print(time_str+key_strs)
                 

def output_dict_to_df(output_dict):
    df = pd.DataFrame(columns=["n_samples","total_features","informative_features","time","seed","found_informative_features","outputted_noise_features"])
    for samples in output_dict.keys():
        for features in output_dict[samples].keys():
            i = 0
            for informative in output_dict[samples][features].keys():
                if informative != 'Average time':
                    temp = pd.DataFrame.from_dict(output_dict[samples][features][informative])
                    temp["n_samples"]=int(samples)
                    temp["total_features"]=int(features)
                    temp = temp.reset_index()
                    temp = temp.rename(columns={"index":"seed"})
                    temp["time"]=output_dict[samples][features]['Average time'][i]
                    i = i+1
                    df = df.append(temp) 

    return df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Simple testing

## Simulation dataset creation

In [4]:
n_features = 500 #20,50,100,250,500
n_informative = int(0.90*n_features) #5%,10%,33%,50%,90%
n_samples = int(5000/(1-0.33))+1 #7463#5000

X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=n_informative, n_redundant=0, n_repeated = 0,shuffle=False)
#X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative,random_state=4,shuffle=False)
X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42)
X_train

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_490,col_491,col_492,col_493,col_494,col_495,col_496,col_497,col_498,col_499
4529,16.745219,3.064825,-2.863545,-13.510380,5.315558,8.874887,-0.422799,-11.292289,-7.055259,-8.252094,...,1.188208,-0.378725,-0.397908,-0.136526,-0.958061,-0.630357,-0.159185,-0.620803,-1.010112,-1.465302
2897,10.602683,6.263638,5.030698,1.275159,-4.274334,5.887773,7.712275,2.122422,12.012534,-12.924060,...,-0.317056,2.696913,-0.828865,0.567702,0.743005,0.574161,-0.437067,0.293724,1.449087,-1.036390
6361,-7.341757,-2.613107,-14.257762,10.934357,17.126862,-17.145643,-11.866992,-9.229942,12.799775,-5.462458,...,0.365749,-0.317567,-1.250173,-0.201837,0.515585,1.730105,0.381383,0.636158,-1.585005,-0.477964
1654,15.103830,-19.380835,11.973576,-8.294274,-10.290333,19.368051,-8.909064,-14.724182,8.982827,-18.175523,...,-0.938948,0.628654,0.792452,0.354764,0.886241,-0.988053,-1.464783,1.549964,-1.125569,-0.285697
6111,10.456393,-18.788397,-9.642759,-4.692734,6.591294,-21.168312,7.804272,4.631699,-23.114104,-4.831323,...,-0.741045,-0.863689,-0.039028,-0.456419,-2.576476,0.189589,0.778050,-0.611166,0.340887,-0.485027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,-21.483905,0.999483,-4.419907,-1.641034,-21.870400,-9.777687,-8.822450,-14.516811,8.961772,-2.359504,...,-1.200718,1.372257,1.042186,-0.230238,1.708837,-0.161753,1.598509,0.781025,0.193675,-0.694817
5226,-11.222781,9.525985,3.708534,-15.338396,2.099114,11.809690,0.452612,16.381729,-22.193395,-6.377645,...,0.387828,1.204048,1.135725,1.584429,1.094804,0.770361,0.520100,-0.526397,-0.241824,1.553872
5390,-12.638624,0.459624,-6.357631,-7.727807,3.518887,-34.666695,14.279390,23.694937,7.549388,12.884744,...,1.008474,0.574237,-1.141540,-0.756666,0.117051,-1.494126,-1.153968,-0.772677,-0.527955,-0.477227
860,19.233188,-15.913120,-7.948918,-8.218713,-21.708159,-13.608937,19.088807,0.907969,12.777942,-0.773037,...,0.019563,0.030539,-0.615041,1.538401,-0.869927,1.483099,0.370104,0.253871,0.075206,-0.632390


## Single powershap test

In [None]:
import sys
sys.path.append("../powershap")

from powershap import PowerSHAP


from catboost import CatBoostClassifier,CatBoostRegressor
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV,LinearRegression
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier

selector = PowerSHAP(
    model = CatBoostClassifier(verbose=0, n_estimators=250,use_best_model=True),#LogisticRegressionCV(),#GradientBoostingClassifier(),#CatBoostClassifier(verbose=0, n_estimators=250),
    #model = CatBoostRegressor(verbose=0, n_estimators=0,use_best_model=True),
    verbose=True,
)
selector.fit(X_train, y_train)

In [None]:
selector.transform(X_test)

## Powershap sklearn pipeline test

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline(
    [
        (
            "selector",
            PowerSHAP(
                CatBoostClassifier(n_estimators=250,verbose=False,use_best_model=True), automatic=True, limit_automatic=100,#power_alpha=0.001,power_req_iterations=0.999,
                #CatBoostRegressor(n_estimators=250,verbose=False), automatic=True, limit_automatic=100,
            ),
        ),
        ("catboost", KNeighborsClassifier()),#(n_estimators=250,verbose=False)),
        #("catboost", CatBoostRegressor(n_estimators=250,verbose=False)),
    ]
)

pipe.fit(X_train, y_train)


from sklearn.metrics import accuracy_score,r2_score


print("Baseline", accuracy_score(KNeighborsClassifier().fit(X_train, y_train).predict(X_test), y_test))
#print("Baseline", r2_score(LinearRegression.fit(X_train, y_train).predict(X_test), y_test))


print("PowerShap feature selection:", accuracy_score(pipe.predict(X_test), y_test))
#print("PowerShap feature selection:", r2_score(pipe.predict(X_test), y_test))



In [None]:
print("Baseline", accuracy_score(CatBoostClassifier(verbose=False,n_estimators=250).fit(X_train, y_train).predict(X_test), y_test))

In [None]:
processed_shaps_df = pipe[0]._processed_shaps_df

In [None]:
len(processed_shaps_df[processed_shaps_df.p_value<0.01])

# benchmarking

## Estimators = 50 Classification

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
import pprint
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=False
estimators = 50#250

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                if regression_bool:
                    selector = PowerSHAP(
                        model = CatBoostRegressor(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                else:
                    selector = PowerSHAP(
                        model = CatBoostClassifier(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
    
output_dict_to_df(output_dict).to_csv("estimators_50_Classification_output_df.csv",index=False)

## Estimators = 250 Classification

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=False
estimators = 250
hypercube = False

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, hypercube=hypercube, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                if regression_bool:
                    selector = PowerSHAP(
                        model = CatBoostRegressor(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                else:
                    selector = PowerSHAP(
                        model = CatBoostClassifier(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
    
        if hypercube:
            output_dict_to_df(output_dict).to_csv("estimators_250_Classification_output_df.csv",index=False)
        else:
            output_dict_to_df(output_dict).to_csv("estimators_250_Classification_output_df_polytone.csv",index=False)

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=False
estimators = 250
hypercube = False

output_dict = {}

for n_samples in [5000]:
    output_dict[str(n_samples)]={}
    for n_features in [50,100]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            for n_redundant in [10,25,33,50]:
                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
                
                n_redundant_samples = int((n_features-int(n_informative/100*n_features))*(n_redundant/100))
                
                print("Amount of samples = "+str(n_samples))
                print("Total used features = "+str(n_features))
                print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
                print("Redundant features: "+str(n_redundant_samples)+" ("+str(n_redundant)+"%)")
                print("")
                
                found_features = []
                found_idx_features = []
                times = []
                for random_seed in [0,1,2,3,4]:
                    print("Seed "+str(random_seed))

                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, hypercube=hypercube, n_informative=int(n_informative/100*n_features), n_redundant=n_redundant_samples, n_repeated = 0,shuffle=False,random_state=random_seed)
                    X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                    start_time = time.time()
                    if regression_bool:
                        selector = PowerSHAP(
                            model = CatBoostRegressor(verbose=0, n_estimators=estimators,use_best_model=True),
                            automatic=True
                        )
                    else:
                        selector = PowerSHAP(
                            model = CatBoostClassifier(verbose=0, n_estimators=estimators,use_best_model=True),
                            automatic=True
                        )
                    selector.fit(X, y)

                    times.append(time.time() - start_time)

                    processed_shaps_df = selector._processed_shaps_df
                    print(50*"-")

                    found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                    found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)

                found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
                found_redundant = [np.sum(np.isin(X.columns.values[int(n_informative/100*n_features):int(n_informative/100*n_features)+n_redundant_samples],f_list)) for f_list in found_idx_features]
                found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)+n_redundant_samples])) for f_list in found_idx_features]
                print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
                print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
                print("Found redundant: "+str(found_redundant) + "/"+str(n_redundant_samples))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
                print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
                print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")

                average_times.append(np.round(times,2))

                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features

                print(100*"=")

        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=False
estimators = 250
hypercube = False

output_dict = {}

for n_samples in [5000]:
    output_dict[str(n_samples)]={}
    for n_features in [50]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            for n_redundant in [10,25,33,50]:
                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
                
                n_redundant_samples = int((n_features-int(n_informative/100*n_features))*(n_redundant/100))
                
                print("Amount of samples = "+str(n_samples))
                print("Total used features = "+str(n_features))
                print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
                print("Redundant features: "+str(n_redundant_samples)+" ("+str(n_redundant)+"%)")
                print("")
                
                found_features = []
                found_idx_features = []
                times = []
                for random_seed in [0,1,2,3,4]:
                    print("Seed "+str(random_seed))

                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, hypercube=hypercube, n_informative=int(n_informative/100*n_features), n_redundant=n_redundant_samples, n_repeated = 0,shuffle=False,random_state=random_seed)
                    X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                    start_time = time.time()
                    # if classification is False it is a Regression problem
                    model = CatBoostClassifier(verbose=0, n_estimators=250)
                    selector = BorutaShap(model=model,importance_measure='shap',classification=True)
                    selector.fit(X, y,verbose=False)

                    times.append(time.time() - start_time)

                    subset = selector.Subset()
                    print(50*"-")

                    found_features.append(len(subset.columns))
                    found_idx_features.append(subset.columns)

                found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
                found_redundant = [np.sum(np.isin(X.columns.values[int(n_informative/100*n_features):int(n_informative/100*n_features)+n_redundant_samples],f_list)) for f_list in found_idx_features]
                found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)+n_redundant_samples])) for f_list in found_idx_features]
                print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
                print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
                print("Found redundant: "+str(found_redundant) + "/"+str(n_redundant_samples))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
                print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
                print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")

                average_times.append(np.round(times,2))

                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
                output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features

                print(100*"=")

        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")

## Estimators = 500 Classification

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

regression_bool=False
estimators = 500

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                if regression_bool:
                    selector = PowerSHAP(
                        model = CatBoostRegressor(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                else:
                    selector = PowerSHAP(
                        model = CatBoostClassifier(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
    
        output_dict_to_df(output_dict).to_csv("estimators_500_Classification_output_df.csv",index=False)

## Estimators = 50 Regression

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

regression_bool=True
estimators = 50

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                if regression_bool:
                    selector = PowerSHAP(
                        model = CatBoostRegressor(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                else:
                    selector = PowerSHAP(
                        model = CatBoostClassifier(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
        
output_dict_to_df(output_dict).to_csv("estimators_50_Regression_output_df.csv",index=False)

## Estimators = 250 regression

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=True
estimators = 250

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                if regression_bool:
                    selector = PowerSHAP(
                        model = CatBoostRegressor(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                else:
                    selector = PowerSHAP(
                        model = CatBoostClassifier(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
        
    
output_dict_to_df(output_dict).to_csv("estimators_250_Regression_output_df.csv",index=False)

## Estimators = 500 regression

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from catboost import CatBoostClassifier,CatBoostRegressor

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=True
estimators = 500

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                if regression_bool:
                    selector = PowerSHAP(
                        model = CatBoostRegressor(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                else:
                    selector = PowerSHAP(
                        model = CatBoostClassifier(verbose=0, n_estimators=estimators,use_best_model=True),
                        automatic=True
                    )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
        
    
output_dict_to_df(output_dict).to_csv("estimators_500_Regression_output_df.csv",index=False)

## Chi-squared

In [19]:
from sklearn.feature_selection import chi2,f_classif,f_regression

regression_bool=False
hypercube = True

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,50,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,hypercube=hypercube,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                
                X = X+np.abs(X.min())

                selected_features = list(X.columns.values[np.where(chi2(X,y)[1]<0.01)[0]])

                times.append(time.time() - start_time)

                print(50*"-")
                
                found_features.append(len(selected_features))
                found_idx_features.append(selected_features)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
    
        if hypercube:
            output_dict_to_df(output_dict).to_csv("chisquared_output_df.csv",index=False)
        else:
            output_dict_to_df(output_dict).to_csv("chi_squared_polytone_output_df.csv",index=False)

Amount of samples = 1000
Total used features = 20
Informative features: 2 (10%)

Seed 0
--------------------------------------------------
Seed 1
--------------------------------------------------
Seed 2
--------------------------------------------------
Seed 3
--------------------------------------------------
Seed 4
--------------------------------------------------
Average time: 0.0 seconds
Found features: [1, 1, 1, 1, 1]
Found 1.0 of 2 informative features
0.0 of 1.0 outputted powershap features are noise features
Amount of samples = 1000
Total used features = 20
Informative features: 6 (33%)

Seed 0
--------------------------------------------------
Seed 1
--------------------------------------------------
Seed 2
--------------------------------------------------
Seed 3
--------------------------------------------------
Seed 4
--------------------------------------------------
Average time: 0.0 seconds
Found features: [4, 3, 4, 6, 3]
Found 4.0 of 6 informative features
0.0 of 4.0 

## F test

In [18]:
from sklearn.feature_selection import f_classif,f_regression

regression_bool=False
hypercube = True

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,50,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,hypercube=hypercube,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()

                if regression_bool:
                    selected_features = list(X.columns.values[np.where(f_classif(X,y)[1]<0.01)[0]])
                else:
                    selected_features = list(X.columns.values[np.where(f_regression(X,y)[1]<0.01)[0]])

                times.append(time.time() - start_time)

                print(50*"-")
                
                found_features.append(len(selected_features))
                found_idx_features.append(selected_features)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
    
        if hypercube:
            output_dict_to_df(output_dict).to_csv("f_classif_output_df.csv",index=False)
        else:
            output_dict_to_df(output_dict).to_csv("f_classif_polytone_output_df.csv",index=False)

Amount of samples = 1000
Total used features = 20
Informative features: 2 (10%)

Seed 0
--------------------------------------------------
Seed 1
--------------------------------------------------
Seed 2
--------------------------------------------------
Seed 3
--------------------------------------------------
Seed 4
--------------------------------------------------
Average time: 0.0 seconds
Found features: [1, 1, 1, 1, 2]
Found 1.0 of 2 informative features
0.2 of 1.2 outputted powershap features are noise features
Amount of samples = 1000
Total used features = 20
Informative features: 6 (33%)

Seed 0
--------------------------------------------------
Seed 1
--------------------------------------------------
Seed 2
--------------------------------------------------
Seed 3
--------------------------------------------------
Seed 4
--------------------------------------------------
Average time: 0.0 seconds
Found features: [4, 3, 4, 6, 3]
Found 4.0 of 6 informative features
0.0 of 4.0 

## LogisticRegressionCV

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV,LinearRegression

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=False
hypercube = False

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,hypercube=hypercube,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()

                selector = PowerSHAP(
                    model = LogisticRegressionCV(max_iter=1000),
                    automatic=True
                )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
    
        if hypercube:
            output_dict_to_df(output_dict).to_csv("logisticregressioncv_output_df.csv",index=False)
        else:
            output_dict_to_df(output_dict).to_csv("logisticregressioncv_output_df_polytone.csv",index=False)

## RandomForest

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from sklearn.ensemble import RandomForestClassifier

regression_bool=False

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()

                selector = PowerSHAP(
                    model = RandomForestClassifier(),
                    automatic=True
                )
                selector.fit(X, y)
                
                times.append(time.time() - start_time)
                
                processed_shaps_df = selector._processed_shaps_df
                print(50*"-")
                
                found_features.append(len(processed_shaps_df[processed_shaps_df.p_value<0.01]))
                found_idx_features.append(processed_shaps_df[processed_shaps_df.p_value<0.01].index.values)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
        print(100*"=")
    
output_dict_to_df(output_dict).to_csv("randomforest_output_df.csv",index=False)

In [None]:
#Stopped simulation at 5000 samples, 20000 samples takes on average 300s per model! so at least 3000s per seed. 
output_dict_to_df(output_dict).to_csv("randomforest_output_df.csv",index=False)

## BorutaShap

In [None]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
from BorutaShap import BorutaShap

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=False

output_dict = {}

for n_samples in [1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])

                start_time = time.time()
                
                # if classification is False it is a Regression problem
                model = CatBoostClassifier(verbose=0, n_estimators=250)
                selector = BorutaShap(model=model,importance_measure='shap',classification=True)

                selector.fit(X=X, y=y, verbose=False)
                subset = selector.Subset()
                
                times.append(time.time() - start_time)
                print(50*"-")
                
                found_features.append(len(subset.columns))
                found_idx_features.append(subset.columns)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
            
        output_dict_to_df(output_dict).to_csv("250_est_Catboost_borutashap_output_df.csv",index=False)
        
        print(100*"=")

## Shapicant

In [7]:
from sklearn.datasets import make_classification,make_regression
from sklearn.model_selection import train_test_split
import sys
import time
#sys.path.append("../powershap")

from powershap import PowerSHAP
import shapicant

#n_features = 20 #20,50,100,250,500
#n_informative = int(0.10*n_features) #10%,33%,50%,90%
#n_samples = 1000#5000

regression_bool=False

output_dict = {}

for n_samples in [5000]:#1000,5000,20000]:
    output_dict[str(n_samples)]={}
    for n_features in [20,100,250,500]:#[20,50,100,250,500]:
        output_dict[str(n_samples)][str(n_features)]={}
        
        average_times = []
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        for n_informative in [10,33,50,90]:#[int(0.10*n_features),int(0.33*n_features),int(0.50*n_features),int(0.90*n_features)]:
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]={}
            print("Amount of samples = "+str(n_samples))
            print("Total used features = "+str(n_features))
            print("Informative features: "+str(int(n_informative/100*n_features))+" ("+str(n_informative)+"%)")
            print("")
            
            found_features = []
            found_idx_features = []
            times = []
            for random_seed in [0,1,2,3,4]:
                print("Seed "+str(random_seed))
                
                if regression_bool:
                    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=int(n_informative/100*n_features),random_state=random_seed,shuffle=False)
                else:
                    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=n_features, n_informative=int(n_informative/100*n_features), n_redundant=0, n_repeated = 0,shuffle=False,random_state=random_seed)
                X = pd.DataFrame(data=X, columns=[f"col_{i}" for i in range(n_features)])
                X["class"]=y
                X = X.reset_index()

                
                explainer_type = shap.TreeExplainer
                # if classification is False it is a Regression problem
                model = CatBoostClassifier(verbose=0, n_estimators=250,use_best_model=False)
                selector = shapicant.PandasSelector(model, explainer_type, random_state=42)

                train_idx,val_idx = train_test_split(X["index"].values,test_size=0.2,random_state = 0)

                X_train = X[X["index"].isin(train_idx)].copy(deep=True)[list(X.columns.values[1:-1])]
                X_val = X[X["index"].isin(val_idx)].copy(deep=True)[list(X.columns.values[1:-1])]
                Y_train =  X[X["index"].isin(train_idx)]["class"]

                # Run the feature selection
                # If we provide a validation set, SHAP values are computed on it, otherwise they are computed on the training set
                # We can also provide additional parameters to the underlying estimator's fit method through estimator_params
                
                start_time = time.time()
                
                selector.fit(X_train, Y_train, X_validation=X_val)

                subset = selector.get_features()
                p_values = selector.p_values_

                np.array(subset)

                times.append(time.time() - start_time)
                print(50*"-")
                
                found_features.append(len(subset))
                found_idx_features.append(subset)
                
            found_informative_features = [np.sum(np.isin(X.columns.values[:int(n_informative/100*n_features)],f_list)) for f_list in found_idx_features]
            found_noise_features = [np.sum(1-np.isin(f_list,X.columns.values[:int(n_informative/100*n_features)])) for f_list in found_idx_features]
            print("Average time: "+str(np.round(np.mean(times),2))+" seconds")
            print("Found features: "+str(found_features))#len(processed_shaps_df[processed_shaps_df.p_value<0.01])))
            print("Found "+str(np.mean(found_informative_features))+" of "+str(int(n_informative/100*n_features))+" informative features")
            print(str(np.mean(found_noise_features))+" of "+str(np.mean(found_features))+" outputted powershap features are noise features")
            
            average_times.append(np.round(times,2))
            
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["informative_features"]=int(n_informative/100*n_features)
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["found_informative_features"]=found_informative_features
            output_dict[str(n_samples)][str(n_features)][str(n_informative)+"%"]["outputted_noise_features"]=found_noise_features
            
            print(100*"=")
            
        output_dict[str(n_samples)][str(n_features)]["Average time"]=average_times
        benchmark_dict_print(output_dict)
            
        output_dict_to_df(output_dict).to_csv("shapicant_output_df.csv",index=False)
        
        print(100*"=")

Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

Amount of samples = 5000
Total used features = 20
Informative features: 2 (10%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [02:09<00:00,  1.30s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [02:12<00:00,  1.33s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [02:18<00:00,  1.38s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [02:06<00:00,  1.27s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [02:07<00:00,  1.28s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 131.1 seconds
Found features: [9, 6, 8, 5, 6]
Found 1.0 of 2 informative features
5.8 of 6.8 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 20
Informative features: 6 (33%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:59<00:00,  1.20s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.18s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 118.89 seconds
Found features: [6, 6, 6, 6, 8]
Found 5.0 of 6 informative features
1.4 of 6.4 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 20
Informative features: 10 (50%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.18s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 118.71 seconds
Found features: [10, 10, 11, 10, 10]
Found 9.0 of 10 informative features
1.2 of 10.2 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 20
Informative features: 18 (90%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:59<00:00,  1.20s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [02:01<00:00,  1.22s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [02:04<00:00,  1.24s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:58<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [01:59<00:00,  1.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 120.72 seconds
Found features: [18, 18, 18, 18, 18]
Found 17.0 of 18 informative features
1.0 of 18.0 outputted powershap features are noise features
5000: 
20: 
[129.96 132.72 138.2  126.62 128.01]s - [118.57 118.89 119.58 118.4  119.01]s - [118.78 118.95 118.63 118.46 118.72]s - [119.67 121.64 124.11 118.92 119.25]s | [1, 1, 1, 1, 1] ([8, 5, 7, 4, 5]) / 2 | [5, 5, 5, 5, 5] ([1, 1, 1, 1, 3]) / 6 | [9, 9, 9, 9, 9] ([1, 1, 2, 1, 1]) / 10 | [17, 17, 17, 17, 17] ([1, 1, 1, 1, 1]) / 18 | 
Amount of samples = 5000
Total used features = 100
Informative features: 10 (10%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [05:22<00:00,  3.23s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [05:18<00:00,  3.19s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [05:41<00:00,  3.41s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [05:29<00:00,  3.29s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [05:52<00:00,  3.52s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 332.93 seconds
Found features: [14, 10, 12, 11, 10]
Found 9.0 of 10 informative features
2.4 of 11.4 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 100
Informative features: 33 (33%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [05:56<00:00,  3.56s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:12<00:00,  3.72s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:05<00:00,  3.66s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:04<00:00,  3.65s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:02<00:00,  3.62s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 364.36 seconds
Found features: [32, 33, 32, 32, 33]
Found 31.4 of 33 informative features
1.0 of 32.4 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 100
Informative features: 50 (50%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:18<00:00,  3.78s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:15<00:00,  3.76s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:07<00:00,  3.67s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:06<00:00,  3.67s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:15<00:00,  3.76s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 372.76 seconds
Found features: [42, 40, 45, 45, 43]
Found 42.0 of 50 informative features
1.0 of 43.0 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 100
Informative features: 90 (90%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:13<00:00,  3.73s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:15<00:00,  3.75s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:12<00:00,  3.72s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [06:14<00:00,  3.74s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [05:41<00:00,  3.41s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 367.28 seconds
Found features: [57, 61, 46, 55, 62]
Found 55.2 of 90 informative features
1.0 of 56.2 outputted powershap features are noise features
5000: 
20: 
[129.96 132.72 138.2  126.62 128.01]s - [118.57 118.89 119.58 118.4  119.01]s - [118.78 118.95 118.63 118.46 118.72]s - [119.67 121.64 124.11 118.92 119.25]s | [1, 1, 1, 1, 1] ([8, 5, 7, 4, 5]) / 2 | [5, 5, 5, 5, 5] ([1, 1, 1, 1, 3]) / 6 | [9, 9, 9, 9, 9] ([1, 1, 2, 1, 1]) / 10 | [17, 17, 17, 17, 17] ([1, 1, 1, 1, 1]) / 18 | 
100: 
[322.99 318.8  341.21 329.51 352.14]s - [356.4  372.1  365.94 364.89 362.47]s - [378.24 375.55 367.15 366.87 376.  ]s - [373.45 375.08 372.11 374.3  341.48]s | [9, 9, 9, 9, 9] ([5, 1, 3, 2, 1]) / 10 | [31, 32, 31, 31, 32] ([1, 1, 1, 1, 1]) / 33 | [41, 39, 43, 45, 42] ([1, 1, 2, 0, 1]) / 50 | [56, 60, 45, 54, 61] ([1, 1, 1, 1, 1]) / 90 | 
Amount of samples = 5000
Total used features = 250
Informative features: 25 (10%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [11:54<00:00,  7.15s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [11:35<00:00,  6.95s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [11:54<00:00,  7.15s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [12:36<00:00,  7.57s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [13:32<00:00,  8.12s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 738.67 seconds
Found features: [28, 25, 27, 26, 25]
Found 24.0 of 25 informative features
2.2 of 26.2 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 250
Informative features: 82 (33%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [31:02<00:00, 18.63s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:59<00:00,  6.59s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:55<00:00,  6.56s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [11:09<00:00,  6.69s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:54<00:00,  6.55s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 900.46 seconds
Found features: [62, 63, 62, 68, 64]
Found 62.8 of 82 informative features
1.0 of 63.8 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 250
Informative features: 125 (50%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:56<00:00,  6.56s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [11:01<00:00,  6.61s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:53<00:00,  6.54s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:51<00:00,  6.51s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:48<00:00,  6.48s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 654.19 seconds
Found features: [75, 66, 74, 79, 76]
Found 73.2 of 125 informative features
0.8 of 74.0 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 250
Informative features: 225 (90%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:47<00:00,  6.48s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:48<00:00,  6.48s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:47<00:00,  6.48s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:47<00:00,  6.47s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [10:44<00:00,  6.44s/it]


--------------------------------------------------
Average time: 647.16 seconds
Found features: [86, 83, 77, 70, 81]
Found 78.6 of 225 informative features
0.8 of 79.4 outputted powershap features are noise features
5000: 
20: 
[129.96 132.72 138.2  126.62 128.01]s - [118.57 118.89 119.58 118.4  119.01]s - [118.78 118.95 118.63 118.46 118.72]s - [119.67 121.64 124.11 118.92 119.25]s | [1, 1, 1, 1, 1] ([8, 5, 7, 4, 5]) / 2 | [5, 5, 5, 5, 5] ([1, 1, 1, 1, 3]) / 6 | [9, 9, 9, 9, 9] ([1, 1, 2, 1, 1]) / 10 | [17, 17, 17, 17, 17] ([1, 1, 1, 1, 1]) / 18 | 
100: 
[322.99 318.8  341.21 329.51 352.14]s - [356.4  372.1  365.94 364.89 362.47]s - [378.24 375.55 367.15 366.87 376.  ]s - [373.45 375.08 372.11 374.3  341.48]s | [9, 9, 9, 9, 9] ([5, 1, 3, 2, 1]) / 10 | [31, 32, 31, 31, 32] ([1, 1, 1, 1, 1]) / 33 | [41, 39, 43, 45, 42] ([1, 1, 2, 0, 1]) / 50 | [56, 60, 45, 54, 61] ([1, 1, 1, 1, 1]) / 90 | 
250: 
[714.77 695.09 714.55 756.88 812.07]s - [1863.01  659.39  655.92  669.34  654.66]s - [656.42

Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [23:39<00:00, 14.20s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [25:21<00:00, 15.21s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [22:20<00:00, 13.40s/it]


--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [22:28<00:00, 13.48s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [21:21<00:00, 12.81s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 1382.18 seconds
Found features: [50, 49, 48, 49, 52]
Found 47.6 of 50 informative features
2.0 of 49.6 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 500
Informative features: 165 (33%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [22:00<00:00, 13.20s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [22:44<00:00, 13.65s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:33<00:00, 12.34s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:31<00:00, 12.31s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:31<00:00, 12.31s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 1276.27 seconds
Found features: [92, 88, 92, 100, 88]
Found 91.0 of 165 informative features
1.0 of 92.0 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 500
Informative features: 250 (50%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:27<00:00, 12.27s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:26<00:00, 12.26s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:30<00:00, 12.30s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:39<00:00, 12.40s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:28<00:00, 12.29s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Average time: 1230.61 seconds
Found features: [104, 101, 94, 88, 96]
Found 96.0 of 250 informative features
0.6 of 96.6 outputted powershap features are noise features
Amount of samples = 5000
Total used features = 500
Informative features: 450 (90%)

Seed 0


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:25<00:00, 12.26s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 1


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:26<00:00, 12.26s/it]
Computing true SHAP values:   0%|                                                              | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 2


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:30<00:00, 12.31s/it]


--------------------------------------------------
Seed 3


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:25<00:00, 12.26s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

--------------------------------------------------
Seed 4


Computing null SHAP values: 100%|████████████████████████████████████████████████████| 100/100 [20:25<00:00, 12.26s/it]

--------------------------------------------------
Average time: 1226.84 seconds
Found features: [93, 103, 93, 98, 103]
Found 97.8 of 450 informative features
0.2 of 98.0 outputted powershap features are noise features
5000: 
20: 
[129.96 132.72 138.2  126.62 128.01]s - [118.57 118.89 119.58 118.4  119.01]s - [118.78 118.95 118.63 118.46 118.72]s - [119.67 121.64 124.11 118.92 119.25]s | [1, 1, 1, 1, 1] ([8, 5, 7, 4, 5]) / 2 | [5, 5, 5, 5, 5] ([1, 1, 1, 1, 3]) / 6 | [9, 9, 9, 9, 9] ([1, 1, 2, 1, 1]) / 10 | [17, 17, 17, 17, 17] ([1, 1, 1, 1, 1]) / 18 | 
100: 
[322.99 318.8  341.21 329.51 352.14]s - [356.4  372.1  365.94 364.89 362.47]s - [378.24 375.55 367.15 366.87 376.  ]s - [373.45 375.08 372.11 374.3  341.48]s | [9, 9, 9, 9, 9] ([5, 1, 3, 2, 1]) / 10 | [31, 32, 31, 31, 32] ([1, 1, 1, 1, 1]) / 33 | [41, 39, 43, 45, 42] ([1, 1, 2, 0, 1]) / 50 | [56, 60, 45, 54, 61] ([1, 1, 1, 1, 1]) / 90 | 
250: 
[714.77 695.09 714.55 756.88 812.07]s - [1863.01  659.39  655.92  669.34  654.66]s - [656


