In [2]:
from os.path import join as pjoin
from ms.utils.ms_utils import get_project_path
import os
import pandas as pd

In [3]:
# Resolve the input locations under <project>/resources/tabzilla.
project_path = get_project_path()  # resolved once and reused for both paths below
path_to_features = pjoin(project_path, "resources", "tabzilla", "preprocessed_test")

# List the directory a single time so the bare names and the full paths are
# guaranteed to stay index-aligned; sorting makes the order independent of the
# filesystem's (arbitrary) listing order.
features_files_names = sorted(os.listdir(path_to_features))
features_files = [pjoin(path_to_features, name) for name in features_files_names]

path_to_target = pjoin(project_path, "resources", "tabzilla")
target_folders = ["absquantile2", "absquantile3", "absuniform2", "absuniform3", "rel_xgboost_resnet", "target_diff"]

# One metrics.csv per target, stored in "<target>_test/" next to the features folder.
target_files = [
    pjoin(path_to_target, f"{folder}_test", "metrics.csv")
    for folder in target_folders
]

In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
# Import the class directly: the name `xgb` is used below for the XGBoost spec
# dict, so binding it to the module first would reuse one name for two
# different kinds of object.
from xgboost import XGBClassifier

# Each metamodel spec is a dict with three keys:
#   "name"   - short label for the metamodel,
#   "model"  - an unfitted estimator instance,
#   "params" - candidate hyperparameter values keyed by parameter name
#              (presumably fed to a grid/randomized search elsewhere - TODO confirm).

lr = {
    "name": "LogReg",
    "model": LogisticRegression(penalty='l2', solver="lbfgs", max_iter=100),
    "params": {
        # `None` (not the string "none") disables regularisation: the string
        # form was deprecated in scikit-learn 1.2 - the same release that
        # introduced the "newton-cholesky" solver used below - and removed in 1.4.
        "penalty": ["l2", None],
        "C": [0.01, 0.1, 1, 10],
        "solver": ["newton-cholesky", "lbfgs", "sag", "saga"],
    }
}

mlp = {
    "name": "MLP",
    "model": MLPClassifier(
        hidden_layer_sizes=(50,),
        activation="relu",
        solver="adam",
        alpha=0.0001,
        batch_size="auto",
        learning_rate="constant",
        learning_rate_init=0.001,
        max_iter=200,
    ),
    "params": {
        "hidden_layer_sizes": [(10,), (50,), (100,)],
        "activation": ["logistic", "tanh", "relu"],
        # The commented-out entries below are currently excluded from the grid.
        #"solver": ["lbfgs", "sgd", "adam"],
        "alpha": [0.0001, 0.001, 0.01],
        #"batch_size": ["auto", 100, 200, 300],
        "learning_rate": ["constant", "invscaling", "adaptive"],
        "learning_rate_init": [0.001, 0.01, 0.1],
        #"max_iter": [200, 400, 600],
    },
}

xgb = {
    "name": "XGB",
    "model": XGBClassifier(),
    "params": {
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [5, 10, 50, 100],
        "eval_metric": ["merror", "mlogloss"],
    }
}

# Every metamodel spec defined in this cell, in a fixed order.
metamodels = [lr, mlp, xgb]

In [16]:
# Build one single-row summary frame per (preprocessing, target, metamodel)
# combination. Each row is the column-wise mean of that combination's
# "<metamodel>_results.csv", indexed by a 3-level MultiIndex.
results_root = pjoin(get_project_path(), "resources", "tabzilla")  # loop-invariant

all_dfs = []
for features_file_name in features_files_names:
    # Strip the last 4 characters of the file name to get the results folder
    # name (assumes a dot plus 3-letter extension such as ".csv" - TODO confirm).
    feature_folder_name = features_file_name[:-4]
    # Skip the first 13 characters to get the short preprocessing label
    # (presumably a fixed-length file-name prefix - TODO confirm).
    preproc_name = feature_folder_name[13:]
    for target_folder_name in target_folders:
        for metamodel in metamodels:
            results_path = pjoin(
                results_root,
                f"{target_folder_name}_test",
                feature_folder_name,
                f"{metamodel['name']}_results.csv",
            )
            results = pd.read_csv(results_path, index_col=0)
            # Series of column means -> one-column frame -> transposed to one row.
            mean_df = pd.DataFrame(results.mean()).T
            mean_df.index = pd.MultiIndex.from_arrays(
                [[preproc_name], [target_folder_name], [metamodel['name']]],
                names=["preproc", "target", "metamodel"],
            )
            all_dfs.append(mean_df)

In [17]:
# Preview the first single-row summary frame. This relies on `all_dfs` still
# being the list of per-combination frames; once it has been rebound to the
# concatenated DataFrame, `all_dfs[0]` is a column lookup instead.
all_dfs[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fit_time_mean,score_time_mean,test_b_acc_mean,test_f1_mean,test_prec_mean,test_rec_mean,test_roc_mean,fit_time_std,score_time_std,test_b_acc_std,test_f1_std,test_prec_std,test_rec_std,test_roc_std
preproc,target,metamodel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
minmax,absquantile2,LogReg,0.017902,0.011845,0.439286,0.42688,0.43273,0.44,0.439286,0.002116,0.001482,0.107651,0.111921,0.117983,0.107643,0.107651


In [21]:
# Stack the per-combination rows into a single frame and drop the timing means.
# The cell is written to be safely re-runnable: `all_dfs` is rebound from a
# list to a DataFrame here, so a second run must not call pd.concat on a
# DataFrame (TypeError) nor fail on the already-dropped columns (KeyError).
# NOTE(review): "fit_time_std" / "score_time_std" are left in place, as in the
# displayed output - confirm whether they should be dropped as well.
if not isinstance(all_dfs, pd.DataFrame):
    all_dfs = pd.concat(all_dfs, axis=0)
all_dfs = all_dfs.drop(columns=["fit_time_mean", "score_time_mean"], errors="ignore")

In [26]:
# Display all (preproc, target, metamodel) rows ordered by "test_b_acc_mean",
# highest first. The sorted frame is only shown, not assigned back to `all_dfs`.
all_dfs.sort_values(by="test_b_acc_mean", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,test_b_acc_mean,test_f1_mean,test_prec_mean,test_rec_mean,test_roc_mean,fit_time_std,score_time_std,test_b_acc_std,test_f1_std,test_prec_std,test_rec_std,test_roc_std
preproc,target,metamodel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
standard,absquantile2,XGB,0.541667,0.528826,0.548504,0.541905,0.541667,0.000661,0.000661,0.108572,0.112157,0.124813,0.108027,0.108572
minmax,absquantile2,XGB,0.535863,0.519185,0.545787,0.535873,0.535863,0.062866,0.011318,0.130516,0.134994,0.151281,0.131582,0.130516
standard,absquantile2,LogReg,0.522917,0.511288,0.527518,0.521587,0.522917,0.000718,0.000724,0.120422,0.124596,0.134411,0.120154,0.120422
standard,absquantile2,MLP,0.522024,0.509914,0.523345,0.521032,0.522024,0.004182,0.000517,0.125623,0.129268,0.134498,0.125893,0.125623
outliers,absquantile2,XGB,0.520833,0.504433,0.527025,0.520238,0.520833,0.001853,0.001497,0.119629,0.123134,0.147453,0.120090,0.119629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
power,absquantile3,LogReg,0.312778,0.295897,0.306661,0.312381,,0.014529,0.002473,0.094160,0.094153,0.111062,0.093656,
outliers,target_diff,LogReg,0.303810,0.357320,0.318816,0.430000,0.427083,0.005290,0.000699,0.131794,0.082490,0.117928,0.092393,
power,absquantile3,MLP,0.298056,0.281746,0.295715,0.298571,,0.007013,0.000675,0.102489,0.104604,0.130054,0.102715,
standard,absquantile3,MLP,0.295833,0.290510,0.308677,0.296111,,0.003333,0.000528,0.125923,0.129115,0.144013,0.127325,
