1. Download files from here: https://drive.google.com/drive/folders/1LR-ftaIeV6_KJvVz8q-xbodA-oXtJuvV?usp=sharing
2. Place features.csv and metrics.csv to the following path from project root: resources/tabzilla/raw
3. Run this notebook

In [1]:
from ms.handler.metadata_source import TabzillaSource
from ms.metadataset.metadata_formatter import TabzillaFormatter
from ms.metadataset.metadata_filter import TabzillaFilter
from ms.metadataset.target_builder import TargetPerfBuilder, TargetDiffBuilder
from ms.metadataset.metadata_preprocessor import ScalePreprocessor, CorrelationPreprocessor

In [16]:
md_source = TabzillaSource()
metric_name = "F1__test"

model_classes = {
    "rtdl_FTTransformer": "nn",
    "rtdl_MLP": "nn",
    "rtdl_ResNet": "nn",
    "LinearModel": "classic",
    "RandomForest": "classic",
    "XGBoost": "classic"
}

classes_names = ["nn", "classic"]

Formatter handles raw TabZilla files performing fold values aggregation and metrics formatting.

Formatted files will be saved here: resources/tabzilla/formatted

In [17]:
formatter = TabzillaFormatter(
        features_folder="raw",
        metrics_folder="raw",
        test_mode=False,
    )
formatted_features = formatter.handle_features(to_save=True).shape
formatted_metrics = formatter.handle_metrics(to_save=True).shape

print(formatted_features)
print(formatted_metrics)

(176, 1604)
(3246, 16)


Filter performs removal of unsuitable features

Filtered files will be saved here: resources/tabzilla/filtered

In [18]:
md_filter = TabzillaFilter(
    features_folder="formatted",
    metrics_folder="formatted",
    funcs_to_exclude=[
        "count",
        "histogram",
        "iq_range",
        "median",
        "quantiles",
        "range",
    ],
    models_list=["XGBoost", "RandomForest", "LinearModel",
                     "rtdl_ResNet", "rtdl_FTTransformer", "rtdl_MLP"],
    test_mode=False,
    value_threshold=1e6,
)

filtered_features = md_filter.handle_features(to_save=True).shape
filtered_metrics = md_filter.handle_metrics(to_save=True).shape

print(filtered_features)
print(filtered_metrics)

(165, 217)
(852, 18)


Target builder creates target with specific strategy (rank of absolute or relative performance, difference between best performing models)

Targets will be saved here: resources/tabzilla/target

In [19]:
abs_perf_builder = TargetPerfBuilder(
    md_source=md_source,
    features_folder="filtered",
    metrics_folder="filtered",
    metric_name=metric_name,
    perf_type="abs",
    n_bins=2,
    strategy="quantile",
    test_mode=False,
)

rel_perf_builder = TargetPerfBuilder(
    md_source=md_source,
    features_folder="filtered",
    metrics_folder="filtered",
    metric_name=metric_name,
    perf_type="rel",
    n_bins=3,
    strategy="uniform",
    test_mode=False,
)

diff_builder = TargetDiffBuilder(
    classes=classes_names,
    model_classes=model_classes,
    md_source=md_source,
    features_folder="filtered",
    metrics_folder="filtered",
    metric_name=metric_name,
    n_bins=3,
    strategy="uniform",
    test_mode=False,
)

abs_perf = abs_perf_builder.handle_metrics().shape
rel_perf = rel_perf_builder.handle_metrics().shape
diff = diff_builder.handle_metrics().shape

print(abs_perf)
print(rel_perf)
print(diff)

(142, 6)
(142, 6)
(142, 1)


 [1.]
 [1.]
 [0.]
 [2.]
 [2.]
 [1.]
 [2.]
 [2.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [2.]
 [1.]
 [2.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [2.]
 [1.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [2.]
 [1.]
 [0.]
 [0.]
 [1.]
 [2.]
 [1.]
 [2.]
 [2.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [2.]
 [2.]
 [1.]
 [1.]
 [0.]
 [2.]
 [1.]
 [1.]
 [2.]
 [2.]
 [2.]
 [0.]
 [2.]
 [2.]
 [2.]
 [2.]
 [0.]
 [1.]
 [1.]
 [2.]
 [1.]
 [1.]
 [0.]
 [0.]
 [2.]
 [1.]
 [0.]
 [2.]
 [1.]
 [1.]
 [2.]
 [2.]
 [0.]
 [2.]
 [2.]
 [1.]
 [1.]
 [0.]
 [2.]
 [2.]
 [0.]
 [0.]
 [1.]
 [2.]
 [0.]
 [2.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [2.]
 [1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [2.]
 [1.]
 [0.]
 [1.]
 [0.]
 [2.]
 [0.]
 [2.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [2.]
 [0.]
 [0.]
 [1.]
 [2.]
 [2.]]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  diff_df.iloc[:, 0] = disc.fit_transform(X=diff_df)


Preproccesor performs data scaling with specific target. You can choose target type by passing suffix argument into preprocess method (suffix should correspond to one of the files in target folder)

Preprocessed data will be saved here: resources/tabzilla/preprocessed

In [20]:
scaler = ScalePreprocessor(
    md_source=md_source,
    features_folder="filtered",
    metrics_folder="target",
    to_scale=["power"],
    perf_type="abs",
    remove_outliers=False,
    test_mode=False,
)
scaled_features, scaled_metrics = scaler.preprocess(
    feature_suffix=None,
    metrics_suffix="perf_abs"
)
scaler.preprocess(
    feature_suffix=None,
    metrics_suffix="perf_rel"
)
scaler.preprocess(
    feature_suffix=None,
    metrics_suffix="diff"
)
print(scaled_features.shape)
print(scaled_metrics.shape)

(134, 217)
(134, 6)


In [21]:
corr_filter = CorrelationPreprocessor(
    md_source=md_source,
    features_folder="preprocessed",
    metrics_folder="preprocessed",
    corr_method="spearman",
    corr_value_threshold=0.9,
    vif_value_threshold=20000,
    vif_count_threshold=None,
    test_mode=False,
)

corr_features, corr_metrics = corr_filter.preprocess(
    feature_suffix="power",
    metrics_suffix="perf_abs"
)
corr_filter.preprocess(
    feature_suffix="power",
    metrics_suffix="perf_rel"
)
corr_filter.preprocess(
    feature_suffix="power",
    metrics_suffix="diff"
)
print(corr_features.shape)
print(corr_metrics.shape)

(134, 123)
(134, 6)


In [1]:
from ms.metadataset.features_sampler import FeaturesSampler

f_sampler = FeaturesSampler(
    md_source=md_source,
    features_folder="preprocessed",
    metrics_folder="preprocessed",
    test_mode=False
)
f_sampler.sample_features(feature_suffixes=["power"])

NameError: name 'md_source' is not defined