In [1]:
from allib.datasets import load_uci, AVAIL_DATASETS
from allib.metrics import distance
from allib.utils import ensure_path
from allib.models.al import get_al_strategy
from sklearn.metrics.pairwise import pairwise_distances, check_pairwise_arrays
from allib.metrics import get_metrics
from allib.plots import PLMetric
from allib.models import get_pipeline, AVAIL_MODELS
import numpy as np
import os
import pickle

In [2]:
# Root directory for the distance cache files. The experiment cell builds
# "<DIST_CACHE>/<dataset>/<metric>_ordinal.npy" paths beneath it
# (presumably precomputed pairwise-distance arrays -- TODO confirm).
DIST_CACHE = "./dist_cache"

In [8]:
# Run the "gsx" active-learning strategy on iris once per available distance
# metric and pickle each run's stats under ppl_cache/, skipping metrics whose
# result file already exists.
iris = load_uci('iris')
iris.with_preprocess(
    steps=["sample_n", "continuous_to_categorical", "remove_constant_columns"],
    params_list=[{"n": 1000, "random_state": 0}, {"encode": "ordinal"}, {}],
    in_place=True,
)
dsn = "iris"
model_name = "catboost"
gsx = get_al_strategy("gsx")

# Create the results directory up front so a multi-minute run cannot fail at
# the final open() just because the folder is missing.
os.makedirs("ppl_cache", exist_ok=True)

for metric in distance.AVAIL_DIST_METRICS:
    fn = f"{DIST_CACHE}/{dsn}/{metric}_ordinal.npy"
    cache_name = f"{dsn.replace('/', '_')}@{model_name}@gsx_{metric}@x20.pkl"
    result_path = os.path.join("ppl_cache", cache_name)
    if ensure_path(result_path, False):
        print(f"exp {cache_name} already exists, continue")
        continue

    # Report exactly one of the two states; previously "Using cache" was
    # printed even right after "Computing" on a cache miss.
    if not ensure_path(fn, False):
        print(f"Computing {metric} for {dsn}")
    else:
        print(f"Using cache of {metric} for {dsn}")

    # Point the strategy at this metric's distance cache file
    # (presumably it loads/saves the distance array there -- TODO confirm).
    gsx.dist_cache_path = fn
    ds = iris.with_strategy(gsx, extra_params={"distance_metric": metric})
    make_ppl = get_pipeline(model_name)
    ppl = make_ppl(
        model=None,
        eval_metrics=get_metrics(["accuracy"]),
        seeds=list(range(20)),
        n_times=20,
        dataset=ds,
        cat_idx=ds.info["cat_idx"]
    )
    ppl.start()
    with open(result_path, "wb") as f:
        pickle.dump(ppl.stats, f)


exp iris@catboost@gsx_euclidean@x20.pkl already exists, continue
exp iris@catboost@gsx_cosine@x20.pkl already exists, continue
Using cache of overlap for iris


100%|██████████| 20/20 [02:02<00:00,  6.14s/it]


Using cache of eskin for iris


100%|██████████| 20/20 [02:07<00:00,  6.38s/it]


Using cache of iof for iris


100%|██████████| 20/20 [02:25<00:00,  7.27s/it]


Using cache of of for iris


100%|██████████| 20/20 [02:19<00:00,  6.97s/it]


Using cache of lin for iris


100%|██████████| 20/20 [02:20<00:00,  7.02s/it]


Using cache of lin1 for iris


100%|██████████| 20/20 [02:19<00:00,  6.97s/it]


Using cache of goodall1 for iris


100%|██████████| 20/20 [02:18<00:00,  6.90s/it]


Using cache of goodall2 for iris


100%|██████████| 20/20 [02:23<00:00,  7.17s/it]


Using cache of goodall3 for iris


100%|██████████| 20/20 [02:30<00:00,  7.55s/it]


Using cache of goodall4 for iris


100%|██████████| 20/20 [02:24<00:00,  7.21s/it]


Using cache of smirnov for iris


100%|██████████| 20/20 [02:20<00:00,  7.05s/it]


Using cache of gambaryan for iris


100%|██████████| 20/20 [02:16<00:00,  6.81s/it]


Using cache of burnaby for iris


100%|██████████| 20/20 [02:12<00:00,  6.60s/it]


Using cache of anderberg for iris


100%|██████████| 20/20 [02:14<00:00,  6.74s/it]


In [11]:
def plot(dataset_name: str, model_name: str):
    """Plot accuracy curves of the cached gsx runs for one dataset/model pair.

    For every metric name in ``distance.AVAIL_DIST_METRICS`` this looks for
    ``ppl_cache/<dataset>@<model>@gsx_<metric>@x20.pkl`` (with ``/`` in the
    dataset name replaced by ``_``), loads the pickled stats and collects the
    ``"accuracy"`` entry of every element. Metrics without a cache file are
    reported and skipped.

    Only the metrics that were actually loaded are passed on as series labels,
    so labels and data rows always line up. If nothing could be loaded, a
    message is printed and no plot is produced.

    Args:
        dataset_name: Dataset identifier used in the cache and plot file names.
        model_name: Model identifier used in the cache and plot file names.

    Returns:
        None.
    """
    metrics_n_times = []
    instances = []
    plotted_metrics = []  # labels for the rows of metrics_n_times, same order
    plot_name = f"{dataset_name}@{model_name}.png"
    for metric in distance.AVAIL_DIST_METRICS:
        cache_name = f"{dataset_name.replace('/', '_')}@{model_name}@gsx_{metric}@x20.pkl"
        cache_path = os.path.join("ppl_cache", cache_name)
        if not os.path.isfile(cache_path):
            print(f"exp {cache_name} does not exist, continue")
            continue
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # result files produced by your own runs.
        with open(cache_path, "rb") as f:
            stats = pickle.load(f)
        metrics_n_times.append([stats[i]["accuracy"] for i in range(len(stats))])
        # The x-axis is taken from the first entry of the most recently loaded
        # file (assumes all runs share the same instance counts -- TODO confirm).
        instances = stats[0]["instances"]
        plotted_metrics.append(metric)

    if not plotted_metrics:
        print(f"no cached experiments found for {dataset_name}@{model_name}, nothing to plot")
        return

    pl_metric = PLMetric()
    pl_metric.plot("Accuracy", instances, np.array(metrics_n_times), plotted_metrics, plot_name=plot_name)

In [12]:
# Draw the accuracy comparison for the iris / catboost result files.
plot(dataset_name="iris", model_name="catboost")