In [1]:
from allib.datasets import load_uci, AVAIL_DATASETS
from allib.metrics import distance
from allib.utils import ensure_path
from allib.models.al import get_al_strategy
from sklearn.metrics.pairwise import pairwise_distances, check_pairwise_arrays
from allib.metrics import get_metrics
from allib.plots import PLMetric
from allib.models import get_pipeline, AVAIL_MODELS
import numpy as np
import os
import pickle

In [2]:
DIST_CACHE = "../../examples/dist_cache"

In [3]:
for dsn in AVAIL_DATASETS:
    ds = load_uci(dsn)
    ds.with_preprocess(steps=["sample_n", "continuous_to_categorical", "remove_constant_columns"],  params_list=[{"n": 1000, "random_state": 0}, {"encode": "ordinal"}, {}], in_place=True)
    model_name = "catboost"
    gsx = get_al_strategy("gsx")
    for metric in distance.AVAIL_DIST_METRICS:
    # for metric in ["cosine", "euclidean"]:
        fn = f"{DIST_CACHE}/{dsn}/{metric}_ordinal.npy"
        cache_name = f"{dsn.replace('/', '_')}@{model_name}@gsx_{metric}@x20.pkl"
        if ensure_path(os.path.join("ppl_cache", cache_name), False):
            print(f"exp {cache_name} already exists, continue")
            continue
        if not ensure_path(fn, False):
            print(f"{metric} for {dsn} not found, skipping ...")
            continue
        print(f"Using cache of {metric} for {dsn}")
        setattr(gsx, "dist_cache_path", fn)
        ds = ds.with_strategy(gsx, extra_params={"distance_metric": metric})
        make_ppl = get_pipeline(model_name)
        ppl = make_ppl(
            model=None,
            eval_metrics=get_metrics(["accuracy"]),
            seeds=[i for i in range(20)],
            n_times=20,
            dataset=ds,
            cat_idx=ds.info["cat_idx"]
        )
        ppl.start()
        with open(os.path.join("ppl_cache", cache_name), "wb") as f:
            pickle.dump(ppl.stats, f)


exp iris@catboost@gsx_euclidean@x20.pkl already exists, continue
exp iris@catboost@gsx_cosine@x20.pkl already exists, continue
exp iris@catboost@gsx_overlap@x20.pkl already exists, continue
exp iris@catboost@gsx_eskin@x20.pkl already exists, continue
exp iris@catboost@gsx_iof@x20.pkl already exists, continue
exp iris@catboost@gsx_of@x20.pkl already exists, continue
exp iris@catboost@gsx_lin@x20.pkl already exists, continue
exp iris@catboost@gsx_lin1@x20.pkl already exists, continue
exp iris@catboost@gsx_goodall1@x20.pkl already exists, continue
exp iris@catboost@gsx_goodall2@x20.pkl already exists, continue
exp iris@catboost@gsx_goodall3@x20.pkl already exists, continue
exp iris@catboost@gsx_goodall4@x20.pkl already exists, continue
exp iris@catboost@gsx_smirnov@x20.pkl already exists, continue
exp iris@catboost@gsx_gambaryan@x20.pkl already exists, continue
exp iris@catboost@gsx_burnaby@x20.pkl already exists, continue
exp iris@catboost@gsx_anderberg@x20.pkl already exists, continue
e



In [12]:
import matplotlib.pyplot as plt  

def plot(dataset_name: str, model_name: str, strategies: list = None, plot_name: str = None):
    metrics_n_times = []
    instances = []
    strategies = strategies or list(distance.AVAIL_DIST_METRICS.keys())
    plot_name = plot_name if plot_name is not None else f"{dataset_name}@{model_name}.png"
    if ensure_path(os.path.join("./plots/pl_metric", plot_name), False):
        print(f"plot {plot_name} already exists, continue")
        return
    for metric in strategies:
        stats = None
        cache_name = f"{dataset_name.replace('/', '_')}@{model_name}@gsx_{metric}@x20.pkl"
        if not os.path.isfile(os.path.join("ppl_cache", cache_name)):
            print(f"exp {cache_name} does not exist, continue")
            continue
        with open(os.path.join("ppl_cache", cache_name), "rb") as f:
            stats = pickle.load(f)
        metrics_n_times.append([stats[i]["accuracy"] for i in range(len(stats))])
        instances = stats[0]["instances"]
    pl_metric = PLMetric()
    pl_metric.plot("Accuracy", instances, np.array(metrics_n_times), strategies, plot_name=plot_name, cmap=plt.get_cmap("Paired"), dpi=300)

In [4]:
# plot("adult", "catboost")
for dsn in AVAIL_DATASETS:
    plot(dsn, "catboost", strategies=distance.AVAIL_DIST_METRICS.keys())

exp adult@catboost@gsx_lin1@x20.pkl does not exist, continue
exp adult@catboost@gsx_goodall1@x20.pkl does not exist, continue
exp adult@catboost@gsx_goodall2@x20.pkl does not exist, continue
exp adult@catboost@gsx_goodall3@x20.pkl does not exist, continue
exp adult@catboost@gsx_goodall4@x20.pkl does not exist, continue
exp adult@catboost@gsx_smirnov@x20.pkl does not exist, continue
exp adult@catboost@gsx_gambaryan@x20.pkl does not exist, continue
exp adult@catboost@gsx_burnaby@x20.pkl does not exist, continue
exp adult@catboost@gsx_anderberg@x20.pkl does not exist, continue
exp letter-recognition@catboost@gsx_goodall1@x20.pkl does not exist, continue
exp letter-recognition@catboost@gsx_goodall2@x20.pkl does not exist, continue
exp letter-recognition@catboost@gsx_goodall3@x20.pkl does not exist, continue
exp letter-recognition@catboost@gsx_goodall4@x20.pkl does not exist, continue
exp letter-recognition@catboost@gsx_smirnov@x20.pkl does not exist, continue
exp letter-recognition@catboos

In [7]:
cache_name = f"balance-scale@catboost@gsx_@x20.pkl"
with open(os.path.join("ppl_cache", cache_name), "rb") as f:
    stats = pickle.load(f)
    

FileNotFoundError: [Errno 2] No such file or directory: 'ppl_cache\\balance-scale@catboost@gsx_ordinal@x20.pkl'

In [14]:
for m in list(distance.AVAIL_DIST_METRICS.keys()):
    plot("balance-scale", "catboost", [m, ], plot_name=f"test-{m}.png")