### Abstract
This notebook shows how to load a preprocessed model-zoo dataset, how to explore its weights and properties, and how to load a raw model zoo into the custom dataset class (`ModelDatasetBase`).

In [None]:
# set environment variables to limit cpu usage
# NOTE: these must be set BEFORE numpy / torch are imported. The native
# OpenMP / BLAS / MKL thread pools typically read these variables once, when
# the library is first loaded, so setting them after the imports has no effect.
import os

os.environ["OMP_NUM_THREADS"] = "4"  # export OMP_NUM_THREADS=4
os.environ["OPENBLAS_NUM_THREADS"] = "4"  # export OPENBLAS_NUM_THREADS=4
os.environ["MKL_NUM_THREADS"] = "6"  # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "4"  # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "6"  # export NUMEXPR_NUM_THREADS=6

# imports: standard library
import json
from pathlib import Path

# imports: third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import tqdm

# imports: project-local
from checkpoints_to_datasets.dataset_base import ModelDatasetBase

### Load preprocessed dataset

In [None]:
# Location of the preprocessed dataset file; replace the placeholder with the real path.
dspath = Path(
    'path/to/dataset_cifar_small_hyp_fix.pt')
# NOTE(review): torch.load unpickles Python objects, so only load files from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which presumably rejects
# pickled custom dataset objects -- confirm whether weights_only=False has to be passed here.
ds = torch.load(dspath)

### Explore Dataset

In [None]:
# List the splits stored in the loaded dataset container;
# the expected keys are "trainset", "valset" and "testset".
print(ds.keys())

In [None]:
# Each split exposes its model weights through dataset.__get_weights__();
# here the weights of the test split are fetched.
# NOTE(review): presumably one entry per model in the split -- TODO confirm the layout.
weights_test = ds['testset'].__get_weights__()

In [None]:
# Display the shape of the test-split weights (bare expression, rendered as the cell output).
weights_test.shape


In [None]:
# The model properties of a split are stored in its 'properties' dictionary;
# print the names of the available properties for the test split.
print(ds['testset'].properties.keys())

In [None]:
def load_dataset_from_path(path:Path,dstype:str,dssize:str="small",epoch_list:list=[5,15,25]):
    """
    Loads the train / val / test splits of a raw model zoo as custom dataset objects.

    input path: pathlib.Path (or str) to the raw model zoo.
    input dstype: str "Seed", "Hyp-fix" or "Hyp-rand" setting the dataset type.
        "Seed" maps no config keys; every other value maps the hyperparameter config keys.
    input dssize: str "small" or "large" depending on the CNN size in the zoo.
        Any value other than "large" selects the layer list of the small zoos.
    input epoch_list: list of integers, indicating the epochs of which to load the models
    return dataset: dict with "trainset", "valset", "testset", each a ModelDatasetBase
    """
    # compose properties to map for
    result_key_list = [
        "test_acc",
        "training_iteration",
        "ggap",
    ]
    if dstype == "Seed":
        # no hyperparameter config keys are mapped for "Seed" zoos
        config_key_list = []
    else:
        config_key_list = [
            "model::nlin",
            "model::init_type",
            "optim::optimizer",
            "model::dropout",
            "optim::lr",
            "optim::wd",
        ]
    property_keys = {
        "result_keys": result_key_list,
        "config_keys": config_key_list,
    }

    ## set layer list. Large model zoos require the first, small zoos the second version.
    if dssize == "large":
        layer_lst = [
            (0, "conv2d"),
            (4, "conv2d"),
            (8, "conv2d"),
            (13, "fc"),
            (16, "fc"),
        ]
    else:
        layer_lst = [
            (0, "conv2d"),
            (3, "conv2d"),
            (6, "conv2d"),
            (9, "fc"),
            (11, "fc"),
        ]

    # Shut down any ray instance that is still running before the datasets are built.
    # NOTE(review): presumably ModelDatasetBase starts its own ray workers -- confirm.
    import ray
    if ray.is_initialized():
        ray.shutdown()

    # set dataset path (the dataset class receives a list of zoo roots)
    path_zoo_root = [Path(path).absolute()]

    # Use the `epoch_list` argument (not a notebook-level global) and copy it, so that
    # neither the shared default list nor the caller's list can be mutated downstream.
    epochs = list(epoch_list)

    def _build_split(split):
        """Builds the ModelDatasetBase for one split: "train", "val" or "test"."""
        return ModelDatasetBase(
            root=path_zoo_root,
            layer_lst=layer_lst,
            epoch_lst=epochs,
            mode="checkpoint",
            task="reconstruction",  # "reconstruction" (x->x), "sequence_prediction" (x^i -> x^i+1),
            use_bias=True,
            train_val_test=split,  # determines which dataset split to use
            ds_split=[0.7, 0.15, 0.15],  # fractions passed for the train / val / test split
            max_samples=None,
            weight_threshold=5,
            filter_function=None,  # gets sample path as argument and returns True if model needs to be filtered out
            property_keys=property_keys,
            num_threads=6,
            verbosity=0,
            shuffle_path=True,
        )

    # load datasets in the order train, val, test and put them in a dictionary
    dataset = {}
    for split in ("train", "val", "test"):
        dataset[split + "set"] = _build_split(split)

    return dataset

In [None]:
# Root directory of the raw model zoo; replace the placeholder with the real path.
dspath_raw = Path(
    '/path/to/tune_zoo_mnist_uniform')
# dataset type: "Seed", "Hyp-fix" or "Hyp-rand"
dstype="Seed"
# CNN size of the models in the zoo: "small" or "large"
dssize="small"
# set list of epochs to load
epoch_lst = [5,15,50]
# alternative: load every epoch from 0 to 50
# epoch_lst = list(range(0,51))

# Build the "trainset", "valset" and "testset" datasets from the raw zoo.
ds_custom = load_dataset_from_path(path=dspath_raw,dstype=dstype,dssize=dssize,epoch_list=epoch_lst)