# Runtime performance: dataset generation

In [1]:
import os

import h5py as h5
import numpy as np
from sklearn.model_selection import train_test_split

# Training fraction handed to train_test_split for each generated stage
# (stage index == position in this list).
train_sizes = [0.9, 0.7, 0.5, 0.3, 0.1]

# One row per dataset, unpacked downstream as
#   [dname, dsize, n_stage, is_labeled, label_name]
# dname      -- dataset name; used in both the input and the output file name
# dsize      -- size tag in the input file name (presumably the sample count -- TODO confirm)
# n_stage    -- stage count; always 5 here, same as len(train_sizes)
# is_labeled -- True => the split is stratified on the label array
# label_name -- prefix of the label dataset keys inside the HDF5 groups
input_datasets = [
    ['sensor', 7213, 5, True, 'y'],
    ['letter', 6450, 5, True, 'y'],
    ['bison', 5000, 5, False, 'z'],
    ['fashionMNIST', 4200, 5, True, 'y'],
    ['merchant', 3897, 5, True, 'y'],
    ['raid', 1955, 5, True, 'y'],
    ['secom', 1567, 5, True, 'y'],
    ['coil20', 1440, 5, False, 'yaw'],
    ['cnae9', 1080, 5, True, 'y'],
    ['fmd', 997, 5, True, 'y'],
    ['isomapFace', 698, 5, False, 'posex'],
    ['dendritic', 576, 5, True, 'y'],
    ['headpose', 558, 5, False, 'posex'],
]

In [2]:
import sys
from utils import *

# Build one HDF5 file per dataset under datasets/runtime/. Each file holds the
# same samples re-split at every ratio in train_sizes:
#   group 'E' -> training part, group 'O' -> held-out part,
#   datasets  'X{idx}' (features) and '{label_name}{idx}' (labels) per stage idx.

# h5py does not create missing parent directories; make sure the target exists
# so a fresh checkout does not fail on the first write.
os.makedirs('datasets/runtime', exist_ok=True)

# n_stage is unpacked to match the row layout but is not used below; the number
# of stages written is len(train_sizes).
for dname, dsize, n_stage, is_labeled, label_name in input_datasets:

    # Read stage 0 of the source file and merge its 'E' and 'O' parts back into
    # a single pool that can be re-split at arbitrary ratios.
    with h5.File(f'datasets/truth/{dname}_{dsize}.h5', 'r') as f:
        X_train = np.array(f['E']['X0'])
        X_test = np.array(f['O']['X0'])
        label_train = np.array(f['E'][f'{label_name}0'])
        label_test = np.array(f['O'][f'{label_name}0'])

    X = np.concatenate([X_train, X_test])
    label = np.concatenate([label_train, label_test])

    # Split options do not depend on the stage, so build them once per dataset.
    # Labeled datasets get a stratified split; the others a plain shuffle split.
    # NOTE(review): the seeds differ (1 vs 0) exactly as before so regenerated
    # files stay identical -- confirm whether the difference is intentional.
    if is_labeled:
        split_kwargs = {'random_state': 1, 'stratify': label}
    else:
        split_kwargs = {'random_state': 0}

    with h5.File(f'datasets/runtime/{dname}.h5', 'w') as f:
        print(f"Generating dataset '{dname}' with {len(train_sizes)} stages")
        ggE = f.create_group('E')
        ggO = f.create_group('O')
        for idx, train_size in enumerate(train_sizes):
            XTrain, XTest, lTrain, lTest = train_test_split(
                X, label, train_size=train_size, **split_kwargs
            )
            ggE.create_dataset(f'{label_name}{idx}', data=lTrain)
            ggO.create_dataset(f'{label_name}{idx}', data=lTest)
            ggE.create_dataset(f'X{idx}', data=XTrain)
            ggO.create_dataset(f'X{idx}', data=XTest)

        print(f"  keys: {list(ggE.keys())}")

print("All Done.")



Generating dataset 'sensor' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'y0', 'y1', 'y2', 'y3', 'y4']
Generating dataset 'letter' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'y0', 'y1', 'y2', 'y3', 'y4']
Generating dataset 'bison' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'z0', 'z1', 'z2', 'z3', 'z4']
Generating dataset 'fashionMNIST' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'y0', 'y1', 'y2', 'y3', 'y4']
Generating dataset 'merchant' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'y0', 'y1', 'y2', 'y3', 'y4']
Generating dataset 'raid' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'y0', 'y1', 'y2', 'y3', 'y4']
Generating dataset 'secom' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'y0', 'y1', 'y2', 'y3', 'y4']
Generating dataset 'coil20' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'yaw0', 'yaw1', 'yaw2', 'yaw3', 'yaw4']
Generating dataset 'cnae9' with 5 stages
  keys: ['X0', 'X1', 'X2', 'X3', 'X4', 'y0', 'y1', 'y2', 