In [None]:
import os
import pandas as pd
import numpy as np

# Total number of time series generated per benchmark dataset
# (validation and test benchmark combined).
NB_SERIES = 250
# Fraction used for the validation ("train") part, both when sampling
# instances and when dividing NB_SERIES between the two benchmarks.
TRAIN_SIZE = 0.2

In [None]:
def get_g_max(n_classes):
    """Return floor((n_classes + 1) / 3) as a plain Python int.

    The result is used as the upper bound ``g_max`` passed to the benchmark
    generator for a dataset with ``n_classes`` distinct labels.
    """
    one_third = (n_classes + 1) // 3
    return int(one_third)

In [None]:
# Datasets loaded with instances of differing lengths.
VAR_LENGTH_DATASETS = [
    "CharacterTrajectories",
    "SpokenArabicDigits",
    "JapaneseVowels",
]
# Datasets whose instances all share one length.
FIX_LENGTH_DATASETS = [
    "ArticularyWordRecognition",
    "ERing",
    "Plane",
    "Cricket",
    "Mallat",
    "UWaveGestureLibrary",
    "Symbols",
    "PenDigits",
    "Fungi",
    "NATOPS",
    "ECG5000",
]

In [None]:
import tsmd_evaluation.benchmark_generation as benchmark_generation

# Column layout of the per-dataset metadata table (column name -> intended type).
columns = dict(
    ds_name=str,
    nclasses=int,
    ndim=int,
    l_min=int,
    l_max=int,
    kappa_max=int,
)
# Start from an empty table with exactly these columns and no rows.
metadata = pd.DataFrame(data=columns, index=[])

from aeon.datasets import load_classification

# Root directory under which every generated benchmark is stored.
path_to_benchmark = os.path.join(".", "benchmark")
# Create the directory if needed; exist_ok avoids the check-then-create race
# and makes re-running the cell a no-op.
os.makedirs(path_to_benchmark, exist_ok=True)


def znormalize(ts):
    """Z-normalize a time series using its global mean and standard deviation.

    The statistics are computed over all elements of ``ts`` at once
    (``axis=None``), so every dimension is shifted and scaled by the same
    two numbers.

    Parameters
    ----------
    ts : numpy.ndarray
        The series to normalize.

    Returns
    -------
    numpy.ndarray
        ``(ts - mean) / std``. When the standard deviation is exactly zero
        (a constant series) the centered, all-zero series is returned instead
        of dividing 0 by 0 and producing NaNs.
    """
    centered = ts - np.mean(ts, axis=None)
    std = np.std(ts, axis=None)
    if std == 0:
        # Constant series: scaling is undefined, keep the centered zeros.
        return centered
    return centered / std

# Build a validation and a test benchmark for every dataset and record
# summary metadata about the validation instances.
for ds_name in FIX_LENGTH_DATASETS + VAR_LENGTH_DATASETS:
    # Re-seed for every dataset so each benchmark is reproducible on its own,
    # independent of which datasets were processed before it.
    np.random.seed(0)
    print(ds_name)

    # Load both original splits; instances keep their own lengths.
    X_train, y_train = load_classification(name=ds_name, split='train', load_equal_length=False)
    X_test, y_test = load_classification(name=ds_name, split='test', load_equal_length=False)

    df_train = benchmark_generation.convert_X_y_to_df(X_train, y_train)
    df_test = benchmark_generation.convert_X_y_to_df(X_test, y_test)

    # Combine, z-normalize, and resplit
    df = pd.concat((df_train, df_test)).reset_index(drop=True)
    df['ts'] = df['ts'].apply(znormalize)

    # Stratified split: draw TRAIN_SIZE of every class for validation. The
    # sampled rows keep their index in `df`, which is what identifies them
    # when removing them to form the test set. (Dropping by the index of the
    # already re-indexed validation frame would remove the first rows of `df`
    # instead and let the two splits overlap.)
    train_rows = df.groupby('label').sample(frac=TRAIN_SIZE)
    df_train = train_rows.sample(frac=1.0).reset_index(drop=True)
    df_test = df.drop(index=train_rows.index).sample(frac=1.0).reset_index(drop=True)

    # Generate tsmd benchmark
    n_classes = len(df['label'].unique())
    g_max = get_g_max(n_classes)

    # Divide the NB_SERIES generated series between validation and test.
    nb_train = int(TRAIN_SIZE * NB_SERIES)
    nb_test = NB_SERIES - nb_train

    benchmark_train = benchmark_generation.generate_tsmd_benchmark_dataset(df_train, nb_train, g_min=1, g_max=g_max)
    benchmark_test = benchmark_generation.generate_tsmd_benchmark_dataset(df_test, nb_test, g_min=1, g_max=g_max)

    # Store the benchmark
    path_to_benchmark_dataset = os.path.join(path_to_benchmark, ds_name.lower())
    os.makedirs(path_to_benchmark_dataset, exist_ok=True)

    benchmark_train.to_pickle(os.path.join(path_to_benchmark_dataset, 'validation.pkl'))
    benchmark_test.to_pickle(os.path.join(path_to_benchmark_dataset, 'test.pkl'))

    # Store metadata about the instances in the validation set
    # The second axis of the first instance is taken as the dimension count.
    d = df_train['ts'].iloc[0].shape[1]

    lengths = df_train['length'].to_numpy()
    l_min, l_max = np.min(lengths), np.max(lengths)

    # Keys must match the columns of `metadata`; 'kappa_max' holds g_max.
    new_row = {'ds_name': ds_name.lower(), 'nclasses': n_classes, 'ndim': d, 'l_min': l_min, 'l_max': l_max, 'kappa_max': g_max}
    metadata.loc[len(metadata)] = new_row

CharacterTrajectories


  df_train = df.groupby('label', group_keys=False).apply(lambda x: x.sample(frac=TRAIN_SIZE)).sample(frac=1.0).reset_index(drop=True)


2858
570
2288


In [None]:
# Renumber the rows 0..n-1 and show the collected per-dataset metadata
# (the bare last expression renders the table as the cell output).
metadata = metadata.reset_index(drop=True)
metadata

Unnamed: 0,ds_name,nclasses,ndim,l_min,l_max,kappa_max
0,charactertrajectories,20,3,63,180,7
