diff --git a/configs/sklearn/performance/tsne.json b/configs/sklearn/performance/tsne.json
index 07909d49c..b5d705c76 100644
--- a/configs/sklearn/performance/tsne.json
+++ b/configs/sklearn/performance/tsne.json
@@ -23,7 +23,16 @@
                 "x": "data/mnist_x_test.npy",
                 "y": "data/mnist_y_test.npy"
             }
-        }
+        },
+        {
+            "source": "npy",
+            "name": "cifar_10",
+            "training":
+            {
+                "x": "data/cifar_10_x_train.npy",
+                "y": "data/cifar_10_y_train.npy"
+            }
+        }
     ],
     "workload-size": "medium"
 }
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index 9e8ad552a..a4e4f4f4c 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -27,7 +27,7 @@
                                     fraud, gisette, hepmass_150K, higgs,
                                     higgs_one_m, higgs_150K, ijcnn, klaverjas,
                                     santander, skin_segmentation, susy)
-from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
+from .loader_multiclass import (cifar_10, connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
 from .loader_regression import (abalone, california_housing, fried,
                                 higgs_10500K, medical_charges_nominal, mortgage_first_q,
@@ -47,6 +47,7 @@
     "census": census,
     "cifar_binary": cifar_binary,
     "cifar_cluster": cifar_cluster,
+    "cifar_10": cifar_10,
     "codrnanorm": codrnanorm,
     "connect": connect,
     "covertype": covertype,
diff --git a/datasets/loader_multiclass.py b/datasets/loader_multiclass.py
index 34033a714..874db9939 100644
--- a/datasets/loader_multiclass.py
+++ b/datasets/loader_multiclass.py
@@ -28,6 +28,42 @@
 from .loader_utils import count_lines, read_libsvm_msrank, retrieve
 
 
+def cifar_10(dataset_dir: Path) -> bool:
+    """
+    Source:
+    University of Toronto
+    Collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton
+    https://www.cs.toronto.edu/~kriz/cifar.html
+
+    Classification task. n_classes = 10
+    cifar_10 x train dataset (54000, 3072)
+    cifar_10 y train dataset (54000, 1)
+    cifar_10 x test dataset (6000, 3072)
+    cifar_10 y test dataset (6000, 1)
+
+    """
+    dataset_name = 'cifar_10'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    X, y = fetch_openml(data_id=40927, return_X_y=True,
+                        as_frame=False, data_home=dataset_dir)
+
+    X = pd.DataFrame(X)
+    y = pd.DataFrame(y)
+    y = y.astype(int)
+
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.1, random_state=42)
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def connect(dataset_dir: Path) -> bool:
     """
     Source:
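
Not part of the diff: a minimal smoke-test sketch for reviewers who want to exercise the new loader locally. It assumes it is run from the repository root, so that the datasets package is importable and the data/ directory matches the paths referenced in tsne.json above; the shape check mirrors the loader's docstring.

from pathlib import Path

import numpy as np

from datasets.loader_multiclass import cifar_10

# Download CIFAR-10 from OpenML (data_id=40927) and write the four .npy
# files that configs/sklearn/performance/tsne.json references into data/.
data_dir = Path('data')
assert cifar_10(data_dir)

# Shapes should match the docstring: a 90/10 train/test split of the
# 60000 CIFAR-10 images, each flattened to 32 * 32 * 3 = 3072 features.
x_train = np.load(data_dir / 'cifar_10_x_train.npy')
y_train = np.load(data_dir / 'cifar_10_y_train.npy')
print(x_train.shape, y_train.shape)  # expected: (54000, 3072) (54000, 1)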
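
Assuming the project's usual entry point is unchanged, the updated t-SNE case can then be run end to end with something like "python runner.py --configs configs/sklearn/performance/tsne.json" (exact runner flags may vary by branch). The cifar_10 entry registered in load_datasets.py also makes the dataset available to the common download path shared by the other benchmarks.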