Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion configs/sklearn/performance/tsne.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,16 @@
"x": "data/mnist_x_test.npy",
"y": "data/mnist_y_test.npy"
}
}
},
{
"source": "npy",
"name": "cifar_10",
"training":
{
"x": "data/cifar_10_x_train.npy",
"y": "data/cifar_10_y_train.npy"
}
}
],
"workload-size": "medium"
}
Expand Down
3 changes: 2 additions & 1 deletion datasets/load_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
fraud, gisette, hepmass_150K,
higgs, higgs_one_m, higgs_150K, ijcnn, klaverjas,
santander, skin_segmentation, susy)
from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
from .loader_multiclass import (cifar_10, connect, covertype, covtype, letters, mlsr,
mnist, msrank, plasticc, sensit)
from .loader_regression import (abalone, california_housing, fried, higgs_10500K,
medical_charges_nominal, mortgage_first_q,
Expand All @@ -47,6 +47,7 @@
"census": census,
"cifar_binary": cifar_binary,
"cifar_cluster": cifar_cluster,
"cifar_10": cifar_10,
"codrnanorm": codrnanorm,
"connect": connect,
"covertype": covertype,
Expand Down
36 changes: 36 additions & 0 deletions datasets/loader_multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,42 @@
from .loader_utils import count_lines, read_libsvm_msrank, retrieve


def cifar_10(dataset_dir: Path) -> bool:
    """
    Fetch CIFAR-10 from OpenML and store a train/test split as .npy files.

    Source:
    University of Toronto
    Collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton
    https://www.cs.toronto.edu/~kriz/cifar.html

    Classification task. n_classes = 10
    cifar_10 x train dataset (54000, 3072)
    cifar_10 y train dataset (54000, 1)
    cifar_10 x test  dataset (6000,  3072)
    cifar_10 y test  dataset (6000,  1)

    The data is requested from OpenML (data_id=40927) with ``dataset_dir``
    as the download cache, split 90/10 with a fixed random state, and each
    part is written to ``dataset_dir`` as ``cifar_10_<part>.npy``.

    Returns True once all four files have been written.
    """
    dataset_name = 'cifar_10'
    os.makedirs(dataset_dir, exist_ok=True)

    features, labels = fetch_openml(data_id=40927, return_X_y=True,
                                    as_frame=False, data_home=dataset_dir)

    # Wrap both parts in DataFrames; the labels are additionally cast to int.
    features = pd.DataFrame(features)
    labels = pd.DataFrame(labels).astype(int)

    logging.info(f'{dataset_name} is loaded, started parsing...')

    # train_test_split yields (x_train, x_test, y_train, y_test) in this order.
    parts = train_test_split(features, labels, test_size=0.1, random_state=42)
    part_names = ('x_train', 'x_test', 'y_train', 'y_test')
    for part_name, part in zip(part_names, parts):
        target = os.path.join(dataset_dir, f'{dataset_name}_{part_name}.npy')
        np.save(target, part)
    logging.info(f'dataset {dataset_name} is ready.')
    return True


def connect(dataset_dir: Path) -> bool:
"""
Source:
Expand Down