In [1]:
import os
import pickle
import pandas as pd
import random
from datasets import load_from_disk, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed Python's global `random` generator so the sampling done later in
# this notebook is repeatable from a fresh kernel.
SEED = 42
random.seed(SEED)

In [3]:
# Fetch CIFAR-10 via `datasets`, then store a local copy under ../Dataset
# so later cells can reload the splits from disk.
dataset = load_dataset(path='cifar10')

os.makedirs(name="../Dataset", exist_ok=True)
dataset.save_to_disk("../Dataset/CIFAR10")

Saving the dataset (1/1 shards): 100%|██████████| 50000/50000 [00:00<00:00, 124790.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 125723.64 examples/s]


In [4]:
# Reload the train and test splits from the on-disk copy written above.
train_dataset = load_from_disk("../Dataset/CIFAR10/train")
test_dataset = load_from_disk("../Dataset/CIFAR10/test")


In [5]:
# CIFAR2: the two-class subset made of the rows labelled 3 or 5.
# NOTE(review): in the standard CIFAR-10 class order these are "cat" and
# "dog" -- confirm against the dataset's label names.
# The frames keep their original integer index, so each surviving index value
# is the row position of that example in the full split.
df_train_cifar2 = pd.DataFrame()
df_train_cifar2['label'] = train_dataset['label']
train_is_cifar2 = df_train_cifar2['label'].isin([3, 5])
df_train_cifar2 = df_train_cifar2[train_is_cifar2]

df_test_cifar2 = pd.DataFrame()
df_test_cifar2['label'] = test_dataset['label']
test_is_cifar2 = df_test_cifar2['label'].isin([3, 5])
df_test_cifar2 = df_test_cifar2[test_is_cifar2]

In [6]:
# Persist the CIFAR2 row positions (the surviving DataFrame index values)
# as pickled Python lists, one file per split.
cifar2_indices_dir = "./data/cifar2"
os.makedirs(cifar2_indices_dir, exist_ok=True)

train_index_cifar2 = os.path.join(cifar2_indices_dir, "idx-train.pkl")
test_index_cifar2 = os.path.join(cifar2_indices_dir, "idx-test.pkl")

cifar2_train_positions = df_train_cifar2.index.to_list()
with open(train_index_cifar2, 'wb') as train_file:
    pickle.dump(cifar2_train_positions, train_file)

cifar2_test_positions = df_test_cifar2.index.to_list()
with open(test_index_cifar2, 'wb') as test_file:
    pickle.dump(cifar2_test_positions, test_file)

In [7]:
def generate_subsets(indices, num_subsets, subset_size, output_dir, rng=None):
    """Sample random index subsets and pickle each one to its own file.

    For every ``i`` in ``range(num_subsets)``, ``subset_size`` elements are
    drawn from ``indices`` without replacement and written to
    ``<output_dir>/sub-idx-<i>.pkl``.

    Args:
        indices: Sequence of candidate indices to sample from.
        num_subsets: Number of subset files to write.
        subset_size: Number of elements drawn for each subset.
        output_dir: Directory receiving the pickle files; created if missing.
        rng: Optional ``random.Random`` instance used for sampling. When
            omitted, the module-level ``random`` generator is used, so a
            prior ``random.seed(...)`` call governs the result and the draws
            depend on how much of that global stream was already consumed.

    Raises:
        ValueError: Propagated from ``sample`` when ``subset_size`` is
            negative or larger than ``len(indices)``.
    """
    # Fall back to the global generator to keep the original behaviour.
    sampler = random if rng is None else rng

    # An empty output_dir means "current directory"; nothing to create then.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i in range(num_subsets):
        subset_indices = sampler.sample(indices, subset_size)  # random subset, no replacement
        output_path = os.path.join(output_dir, f"sub-idx-{i}.pkl")
        with open(output_path, 'wb') as handle:
            pickle.dump(subset_indices, handle)

In [8]:
# Write 256 random subsets of the CIFAR2 training indices, each half the
# size of the full index list, into ./data/cifar2/lds_val.
# NOTE(review): sampling uses the global `random` state, so the files depend
# on the seed set earlier and on any draws made before this cell runs.
num_subsets = 256
output_dir = "./data/cifar2/lds_val"
os.makedirs(output_dir, exist_ok=True)

with open("./data/cifar2/idx-train.pkl", 'rb') as index_file:
    train_indices = pickle.load(index_file)

train_subset_size = len(train_indices) // 2

generate_subsets(
    indices=train_indices,
    num_subsets=num_subsets,
    subset_size=train_subset_size,
    output_dir=output_dir,
)

In [9]:
# Sanity check: show the first ten indices of the first two subset files.
with open('./data/cifar2/lds_val/sub-idx-0.pkl', 'rb') as first_file:
    sub_0 = pickle.load(first_file)
print(sub_0[:10])

with open('./data/cifar2/lds_val/sub-idx-1.pkl', 'rb') as second_file:
    sub_1 = pickle.load(second_file)
print(sub_1[:10])


[9131, 2134, 22322, 19946, 18142, 11459, 8403, 44542, 7175, 48259]
[38011, 12722, 3337, 12630, 26438, 48836, 4207, 24627, 39957, 46807]


In [10]:
# CIFAR10: label frames for the full train and test splits (no filtering),
# so the index of each frame covers every row of its split.
df_train_cifar10 = pd.DataFrame()
df_test_cifar10 = pd.DataFrame()

df_train_cifar10['label'] = train_dataset['label']
df_test_cifar10['label'] = test_dataset['label']

In [11]:
# Number of rows in the full CIFAR10 training frame.
print(df_train_cifar10.shape[0])

50000


In [12]:
# Persist the CIFAR10 row positions (the DataFrame index values) as pickled
# Python lists, one file per split.
cifar10_indices_dir = "./data/cifar10"
os.makedirs(cifar10_indices_dir, exist_ok=True)

train_index_cifar10 = os.path.join(cifar10_indices_dir, "idx-train.pkl")
with open(train_index_cifar10, 'wb') as handle:
    pickle.dump(df_train_cifar10.index.to_list(), handle)

# Derive the test path from the same directory variable as the train path,
# so both files always land in `cifar10_indices_dir`.
test_index_cifar10 = os.path.join(cifar10_indices_dir, "idx-test.pkl")
with open(test_index_cifar10, 'wb') as handle:
    pickle.dump(df_test_cifar10.index.to_list(), handle)

In [13]:
# Write 256 random subsets of the CIFAR10 training indices, each half the
# size of the full index list, into ./data/cifar10/lds_val.
# NOTE(review): sampling continues from the global `random` state, so the
# files depend on every draw made before this cell runs.
num_subsets = 256
output_dir = "./data/cifar10/lds_val"
os.makedirs(output_dir, exist_ok=True)

with open("./data/cifar10/idx-train.pkl", 'rb') as index_file:
    train_indices = pickle.load(index_file)

train_subset_size = len(train_indices) // 2

generate_subsets(
    indices=train_indices,
    num_subsets=num_subsets,
    subset_size=train_subset_size,
    output_dir=output_dir,
)