In [22]:
import os
import random
import pickle
import subprocess
import pandas as pd
from sklearn.model_selection import train_test_split


In [9]:
# Fix the seed of Python's built-in `random` module so that any later use of it
# in this notebook is repeatable.
random.seed(a=42)

In [7]:
# Fetch the ArtBench-10 metadata CSV and the image-folder archive into
# `download_path`. Anything already on disk is reused, so re-running this cell
# (e.g. Restart Kernel -> Run All) does not download the dataset again.
download_path = "../Dataset/ArtBench10"
os.makedirs(download_path, exist_ok=True)


def _fetch(url, destination):
    """Download `url` to `destination` with wget unless `destination` exists.

    The download is written to a temporary ``.part`` file that is renamed to
    `destination` only after wget exits successfully, so an interrupted run
    never leaves a truncated file that a later run would treat as complete.

    Raises:
        subprocess.CalledProcessError: if wget exits with a non-zero status.
    """
    if os.path.exists(destination):
        return
    partial_path = destination + ".part"
    subprocess.run(["wget", "-O", partial_path, url], check=True)
    os.replace(partial_path, destination)


csv_url = "https://artbench.eecs.berkeley.edu/files/ArtBench-10.csv"
csv_download_path = os.path.join(download_path, "ArtBench-10.csv")
_fetch(csv_url, csv_download_path)

dataset_url = "https://artbench.eecs.berkeley.edu/files/artbench-10-imagefolder.tar"
dataset_download_path = os.path.join(download_path, "artbench-10-imagefolder.tar")
# Folder the image paths built later in this notebook point into; assumed to be
# the top-level directory created by extracting the archive -- TODO confirm.
extracted_path = os.path.join(download_path, "artbench-10-imagefolder")
# The tarball is deleted only after a successful extraction, so a tarball that
# is still present means the previous extraction did not finish: extract again.
if not os.path.isdir(extracted_path) or os.path.exists(dataset_download_path):
    _fetch(dataset_url, dataset_download_path)
    subprocess.run(["tar", "-xf", dataset_download_path, "-C", download_path], check=True)
    os.remove(dataset_download_path)

print("Download and extraction completed successfully.")

--2025-02-21 19:36:32--  https://artbench.eecs.berkeley.edu/files/ArtBench-10.csv
Resolving artbench.eecs.berkeley.edu (artbench.eecs.berkeley.edu)... 128.32.37.248
Connecting to artbench.eecs.berkeley.edu (artbench.eecs.berkeley.edu)|128.32.37.248|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10959245 (10M) [text/csv]
Saving to: '../Dataset/ArtBench10/ArtBench-10.csv’

     0K .......... .......... .......... .......... ..........  0%  309K 35s
    50K .......... .......... .......... .......... ..........  0%  310K 34s
   100K .......... .......... .......... .......... ..........  1%  311K 34s
   150K .......... .......... .......... .......... ..........  1% 27.5M 26s
   200K .......... .......... .......... .......... ..........  2% 32.8M 20s
   250K .......... .......... .......... .......... ..........  2%  316K 22s
   300K .......... .......... .......... .......... ..........  3% 32.6M 19s
   350K .......... .......... .......... .......... ........

CompletedProcess(args=['wget', '-O', '../Dataset/ArtBench10/ArtBench-10.csv', 'https://artbench.eecs.berkeley.edu/files/ArtBench-10.csv'], returncode=0)

In [12]:
# Load the ArtBench-10 metadata table and add a `path` column that locates each
# image under the extracted image folder as <label>/<name>.
artbench2 = pd.read_csv('../Dataset/ArtBench10/ArtBench-10.csv')
image_path_template = "../Dataset/ArtBench10/artbench-10-imagefolder/{}/{}"
artbench2['path'] = [
    image_path_template.format(label, name)
    for label, name in zip(artbench2['label'], artbench2['name'])
]
artbench2.head()

Unnamed: 0,name,artist,url,is_public_domain,length,width,label,split,cifar_index,path
0,frank-omeara_towards-night-and-winter.jpg,frank-omeara,https://uploads5.wikiart.org/00316/images/fran...,True,800,657,impressionism,train,43186,../Dataset/ArtBench10/artbench-10-imagefolder/...
1,goldstein-grigoriy_morning.jpg,goldstein-grigoriy,https://uploads5.wikiart.org/images/grigoriy-g...,True,521,499,impressionism,train,41151,../Dataset/ArtBench10/artbench-10-imagefolder/...
2,georges-lemmen_man-reading.jpg,georges-lemmen,https://uploads6.wikiart.org/images/georges-le...,True,800,612,impressionism,train,9754,../Dataset/ArtBench10/artbench-10-imagefolder/...
3,theodor-aman_port-of-constantza-1882.jpg,theodor-aman,https://uploads6.wikiart.org/images/theodor-am...,True,560,336,impressionism,train,44244,../Dataset/ArtBench10/artbench-10-imagefolder/...
4,niccolo-cannicci_il-passo-della-futa-1914.jpg,niccolo-cannicci,https://uploads3.wikiart.org/images/niccolo-ca...,True,2400,2322,impressionism,train,46885,../Dataset/ArtBench10/artbench-10-imagefolder/...


In [20]:
# Restrict the official train split to the two styles of interest, then keep a
# label-stratified random half of those rows (the other half is discarded).
selected_styles = ['post_impressionism', 'ukiyo_e']
is_selected_style = artbench2['label'].isin(selected_styles)
is_train_row = artbench2['split'] == 'train'
train_pool = artbench2[is_selected_style & is_train_row]
train_artbench2, _ = train_test_split(
    train_pool,
    train_size=0.5,
    random_state=42,
    stratify=train_pool['label'],
)
# Show the per-label row counts of the retained half.
train_artbench2['label'].value_counts()

label
post_impressionism    2500
ukiyo_e               2500
Name: count, dtype: int64

In [17]:
# Same selection as for the train subset, applied to the official test split:
# keep the two styles, then retain a label-stratified random half of the rows.
test_pool = artbench2.loc[
    artbench2['label'].isin(['post_impressionism', 'ukiyo_e'])
    & artbench2['split'].eq('test')
]
test_artbench2, _ = train_test_split(
    test_pool,
    train_size=0.5,
    random_state=42,
    stratify=test_pool['label'],
)
# Show the per-label row counts of the retained half.
test_artbench2['label'].value_counts()


label
ukiyo_e               500
post_impressionism    500
Name: count, dtype: int64

In [23]:
# Persist the index labels of the train and test subsets as pickled Python lists.
artbench2_indices_path = "./data/"
os.makedirs(artbench2_indices_path, exist_ok=True)

# Output locations: one pickle file per subset.
train_index_artbench2 = os.path.join(artbench2_indices_path, "idx-train.pkl")
test_index_artbench2 = os.path.join(artbench2_indices_path, "idx-test.pkl")

# Write the train indices first, then the test indices.
for target_file, frame in (
    (train_index_artbench2, train_artbench2),
    (test_index_artbench2, test_artbench2),
):
    with open(target_file, 'wb') as handle:
        pickle.dump(frame.index.tolist(), handle)

In [26]:
# generate subsets for lds validation
# Each subset is a label-stratified random half of `train_artbench2`, drawn with
# its own seed (42 + k) and stored as a pickled list of the selected index labels.
lds_val_dir = './data/lds-val'
num_lds_subsets = 256
# The output directory is the same for every subset, so create it once up front
# rather than on every iteration.
os.makedirs(lds_val_dir, exist_ok=True)
for k in range(num_lds_subsets):
    lds_subset, _ = train_test_split(
        train_artbench2,
        train_size=0.5,
        random_state=42+k,
        stratify=train_artbench2['label'],
    )
    filename = os.path.join(lds_val_dir, 'sub-idx-{}.pkl'.format(k))
    with open(filename, 'wb') as handle:
        pickle.dump(lds_subset.index.to_list(), handle)

In [27]:
# Sanity check: read back the pickles written by this notebook and print the
# first ten entries of each.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_indices = _read_pickle("./data/idx-train.pkl")
print(train_indices[:10])

test_indices = _read_pickle("./data/idx-test.pkl")
print(test_indices[:10])

# First two LDS validation subsets.
sub_0 = _read_pickle('./data/lds-val/sub-idx-0.pkl')
print(sub_0[:10])

sub_1 = _read_pickle('./data/lds-val/sub-idx-1.pkl')
print(sub_1[:10])

[44104, 44132, 45049, 54897, 42530, 46662, 55330, 46539, 46774, 54646]
[58519, 45032, 42761, 58081, 58630, 58718, 58338, 46951, 43851, 46195]
[55204, 59300, 55906, 59989, 55565, 44927, 59614, 54934, 44561, 59909]
[43148, 43220, 56956, 56987, 42518, 56027, 59581, 56580, 58435, 54228]
