In [5]:
import os
from pathlib import Path

from datasets import load_dataset, DatasetDict
from PIL import Image
from PIL import UnidentifiedImageError
from tqdm import tqdm

# Paths
# os.path.expandvars leaves references to unset variables unchanged, so without
# this guard an unset/empty $USER would silently produce a literal
# "/scratch/$USER" (or a shared "/scratch/") directory tree.
if not os.environ.get("USER"):
    raise RuntimeError(
        "Environment variable USER is not set; cannot resolve the per-user scratch directory."
    )
scratch_dir = os.path.expandvars("/scratch/$USER")
hf_cache_dir = os.path.join(scratch_dir, "hf_cache")  # passed as cache_dir to load_dataset
sun397_dir = os.path.join(scratch_dir, "SUN397")  # root folder for the exported JPEG files

# Make sure directories exist
os.makedirs(hf_cache_dir, exist_ok=True)
os.makedirs(sun397_dir, exist_ok=True)


## Inspect dataset metadata (loads the full train/test splits)

In [2]:
# Load the complete train and test splits (regular, non-streaming mode),
# using hf_cache_dir as the Hugging Face cache location.
dataset_train, dataset_test = (
    load_dataset("tanganke/sun397", split=split, cache_dir=hf_cache_dir)
    for split in ("train", "test")
)




Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/18 [00:00<?, ?files/s]

data/train-00017-of-00018.parquet:   0%|          | 0.00/457M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/18 [00:00<?, ?files/s]

data/test-00000-of-00018.parquet:   0%|          | 0.00/450M [00:00<?, ?B/s]

data/test-00001-of-00018.parquet:   0%|          | 0.00/365M [00:00<?, ?B/s]

data/test-00002-of-00018.parquet:   0%|          | 0.00/353M [00:00<?, ?B/s]

data/test-00003-of-00018.parquet:   0%|          | 0.00/451M [00:00<?, ?B/s]

data/test-00004-of-00018.parquet:   0%|          | 0.00/467M [00:00<?, ?B/s]

data/test-00005-of-00018.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

data/test-00006-of-00018.parquet:   0%|          | 0.00/463M [00:00<?, ?B/s]

data/test-00007-of-00018.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

data/test-00008-of-00018.parquet:   0%|          | 0.00/365M [00:00<?, ?B/s]

data/test-00009-of-00018.parquet:   0%|          | 0.00/397M [00:00<?, ?B/s]

data/test-00010-of-00018.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

data/test-00011-of-00018.parquet:   0%|          | 0.00/360M [00:00<?, ?B/s]

data/test-00012-of-00018.parquet:   0%|          | 0.00/348M [00:00<?, ?B/s]

data/test-00013-of-00018.parquet:   0%|          | 0.00/388M [00:00<?, ?B/s]

data/test-00014-of-00018.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

data/test-00015-of-00018.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

data/test-00016-of-00018.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

data/test-00017-of-00018.parquet:   0%|          | 0.00/366M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19850 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/19850 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [3]:
# Show the summary (features and row count) of each split, one per line.
print(dataset_train, dataset_test, sep="\n")

Dataset({
    features: ['image', 'label'],
    num_rows: 19850
})
Dataset({
    features: ['image', 'label'],
    num_rows: 19850
})


# small data subset

## create a small subset 

In [10]:
from datasets import load_dataset

# Number of leading examples to take from each split.
n = 100

# Slice the first n rows of every split through the split expression.
dataset_train = load_dataset(
    path="tanganke/sun397",
    split=f"train[:{n}]",
    cache_dir=hf_cache_dir,
)
dataset_test = load_dataset(
    path="tanganke/sun397",
    split=f"test[:{n}]",
    cache_dir=hf_cache_dir,
)

# The "label" feature carries the list of class names; the integer label of an
# example is an index into that list.
label_names = dataset_train.features["label"].names
print(label_names[:10])  # sanity check: first 10 class names


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

['abbey', 'airplane cabin', 'airport terminal', 'alley', 'amphitheater', 'amusement arcade', 'amusement park', 'anechoic chamber', 'apartment building outdoor', 'apse indoor']


## Save the small subset to disk as JPEG files

In [13]:


def save_subset_images(dataset, split_name, target_dir, label_names):
    """Write each example of ``dataset`` to ``target_dir/split_name/<label>/<idx>.jpg``.

    Args:
        dataset: Iterable of examples, each indexable by "image" (an object with
            ``mode``, ``convert`` and ``save``) and "label" (index into
            ``label_names``).
        split_name: Name of the sub-folder created below ``target_dir``.
        target_dir: Root output directory.
        label_names: Sequence mapping a label index to its class name.

    Files that already exist are left untouched.
    """
    out_root = Path(target_dir) / split_name
    out_root.mkdir(parents=True, exist_ok=True)

    for position, record in enumerate(tqdm(dataset)):
        picture = record["image"]
        # Anything that is not plain RGB is converted before the JPEG write.
        picture = picture if picture.mode == "RGB" else picture.convert("RGB")

        # Class names may contain "/", which must not create nested folders.
        class_name = label_names[record["label"]].replace("/", "_")
        class_dir = out_root / class_name
        class_dir.mkdir(exist_ok=True)

        destination = class_dir / f"{position}.jpg"
        if destination.exists():
            continue
        picture.save(destination, format="JPEG")
# Export the small train and test subsets below sun397_dir.
save_subset_images(
    dataset=dataset_train,
    split_name="train",
    target_dir=sun397_dir,
    label_names=label_names,
)
save_subset_images(
    dataset=dataset_test,
    split_name="test",
    target_dir=sun397_dir,
    label_names=label_names,
)


100%|██████████| 100/100 [00:00<00:00, 170.82it/s]
100%|██████████| 100/100 [00:01<00:00, 90.74it/s]


# whole dataset

## Load full dataset (train and test)

In [2]:
# Load the complete train and test splits, in that order.
dataset_train, dataset_test = [
    load_dataset("tanganke/sun397", split=split, cache_dir=hf_cache_dir)
    for split in ("train", "test")
]

# Class names; an example's integer label indexes into this list.
label_names = dataset_train.features["label"].names
print(f"Loaded dataset: {len(dataset_train)} train + {len(dataset_test)} test samples")


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Loaded dataset: 19850 train + 19850 test samples


## Function to save images robustly

In [6]:
def save_sun397_images(dataset, split_name, target_dir, label_names):
    """Export every example of a split to ``target_dir/split_name/<label>/<idx>.jpg``.

    Args:
        dataset: Iterable of examples, each indexable by "image" (an object with
            ``mode``, ``convert`` and ``save``, as a PIL image has) and "label"
            (an index into ``label_names``).
        split_name: Sub-folder name below ``target_dir``; also shown in the
            progress bar.
        target_dir: Root output directory.
        label_names: Sequence mapping a label index to its class name.

    Behaviour:
        * Files that already exist are skipped, so an interrupted run can be
          restarted.
        * Each image is written to a temporary file and then renamed into
          place. A failed or interrupted write therefore never leaves a
          truncated ``<idx>.jpg`` behind that a restart would mistake for a
          finished image.
        * ``OSError`` / ``UnidentifiedImageError`` raised while converting or
          saving are reported and the image is skipped.
    """
    split_dir = Path(target_dir) / split_name
    split_dir.mkdir(parents=True, exist_ok=True)

    for idx, item in enumerate(tqdm(dataset, desc=f"Saving {split_name}")):
        img = item["image"]
        label_idx = item["label"]
        # Class names may contain "/", which must not create nested folders.
        label = label_names[label_idx].replace("/", "_")

        label_dir = split_dir / label
        label_dir.mkdir(exist_ok=True)

        img_path = label_dir / f"{idx}.jpg"

        # Skip already existing files (useful for restarts)
        if img_path.exists():
            continue

        # Write next to the destination so the final rename stays on one filesystem.
        tmp_path = label_dir / f"{idx}.jpg.tmp"
        try:
            # Convert to RGB if necessary (JPEG-safe)
            if img.mode not in ("RGB", "L"):
                img = img.convert("RGB")
            # The format is given explicitly, so the ".tmp" suffix is irrelevant.
            img.save(tmp_path, format="JPEG")
            tmp_path.replace(img_path)

        except (OSError, UnidentifiedImageError) as e:
            # Handle corrupt or unsupported images gracefully
            print(f"⚠️ Skipping image {idx} ({split_name}) due to error: {e}")

        finally:
            # After a successful rename the temp file is gone; otherwise drop
            # the partial output (also on KeyboardInterrupt).
            if tmp_path.exists():
                tmp_path.unlink()


## Save both splits

In [7]:
save_sun397_images(dataset_train, "train", sun397_dir, label_names)
save_sun397_images(dataset_test, "test", sun397_dir, label_names)

# save_sun397_images reports and skips images it cannot write, so success must
# be checked on disk instead of being assumed: compare the number of
# <split>/<label>/*.jpg files with the number of examples in each split.
expected_counts = {"train": len(dataset_train), "test": len(dataset_test)}
missing_counts = {}
for split, expected in expected_counts.items():
    found = sum(1 for _ in (Path(sun397_dir) / split).glob("*/*.jpg"))
    if found < expected:
        missing_counts[split] = expected - found

if missing_counts:
    print(f"⚠️ Some images were not saved (missing per split): {missing_counts}")
else:
    print("✅ All images processed and saved successfully!")

Saving train: 100%|██████████| 19850/19850 [55:13<00:00,  5.99it/s]  
Saving test: 100%|██████████| 19850/19850 [10:38<00:00, 31.09it/s] 

✅ All images processed and saved successfully!



