In [None]:
import os
import urllib.request
import tarfile
from pathlib import Path


In [7]:
from pathlib import Path

# Use external storage as "home": everything this notebook downloads or
# generates lives under ~/gpufs instead of the (usually small) real home dir.
CUSTOM_HOME = Path("~/gpufs").expanduser()

# Final, organised dataset: DATA_ROOT/<split>/<class>/<image> (built in Step 3).
DATA_ROOT = CUSTOM_HOME / "data" / "dtd"
# Untouched contents of the archive (the extracted `dtd/` folder, renamed).
RAW_DIR = CUSTOM_HOME / "data" / "dtd_raw"
# Local copy of the downloaded archive.
TAR_PATH = CUSTOM_HOME / "data" / "dtd.tar.gz"
# DTD release r1.0.1. Fetched over HTTPS so the archive cannot be tampered
# with in transit (the download is not otherwise checksum-verified).
URL = "https://www.robots.ox.ac.uk/~vgg/data/dtd/download/dtd-r1.0.1.tar.gz"


In [None]:
# --- Step 1: Download ---
# Fetch the archive once; later runs reuse the local copy at TAR_PATH.
#
# Only the tarball's own directory is created here. DATA_ROOT is deliberately
# NOT pre-created: it is `<data>/dtd`, the same name as the archive's top-level
# folder, so creating it early would collide with the extraction step.
TAR_PATH.parent.mkdir(parents=True, exist_ok=True)
if not TAR_PATH.exists():
    print(f"Downloading DTD dataset to {TAR_PATH}...")
    # Download to a sibling ".part" file and rename it into place only when the
    # transfer finished. Otherwise an interrupted download would leave a
    # truncated TAR_PATH behind and the `exists()` guard above would treat it
    # as a complete archive on every subsequent run.
    partial_path = TAR_PATH.with_name(TAR_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(URL, partial_path)
    except BaseException:
        # BaseException so that a kernel interrupt (KeyboardInterrupt) also
        # cleans up the partial file; the error is always re-raised.
        if partial_path.exists():
            partial_path.unlink()
        raise
    # Same directory => same filesystem, so this rename is atomic.
    partial_path.replace(TAR_PATH)
    print("Download complete.")

Downloading DTD dataset to /home/guyb/gpufs/data/dtd.tar.gz...


In [None]:
# --- Step 2: Extract ---
# Unpack the archive and expose its top-level `dtd/` folder as RAW_DIR.
# Skipped entirely when RAW_DIR already exists.
if not RAW_DIR.exists():
    print("Extracting dataset...")
    # Extract into a private staging directory rather than RAW_DIR.parent:
    # the archive's top-level folder is `dtd/`, which in RAW_DIR.parent is the
    # very path used for DATA_ROOT, so extracting there would mix raw files
    # with (or move away) the organised dataset.
    staging_dir = RAW_DIR.with_name(RAW_DIR.name + "_extracting")
    staging_dir.mkdir(parents=True, exist_ok=True)
    with tarfile.open(TAR_PATH) as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: refuse absolute paths, `..`
            # traversal, device nodes, etc. coming from the archive.
            tar.extractall(path=staging_dir, filter="data")
        else:
            # Older Python without extraction filters.
            tar.extractall(path=staging_dir)
    # Rename extracted `dtd/` to `dtd_raw/`. (Previously this renamed RAW_DIR
    # onto itself, which fails because RAW_DIR does not exist yet.)
    (staging_dir / "dtd").rename(RAW_DIR)
    try:
        staging_dir.rmdir()  # empty once `dtd/` has been moved out
    except OSError:
        # Not fatal: the dataset is in place, only stray files remain.
        print(f"Note: leftover files kept in {staging_dir}")
    print("Extraction complete.")

In [None]:
# --- Step 3: Organize splits ---
# Build DATA_ROOT/<split>/<class>/<image> as symlinks into RAW_DIR/images,
# driven by the first split definition (labels/train1.txt, val1.txt, test1.txt).
# Each non-empty line of a label file is a path relative to RAW_DIR/images,
# of the form `<class>/<image>`. The cell is safe to re-run: existing links
# are left untouched.
for split in ['train', 'val', 'test']:
    split_file = RAW_DIR / "labels" / f"{split}1.txt"
    split_dir = DATA_ROOT / split
    split_dir.mkdir(parents=True, exist_ok=True)

    with open(split_file) as f:
        for line in f:
            rel_path = line.strip()
            if not rel_path:
                continue  # tolerate blank / trailing lines

            # Normalise once so `src` and `dst` always agree on the file name.
            # Entries that already carry an extension are used as-is; ".jpg"
            # is appended only when it is missing. (The old expression
            # `RAW_DIR / "images" / rel_path + ".jpg"` evaluated `Path + str`
            # and raised TypeError.)
            rel_image = Path(rel_path)
            if not rel_image.suffix:
                rel_image = Path(rel_path + ".jpg")

            class_name = rel_path.split("/")[0]
            class_dir = split_dir / class_name
            class_dir.mkdir(parents=True, exist_ok=True)

            src = RAW_DIR / "images" / rel_image
            if not src.is_file():
                # Fail loudly instead of silently creating dangling links.
                raise FileNotFoundError(
                    f"{split_file} lists {rel_path!r} but {src} does not exist"
                )

            dst = class_dir / rel_image.name
            # `exists()` follows symlinks and is False for a dangling link, so
            # check `is_symlink()` too to avoid FileExistsError on re-runs.
            if not (dst.is_symlink() or dst.exists()):
                os.symlink(src, dst)

print(f"✅ DTD is ready at {DATA_ROOT}")