## Dataset

### VOC

In [None]:
import torch
from torch.utils.data import Subset
from torchvision import datasets, transforms

In [None]:
from torchvision.datasets import VOCSegmentation

# PASCAL VOC 2012, training split.
# NOTE(review): the variable names say "detection", but the dataset class used
# here is VOCSegmentation; the names are kept so later cells keep working.
voc2012_detection_train = VOCSegmentation(
    root='./data',           # directory where the dataset is stored
    year='2012',             # use the 2012 release
    image_set='train',       # one of 'train', 'trainval', 'val'
    download=True,           # download automatically
    transform=None,          # optional image transform
    target_transform=None    # optional target transform
)

# Keep the first 240 training samples.
voc_train_subset = Subset(voc2012_detection_train, list(range(240)))

# PASCAL VOC 2012, validation split.
voc2012_detection_val = VOCSegmentation(
    root='./data',           # directory where the dataset is stored
    year='2012',             # use the 2012 release
    image_set='val',         # one of 'train', 'trainval', 'val'
    download=True,           # download automatically
    transform=None,          # optional image transform
    target_transform=None    # optional target transform
)

# Keep the first 60 validation samples.
voc_val_subset = Subset(voc2012_detection_val, list(range(60)))

# Backward-compatible alias. This cell used to assign `voc_subset` twice, so the
# training subset was overwritten and only the validation subset survived.
# The alias keeps that final value; prefer voc_train_subset / voc_val_subset.
voc_subset = voc_val_subset

### Imagenette

In [None]:
import torch
from torch.utils.data import Subset
from torchvision import datasets, transforms
import urllib.request
import tarfile
import os

# Download and unpack Imagenette-160 only when the extracted folder is missing.
if not os.path.exists('./data/imagenette2-160'):
    # urlretrieve does not create parent directories, so make sure ./data exists.
    # Without this the cell fails when it is run before any other download cell.
    os.makedirs('./data', exist_ok=True)

    # Download Imagenette-160
    url = 'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz'
    urllib.request.urlretrieve(url, './data/imagenette2-160.tgz')

    # Extract the dataset
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only use it with archives from a trusted source.
    with tarfile.open('./data/imagenette2-160.tgz', 'r:gz') as tar:
        tar.extractall('./data')

# Load training data
imagenette_train = datasets.ImageFolder(
    root='./data/imagenette2-160/train',
    transform=None  # You can add transforms here if needed
)

# Load validation data
imagenette_val = datasets.ImageFolder(
    root='./data/imagenette2-160/val',
    transform=None  # You can add transforms here if needed
)


def _evenly_spaced_indices(total, count):
    """Return up to `count` distinct indices spread evenly over range(total)."""
    count = min(count, total)
    if count <= 0:
        return []
    return [i * total // count for i in range(count)]


# Create subsets with desired sizes.
# ImageFolder orders its samples by class folder, so taking the first N indices
# would yield images from a single class. Evenly spaced indices keep the subset
# deterministic while covering all class folders.
imagenette_train_subset = Subset(
    imagenette_train, _evenly_spaced_indices(len(imagenette_train), 240)
)
imagenette_val_subset = Subset(
    imagenette_val, _evenly_spaced_indices(len(imagenette_val), 60)
)

# Print dataset sizes
print(f"Training subset size: {len(imagenette_train_subset)}")
print(f"Validation subset size: {len(imagenette_val_subset)}")

### COCO

In [None]:
from pycocotools.coco import COCO
import requests
import os
import shutil
import zipfile

def download_coco_subset(split="train", num_samples=240):
    """Download a small subset of COCO 2017 images for a fixed list of categories.

    The 2017 train/val annotation archive is downloaded and extracted under
    ./data/coco_subset when the instances file for `split` is not present yet.
    Images that contain at least one of the selected categories are then
    fetched into ./data/coco_subset/<split>2017. Images already on disk are
    not downloaded again, so the function can be re-run to resume.

    Args:
        split: Which COCO 2017 split to use, "train" or "val".
        num_samples: Maximum number of images to download.

    Returns:
        None. Progress and a summary of failed downloads are printed.

    Raises:
        ValueError: If `split` is not "train" or "val".
        requests.RequestException: If the annotation archive cannot be fetched.
    """
    # The annotation archive only ships instance files for these two splits.
    if split not in ("train", "val"):
        raise ValueError(f"split must be 'train' or 'val', got {split!r}")

    request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

    # Create directories
    base_dir = "./data/coco_subset"
    images_dir = os.path.join(base_dir, f"{split}2017")
    annot_dir = os.path.join(base_dir, "annotations")
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(annot_dir, exist_ok=True)

    # Download and extract annotations if the file for THIS split is missing
    annotation_file = os.path.join(annot_dir, f"instances_{split}2017.json")
    annotation_zip = os.path.join(base_dir, "annotations.zip")
    if not os.path.exists(annotation_file):
        print("Downloading COCO annotations...")
        annotation_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
        with requests.get(annotation_url, stream=True, timeout=request_timeout) as response:
            # Fail early instead of saving an HTTP error page as the zip file.
            response.raise_for_status()
            with open(annotation_zip, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # The archive contains an "annotations/" folder, so extracting into
        # base_dir places the json files in annot_dir.
        print("Extracting annotations...")
        with zipfile.ZipFile(annotation_zip, 'r') as zip_ref:
            zip_ref.extractall(base_dir)

        # Clean up zip file
        os.remove(annotation_zip)

    # Classes we want to download
    categories = ["person", "car", "bicycle", "motorcycle", "airplane",
                  "bus", "train", "truck", "boat", "traffic light"]

    # Initialize COCO API with local annotation file
    coco = COCO(annotation_file)

    # Get category IDs
    cat_ids = coco.getCatIds(catNms=categories)

    # Collect the images of every category (union, not intersection)
    img_ids = []
    for cat_id in cat_ids:
        img_ids.extend(coco.getImgIds(catIds=[cat_id]))
    # Remove duplicates and limit samples. The order of the kept ids is
    # whatever order the set yields.
    img_ids = list(set(img_ids))[:num_samples]

    # Download images
    print(f"Downloading {len(img_ids)} images for {split} set...")
    failed = []  # (file_name, reason) for every image that could not be fetched
    for i, img_id in enumerate(img_ids):
        img = coco.loadImgs([img_id])[0]
        img_url = img['coco_url']
        file_name = img['file_name']

        # Download image if it doesn't exist
        img_path = os.path.join(images_dir, file_name)
        if not os.path.exists(img_path):
            try:
                response = requests.get(img_url, timeout=request_timeout)
            except requests.RequestException as exc:
                failed.append((file_name, str(exc)))
            else:
                if response.status_code == 200:
                    # Write to a temporary name first: an interrupted write must
                    # not leave a truncated file that later runs would skip.
                    tmp_path = img_path + ".part"
                    with open(tmp_path, 'wb') as f:
                        f.write(response.content)
                    os.replace(tmp_path, img_path)
                else:
                    failed.append((file_name, f"HTTP {response.status_code}"))

        if (i + 1) % 10 == 0:
            print(f"Downloaded {i + 1}/{len(img_ids)} images")

    # Report failures explicitly; they are not raised so that the remaining
    # images are still downloaded.
    if failed:
        print(f"Warning: {len(failed)} of {len(img_ids)} images could not be downloaded:")
        for file_name, reason in failed:
            print(f"  {file_name}: {reason}")

# Fetch both COCO subsets: 240 training images first, then 60 validation images.
print("Downloading training set...")
download_coco_subset("train", 240)
print("\nDownloading validation set...")
download_coco_subset("val", 60)

Downloading training set...
Downloading COCO annotations...
Extracting annotations...
loading annotations into memory...
Done (t=10.30s)
creating index...
index created!
Downloading 240 images for train set...
Downloaded 10/240 images
Downloaded 20/240 images
Downloaded 30/240 images
Downloaded 40/240 images
Downloaded 50/240 images
Downloaded 60/240 images
Downloaded 70/240 images
Downloaded 80/240 images
Downloaded 90/240 images
Downloaded 100/240 images
Downloaded 110/240 images
Downloaded 120/240 images
Downloaded 130/240 images
Downloaded 140/240 images
Downloaded 150/240 images
Downloaded 160/240 images
Downloaded 170/240 images
Downloaded 180/240 images
Downloaded 190/240 images
Downloaded 200/240 images
Downloaded 210/240 images
Downloaded 220/240 images
Downloaded 230/240 images
Downloaded 240/240 images

Downloading validation set...
loading annotations into memory...
Done (t=0.31s)
creating index...
index created!
Downloading 60 images for val set...
Downloaded 10/60 images
