## ***STRATIFIED SUBSET***

The original `Places365 dataset` contains approximately 1.8 million images in its training set alone, which made it impractical for us to work with because of its size and computational demands. To address these limitations, we opted for a stratified subset of the dataset. This subsampling process aimed to retain the diversity of the dataset while keeping training and evaluation feasible within our computational constraints.

We selected 20 classes from the dataset, 4,000 images in total (200 per class). These classes were carefully chosen to ensure a balance between indoor and outdoor scenes, natural and artificial environments, and functional and recreational spaces. The subset was further split into 60/20/20 proportions for training, validation, and testing, aligning with standard practices for machine learning experiments.

### ***Libraries and dependencies***

In [None]:
import numpy as np
import random
import torch
import os
import shutil
from sklearn.model_selection import train_test_split

In [None]:
# Seed every random number generator once so repeated runs give the same results

SEED = 42
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Also seed the CUDA generators when a GPU is available
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


In [None]:
# The 20 scene categories taken from Places365 to build the subset
SELECTED_CLASSES = [
    "airport_terminal",
    "amusement_park",
    "aquarium",
    "ballroom",
    "barn",
    "beach",
    "bedroom",
    "bridge",
    "canyon",
    "castle",
    "church_outdoor",
    "forest_path",
    "highway",
    "kitchen",
    "library_indoor",
    "mountain",
    "restaurant",
    "skyscraper",
    "stadium_soccer",
    "swimming_pool_outdoor",
]

def stratified_subsampling(subset_dir, output_dir, sample_size_per_class,
                           selected_classes=None, seed=42):
    """
    Copy a reproducible random sample of every selected class into ``output_dir``.

    Each class folder found directly under ``subset_dir`` whose name is in
    ``selected_classes`` is sampled independently, so every class contributes
    the same number of images (or all of its images when it holds fewer than
    requested). Hidden files (names starting with ".") are never sampled.

    Args:
        subset_dir (str): Path to the directory holding one folder per class.
        output_dir (str): Directory to store the subsampled dataset.
        sample_size_per_class (int): Number of samples to retain per class.
        selected_classes (iterable of str, optional): Class folder names to
            keep. Defaults to the module-level ``SELECTED_CLASSES``.
        seed (int): Seed for the per-class random sample.

    Returns:
        None

    Raises:
        ValueError: If ``sample_size_per_class`` is negative.
    """
    if sample_size_per_class < 0:
        raise ValueError("sample_size_per_class must be non-negative")

    if selected_classes is None:
        selected_classes = SELECTED_CLASSES
    wanted = set(selected_classes)

    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting keeps runs repeatable.
    for category in sorted(os.listdir(subset_dir)):
        if category not in wanted:
            continue  # Skip categories not in the selected list

        category_path = os.path.join(subset_dir, category)
        if not os.path.isdir(category_path):
            continue

        # Hidden entries such as ".DS_Store" are not images and must not
        # take one of the sample slots.
        files = sorted(
            f for f in os.listdir(category_path)
            if not f.startswith(".")
            and os.path.isfile(os.path.join(category_path, f))
        )

        # A generator keyed on seed and class name makes a class's sample
        # independent of which other classes are present.
        rng = random.Random(f"{seed}:{category}")
        sampled_files = rng.sample(files, min(sample_size_per_class, len(files)))

        category_output_path = os.path.join(output_dir, category)
        os.makedirs(category_output_path, exist_ok=True)

        for file in sampled_files:
            shutil.copy(os.path.join(category_path, file), os.path.join(category_output_path, file))

    print("Stratified subsampling completed for selected classes.")

Stratified subsampling completed for selected classes.


In [None]:
# Build the subset: 200 images for each of the selected classes

subset_dir = "/Users/damianzeller/Desktop/HS24/AML/Final_Project/Data/train_256"
output_dir = "/Users/damianzeller/Desktop/HS24/AML/Final_Project/Data/subset_only_20"
sample_size_per_class = 200

stratified_subsampling(
    subset_dir=subset_dir,
    output_dir=output_dir,
    sample_size_per_class=sample_size_per_class,
)

After creating the subset we will check and explore it a bit:

In [None]:
def explore_file_system(output_dir):
    """
    Count the files stored in each category folder of a dataset directory.

    Only the immediate sub-folders of ``output_dir`` are treated as
    categories, and only regular files directly inside a category folder
    are counted.

    Args:
        output_dir (str): Path to the dataset directory, one folder per category.

    Returns:
        tuple:
            - dict: Maps each category folder name to its file count.
            - int: Sum of the file counts over all categories.
    """
    counts_by_category = {}

    for name in os.listdir(output_dir):
        folder = os.path.join(output_dir, name)

        # Loose files next to the category folders are not categories
        if os.path.isdir(folder):
            entries = (os.path.join(folder, entry) for entry in os.listdir(folder))
            counts_by_category[name] = sum(map(os.path.isfile, entries))

    return counts_by_category, sum(counts_by_category.values())

# Count the images per category in the directory given below.
# NOTE(review): this path ends in "_train", so the counts describe the
# training split rather than the 200-per-class subset - confirm this is intended.
categories_info, total_images = explore_file_system(
    '/Users/damianzeller/Desktop/HS24/AML/Final_Project/Data/subset_only_20_train'
)

# Report the number of categories, the per-category counts, and the grand total
print(f"Total Categories: {len(categories_info)}")
for category, count in categories_info.items():
    print(f"Category: {category}, Image Count: {count}")
print(f"Total Images: {total_images}")


Total Categories: 20
Category: bedroom, Image Count: 120
Category: restaurant, Image Count: 120
Category: library_indoor, Image Count: 120
Category: canyon, Image Count: 120
Category: highway, Image Count: 120
Category: bridge, Image Count: 120
Category: church_outdoor, Image Count: 120
Category: stadium_soccer, Image Count: 120
Category: airport_terminal, Image Count: 120
Category: ballroom, Image Count: 120
Category: amusement_park, Image Count: 120
Category: barn, Image Count: 120
Category: skyscraper, Image Count: 120
Category: forest_path, Image Count: 120
Category: beach, Image Count: 120
Category: castle, Image Count: 120
Category: mountain, Image Count: 120
Category: swimming_pool_outdoor, Image Count: 120
Category: kitchen, Image Count: 120
Category: aquarium, Image Count: 120
Total Images: 2400


Now that we checked the subset we can proceed with the 60/20/20 split into training, validation, and test:

In [None]:
def split_train_val_test(subset_dir, train_dir, val_dir, test_dir, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2):
    """
    Split the subset directory into train, validation, and test sets.

    Every category folder under ``subset_dir`` is split on its own, so each
    category is divided with the same proportions. Files are copied, not
    moved; ``subset_dir`` is left untouched.

    Args:
        subset_dir (str): Path to the subset directory.
        train_dir (str): Directory to store the training set.
        val_dir (str): Directory to store the validation set.
        test_dir (str): Directory to store the test set.
        train_ratio (float): Proportion of data to use for training.
        val_ratio (float): Proportion of data to use for validation.
        test_ratio (float): Proportion of data to use for testing.

    Returns:
        None

    Raises:
        ValueError: If the three ratios do not sum to 1.0 (within a small
            floating-point tolerance).
    """
    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly
    # 1.0 in binary floating point. Raising (instead of asserting) keeps the
    # check active under ``python -O``.
    if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
        raise ValueError("Ratios must sum to 1.0")

    # Create target directories if they don't exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    for category in os.listdir(subset_dir):
        category_path = os.path.join(subset_dir, category)
        if not os.path.isdir(category_path):
            continue

        # os.listdir order is arbitrary; sorting makes the seeded split
        # produce the same assignment on every machine.
        files = sorted(
            f for f in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, f))
        )

        # First carve off the training share, then divide the remainder
        # between validation and test; both splits use the shared SEED.
        train_files, temp_files = train_test_split(files, train_size=train_ratio, random_state=SEED)
        val_files, test_files = train_test_split(
            temp_files,
            train_size=val_ratio / (val_ratio + test_ratio),
            random_state=SEED,
        )

        # Copy files to respective directories
        for dataset, dataset_files in zip([train_dir, val_dir, test_dir], [train_files, val_files, test_files]):
            category_dataset_path = os.path.join(dataset, category)
            os.makedirs(category_dataset_path, exist_ok=True)
            for file in dataset_files:
                shutil.copy(os.path.join(category_path, file), os.path.join(category_dataset_path, file))

    print("Dataset split into train, validation, and test sets.")

Dataset split into train, validation, and test sets.


In [None]:
# Split the subset created above into training, validation, and test folders

subset_dir = output_dir
train_dir = "/Users/damianzeller/Desktop/HS24/AML/Final_Project/Data/subset_only_20_train"
val_dir = "/Users/damianzeller/Desktop/HS24/AML/Final_Project/Data/subset_only_20_val"
test_dir = "/Users/damianzeller/Desktop/HS24/AML/Final_Project/Data/subset_only_20_test"

split_train_val_test(
    subset_dir=subset_dir,
    train_dir=train_dir,
    val_dir=val_dir,
    test_dir=test_dir,
)