# Data Shuffling and Stratified Train-Val-Test Split Sample Implementation

### Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from io import StringIO
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.utils import make_grid
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split

  warn(f"Failed to load image Python extension: {e}")


### Define Variables & Helper Functions

In [2]:
# Set seed for random number generation to create reproducible results
random_seed = 5
torch.manual_seed(random_seed)

<torch._C.Generator at 0x28ab84ddc90>

In [3]:
# Transformations to apply to inputs
preprocess = transforms.Compose([
    # Convert PIL Image to tensor and scale to [0, 1] through max normalization
    # (i.e. for every pixel in image, new_pixel_value = pixel/255)
    transforms.ToTensor()
])

In [4]:
# Helper function to get ground truth class of an image
def get_img_labels(img_dir):
    labels = ''
    
    for filename in os.listdir(img_dir):
        # If image has no metastasis
        if (filename[5] == '0'):
            labels += filename + ",0\n"
            
        # If image has metastasis
        else:
            labels += filename + ",1\n"
        
    return labels

### Load Data

In [5]:
# Image dataset
class CustomDataset(Dataset):
    def __init__(self, img_dir, transform):
        self.img_dir = img_dir
        self.img_labels = pd.read_csv(StringIO(get_img_labels(img_dir)), sep=",", header=None)
        self.transform = transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        # Generate image filepath
        filename = self.img_labels.iloc[idx, 0]
        img_path = self.img_dir + "/" + filename
        
        # Read and transform image
        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)
        
        # Determine ground truth class (metastasis or no metastasis)
        label = self.img_labels.iloc[idx, 1]
        return image, label, filename

In [6]:
# Dataset and dataloader for training data
dataset = CustomDataset(img_dir=r"dataset-augmented", transform=preprocess)

### Data Shuffling & Stratified Train-Val-Test Split

In [7]:
# Get classes and indices of images
labels = list(dataset.img_labels.iloc[:, 1])
indices = np.arange(len(labels))

In [8]:
# Split dataset into 70% training, 10% validation, and 20% test with shuffle and stratified split
def get_set_indices(random_seed):
    # Split dataset into training and test sets
    train_indices, test_indices = train_test_split(indices, 
                                           train_size=0.8, 
                                           random_state=random_seed,
                                           shuffle=True, 
                                           stratify=labels)

    # Split training set into training and validation sets
    train_labels = [labels[i] for i in train_indices]
    train_indices, val_indices = train_test_split(train_indices, 
                                          train_size=0.875,
                                          random_state=random_seed, 
                                          shuffle=True, 
                                          stratify=train_labels)

    # Check that dataset was split into 70% training, 10% validation, and 20% test
    print(f"Images in training set: {len(train_indices)} ({(len(train_indices)*100/len(indices)):.2f}% of total dataset)")
    print(f"Images in validation set: {len(val_indices)} ({(len(val_indices)*100/len(indices)):.2f}% of total dataset)")
    print(f"Images in test set: {len(test_indices)} ({(len(test_indices)*100/len(indices)):.2f}% of total dataset)")
    print("-" * 10)
    print(f"Total images in dataset: {len(indices)}")
    
    return train_indices, val_indices, test_indices

In [9]:
# Check that training, validation, and test sets have the same ratio of non-metastasis to metastasis as the whole dataset
def check_set_ratios(train_indices, test_indices, val_indices):
    train_labels = [labels[i] for i in train_indices]
    val_labels = [labels[i] for i in val_indices]
    test_labels = [labels[i] for i in test_indices]

    train_classes, train_counts = np.unique(train_labels, return_counts=True)
    val_classes, val_counts = np.unique(val_labels, return_counts=True)
    test_classes, test_counts = np.unique(test_labels, return_counts=True)
    dataset_classes, dataset_counts = np.unique(labels, return_counts=True)

    print(f"Classes in training set:")
    print(f"{train_counts[0]} without metastasis ({(train_counts[0]*100/len(train_indices)):.2f}% of training set)")
    print(f"{train_counts[1]} with metastasis ({(train_counts[1]*100/len(train_indices)):.2f}% of training set)\n")
    print(f"Classes in validation set:")
    print(f"{val_counts[0]} without metastasis ({(val_counts[0]*100/len(val_indices)):.2f}% of validation set)")
    print(f"{val_counts[1]} with metastasis ({(val_counts[1]*100/len(val_indices)):.2f}% of validation set)\n")
    print(f"Classes in test set:")
    print(f"{test_counts[0]} without metastasis ({(test_counts[0]*100/len(test_indices)):.2f}% of test set)")
    print(f"{test_counts[1]} with metastasis ({(test_counts[1]*100/len(test_indices)):.2f}% of test set)")
    print("-" * 10)
    print(f"Total classes in dataset:")
    print(f"{dataset_counts[0]} without metastasis ({(dataset_counts[0]*100/len(indices)):.2f}% of dataset)")
    print(f"{dataset_counts[1]} with metastasis ({(dataset_counts[1]*100/len(indices)):.2f}% of dataset)")

In [10]:
# Get training, validation, and test sets
def get_sets(train_indices, val_indices, test_indices):
    train_set = Subset(dataset, train_indices)
    val_set = Subset(dataset, val_indices)
    test_set = Subset(dataset, test_indices)
    
    return train_set, val_set, test_set

In [11]:
# Load training, validation, and test sets
def get_loaders(train_set, val_set, test_set, batch_size):
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)
    
    return train_loader, val_loader, test_loader

### Verify Data Shuffling

In [None]:
epochs = 3
batch_size = 10

# Sample code to verify that data is shuffled into test/training/val across multiple epochs
# Ex. In epoch 1, image A can be in training; in epoch 2, image A can be in test; etc.
for epoch in range(epochs):
    print(f"Epoch: {epoch}/{epochs-1}")
    
    random_seed += 1
    
    train_indices, val_indices, test_indices = get_set_indices(random_seed)
    check_set_ratios(train_indices, val_indices, test_indices)
    train_set, val_set, test_set = get_sets(train_indices, val_indices, test_indices)
    train_loader, val_loader, test_loader = get_loaders(train_set, val_set, test_set, batch_size)
    
    for batch_index, batch_data in enumerate(train_loader):
        # Get the inputs
        batch_images, batch_labels, batch_filenames = batch_data
        
        # Print labels in first 5 batches of current training set
        if (batch_index < 5):
            print(f"Training Batch {batch_index} Classes: " + ' '.join(f'{batch_labels[j]}' for j in range(len(batch_labels))))
        
        # Verify if a chosen image is in test, train, or val to verify 
        # if data points can be in different sets across different epochs
        if ('0000-0-A.tif' in batch_filenames):
            print("Image 0000-0-A.tif found in training set")
    
    for batch_index, batch_data in enumerate(val_loader):
        # Get the inputs
        batch_images, batch_labels, batch_filenames = batch_data
        
        # Verify if a chosen image is in test, train, or val to verify 
        # if data points can be in different sets across different epochs
        if ('0000-0-A.tif' in batch_filenames):
            print("Image 0000-0-A.tif found in validation set")
    
    for batch_index, batch_data in enumerate(test_loader):
        # Get the inputs
        batch_images, batch_labels, batch_filenames = batch_data
        
        # Verify if a chosen image is in test, train, or val to verify 
        # if data points can be in different sets across different epochs
        if ('0000-0-A.tif' in batch_filenames):
            print("Image 0000-0-A.tif found in test set")
    
    print("-" * 50)

Epoch: 0/2
Images in training set: 1715 (69.97% of total dataset)
Images in validation set: 245 (10.00% of total dataset)
Images in test set: 491 (20.03% of total dataset)
----------
Total images in dataset: 2451
Classes in training set:
1117 without metastasis (65.13% of training set)
598 with metastasis (34.87% of training set)

Classes in validation set:
320 without metastasis (65.17% of validation set)
171 with metastasis (34.83% of validation set)

Classes in test set:
160 without metastasis (65.31% of test set)
85 with metastasis (34.69% of test set)
----------
Total classes in dataset:
1597 without metastasis (65.16% of dataset)
854 with metastasis (34.84% of dataset)
Training Batch 0 Classes: 1 0 0 1 0 0 0 1 0 0
Training Batch 1 Classes: 1 0 0 1 0 0 1 0 0 1
Training Batch 2 Classes: 0 1 0 1 0 1 1 1 0 0
Training Batch 3 Classes: 0 0 0 1 1 0 0 0 0 1
Training Batch 4 Classes: 1 0 1 0 1 0 0 0 0 0
Image 0000-0-A.tif found in training set
---------------------------------------------