## Generating Validation Set Splits

In [1]:
import torch
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader

# Mini-batch size passed to every DataLoader created in this notebook.
batch_size = 64

### A Typical Dataset (here: MNIST)

In [3]:
# Directory holding the MNIST files; the training split is downloaded here
# when it is not already present (download=True below).
mnist_root = 'D:/work/data/Python/mnist/'
to_tensor = transforms.ToTensor()

# MNIST training split.
train_dataset = datasets.MNIST(root=mnist_root, train=True,
                               transform=to_tensor, download=True)

# MNIST test split, read from the same directory.
test_dataset = datasets.MNIST(root=mnist_root, train=False,
                              transform=to_tensor)

# Options common to both loaders; only the shuffling differs.
loader_options = {'batch_size': batch_size, 'num_workers': 0}

train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, **loader_options)

# Peek at a single mini-batch to confirm the tensor shapes.
images, labels = next(iter(train_loader))
print('Image batch dimensions:', images.shape)
print('Image label dimensions:', labels.shape)

Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])


### 1. Subset Method

In [4]:
from torch.utils.data.dataset import Subset

In [5]:
# Load the MNIST training split once; the training and validation subsets
# below are both index views into this single dataset object.
train_and_valid = datasets.MNIST(root='D:/work/data/Python/mnist/',
                                 train=True,
                                 transform=transforms.ToTensor(),
                                 download=True)

# Hold out the first `num_valid` examples for validation and train on the
# remainder. The upper bound comes from the dataset itself rather than a
# hard-coded 60000.
num_valid = 1000
valid_indices = torch.arange(0, num_valid)
train_indices = torch.arange(num_valid, len(train_and_valid))

train_dataset = Subset(train_and_valid, train_indices)
valid_dataset = Subset(train_and_valid, valid_indices)

# Sanity check: the two subsets together cover the whole training split.
assert len(train_dataset) + len(valid_dataset) == len(train_and_valid)

train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          num_workers=0,
                          shuffle=True)

# Loader for the held-out validation subset; evaluation data is not shuffled.
valid_loader = DataLoader(valid_dataset,
                          batch_size=batch_size,
                          num_workers=0,
                          shuffle=False)

# The test split is iterated in a fixed order, as in the other sections.
test_loader = DataLoader(test_dataset,
                         batch_size=batch_size,
                         num_workers=0,
                         shuffle=False)

# Peek at a single training mini-batch to confirm the tensor shapes.
for images, labels in train_loader:
    print('Image batch dimensions:', images.shape)
    print('Image label dimensions:', labels.shape)
    break

Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])


In [6]:
# Run two full passes over the shuffled training loader and show the first
# ten labels of the last batch of each pass; without a fixed seed the
# ordering differs between passes.
for _ in range(2):
    for images, labels in train_loader:
        pass
    print(labels[:10])


tensor([1, 6, 6, 6, 8, 5, 8, 1, 3, 9])
tensor([5, 2, 6, 0, 4, 1, 5, 4, 2, 9])


In [7]:
torch.manual_seed(123)
for images, labels in train_loader:  
    pass
print(labels[:10])

torch.manual_seed(123)
for images, labels in train_loader:  
    pass
print(labels[:10])

tensor([1, 0, 3, 7, 0, 7, 5, 6, 8, 3])
tensor([1, 0, 3, 7, 0, 7, 5, 6, 8, 3])


### 2. SubsetRandomSampler Method

In [8]:
from torch.utils.data import SubsetRandomSampler

In [10]:
mnist_root = 'D:/work/data/Python/mnist/'

# Training pipeline: enlarge to 32x32, take a random 28x28 crop, convert to tensor.
training_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop((28, 28)),
    transforms.ToTensor(),
])

# Evaluation pipeline: identical except that the 28x28 crop is always the centre.
valid_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.CenterCrop((28, 28)),
    transforms.ToTensor(),
])

# train_dataset and valid_dataset both read the MNIST *training* split but
# apply different transforms; test_dataset reads the *test* split.
train_dataset = datasets.MNIST(root=mnist_root, train=True,
                               transform=training_transform, download=True)
valid_dataset = datasets.MNIST(root=mnist_root, train=True,
                               transform=valid_transform, download=False)
test_dataset = datasets.MNIST(root=mnist_root, train=False,
                              transform=valid_transform, download=False)

# The first 1000 training examples are held out for validation; the samplers
# restrict each loader to its own, non-overlapping index range.
valid_indices = torch.arange(0, 1000)
train_indices = torch.arange(1000, 60000)
valid_sampler = SubsetRandomSampler(valid_indices)
train_sampler = SubsetRandomSampler(train_indices)

loader_options = {'batch_size': batch_size, 'num_workers': 0}

train_loader = DataLoader(train_dataset, sampler=train_sampler, **loader_options)
valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, **loader_options)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, **loader_options)

In [11]:
# Fetch one training mini-batch and report its tensor shapes.
images, labels = next(iter(train_loader))
print('Image batch dimensions:', images.shape)
print('Image label dimensions:', labels.shape)

Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])


In [12]:
# Two full passes through the randomly sampled training loader; the first
# ten labels of the final batch are printed after each pass to show that
# the order changes from pass to pass.
for _ in range(2):
    for images, labels in train_loader:
        pass
    print(labels[:10])

tensor([5, 7, 4, 9, 1, 7, 4, 1, 6, 7])
tensor([8, 2, 0, 7, 1, 3, 2, 6, 0, 4])


In [13]:
torch.manual_seed(123)
for images, labels in train_loader:  
    pass
print(labels[:10])

torch.manual_seed(123)
for images, labels in train_loader:  
    pass
print(labels[:10])

tensor([1, 0, 3, 7, 0, 7, 5, 6, 8, 3])
tensor([1, 0, 3, 7, 0, 7, 5, 6, 8, 3])


### 3. Custom Sampler Method

In [15]:
from torch.utils.data import sampler

In [16]:
class ChunkSampler(sampler.Sampler):
    """Sampler that yields one contiguous run of dataset indices, in order.

    Arguments:
        num_samples: how many consecutive indices to produce
        start: first index of the run (defaults to 0)
    """
    def __init__(self, num_samples, start=0):
        self.start = start
        self.num_samples = num_samples

    def __len__(self):
        # Number of indices produced by one pass over the sampler.
        return self.num_samples

    def __iter__(self):
        # Half-open interval [start, start + num_samples), ascending.
        first = self.start
        stop = first + self.num_samples
        return iter(range(first, stop))

In [18]:
# Training pipeline: enlarge to 32x32, take a random 28x28 crop, convert to tensor.
training_transform = transforms.Compose([transforms.Resize((32, 32)),
                                         transforms.RandomCrop((28, 28)),
                                         transforms.ToTensor()])

# Evaluation pipeline: identical except that the 28x28 crop is always the centre.
valid_transform = transforms.Compose([transforms.Resize((32, 32)),
                                      transforms.CenterCrop((28, 28)),
                                      transforms.ToTensor()])

# train_dataset and valid_dataset both read the MNIST *training* split but
# apply different transforms; test_dataset reads the *test* split.
train_dataset = datasets.MNIST(root='D:/work/data/Python/mnist/',
                               train=True,
                               transform=training_transform,
                               download=True)

valid_dataset = datasets.MNIST(root='D:/work/data/Python/mnist/',
                               train=True,
                               transform=valid_transform,
                               download=False)

test_dataset = datasets.MNIST(root='D:/work/data/Python/mnist/',
                              train=False,
                              transform=valid_transform,
                              download=False)

# The last `validation_fraction` of the training split is held out for
# validation. Sizes come from the dataset itself rather than a hard-coded 60000.
validation_fraction = 0.1
num_total = len(train_dataset)
num = int(validation_fraction * num_total)
train_indices = torch.arange(0, num_total - num)
valid_indices = torch.arange(num_total - num, num_total)

print("train_indices shape: ", train_indices.shape)
print("valid_indices shape: ", valid_indices.shape)

# The training chunk starts at index 0; the validation chunk starts right
# after the last training index.
train_sampler = ChunkSampler(train_indices.shape[0], 0)
valid_sampler = ChunkSampler(valid_indices.shape[0], train_indices.shape[0])

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          num_workers=0,
                          sampler=train_sampler)
# The validation loader must draw from the validation chunk (valid_sampler),
# not the training chunk, so the two loaders never share examples.
valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=batch_size,
                          num_workers=0,
                          sampler=valid_sampler)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=batch_size,
                         num_workers=0,
                         shuffle=False)

train_indices shape:  torch.Size([54000])
valid_indices shape:  torch.Size([6000])


In [19]:
# Fetch one training mini-batch and report its tensor shapes.
images, labels = next(iter(train_loader))
print('Image batch dimensions:', images.shape)
print('Image label dimensions:', labels.shape)

Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])


In [20]:
# Two full passes through the training loader, printing the first ten labels
# of the final batch after each pass to compare the ordering between passes.
for _ in range(2):
    for images, labels in train_loader:
        pass
    print(labels[:10])

tensor([8, 8, 3, 4, 7, 0, 3, 0, 3, 7])
tensor([8, 8, 3, 4, 7, 0, 3, 0, 3, 7])


In [21]:
torch.manual_seed(123)
for images, labels in train_loader:  
    pass
print(labels[:10])

torch.manual_seed(123)
for images, labels in train_loader:  
    pass
print(labels[:10])

tensor([8, 8, 3, 4, 7, 0, 3, 0, 3, 7])
tensor([8, 8, 3, 4, 7, 0, 3, 0, 3, 7])
