In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import numpy as np
from torch.utils.data import DataLoader
from tddl.data.loaders import get_f_mnist_loader

In [3]:
# Dataset location and the DataLoader settings shared by all three splits.
data_dir = Path("/bigdata/f_mnist")
batch = 256
data_workers = 16

# The project helper's result is unpacked into train / validation / test datasets.
train_dataset, valid_dataset, test_dataset = get_f_mnist_loader(data_dir)

# One loader per split, each built with the same batch size and worker count.
train_loader, valid_loader, test_loader = (
    DataLoader(split, batch_size=batch, num_workers=data_workers)
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [7]:
def count_labels(loader, n_labels=10):
    """Tally how many samples of each class label a loader yields.

    Parameters
    ----------
    loader : iterable
        Yields ``(images, labels)`` pairs. Only ``labels`` is used; it must
        provide ``.numpy()`` returning integer class indices.
    n_labels : int, default 10
        Number of classes, i.e. the length of the returned array.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_labels,)`` whose entry ``i`` is the number
        of samples seen with label ``i``.
    """
    totals = np.zeros(n_labels)
    for _, batch_labels in loader:
        classes, occurrences = np.unique(batch_labels.numpy(), return_counts=True)
        # np.unique lists every class once, so indexed assignment cannot
        # drop repeated updates.
        totals[classes] = totals[classes] + occurrences
    return totals

In [5]:
# Per-class sample counts from one full pass over train_loader; the bare name displays them.
train_counts = count_labels(loader=train_loader)
train_counts

array([4926., 4973., 5015., 5055., 5026., 5001., 4967., 5048., 5021.,
       4968.])

In [6]:
# Per-class sample counts from one full pass over valid_loader; the bare name displays them.
valid_counts = count_labels(loader=valid_loader)
valid_counts

array([1074., 1027.,  985.,  945.,  974.,  999., 1033.,  952.,  979.,
       1032.])

In [7]:
# Per-class sample counts from one full pass over test_loader; the bare name displays them.
test_counts = count_labels(loader=test_loader)
test_counts

array([1000., 1000., 1000., 1000., 1000., 1000., 1000., 1000., 1000.,
       1000.])

## Stratified Sampling

In [5]:
import torch
from torch.utils import data
from torchvision import datasets
from torchvision import models
from torchvision import transforms
from tddl.data.sets import DatasetFromSubset

path = data_dir

# Evaluation pipeline: tensor conversion followed by normalisation only.
# NOTE(review): 0.1307 / 0.3081 look like the usual MNIST statistics -- confirm
# they are the intended values for FashionMNIST.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Training pipeline: random 28x28 crop (after 4 px padding) and random
# horizontal flip, then the same tensor conversion and normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(size=28, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# The train=True portion of FashionMNIST, loaded with the non-augmenting transform.
dataset = datasets.FashionMNIST(root=path, train=True, download=True, transform=transform_test)

# train_dataset, valid_dataset = data.random_split(
#     dataset,
#     (50000, 10000),
#     generator=torch.Generator().manual_seed(42),
# )

# train_dataset = DatasetFromSubset(
#     train_dataset, transform=transform_train,
# )

# valid_dataset = DatasetFromSubset(
#     valid_dataset, transform=transform_test,
# )

In [6]:
# NOTE: this rebinds the notebook-level `batch` (set to 256 in an earlier cell).
batch = 50000
data_workers = 16
dataloader = DataLoader(dataset, batch_size=batch, num_workers=data_workers)

# Per-class counts over everything in `dataset`; the bare name displays them.
count = count_labels(dataloader)
count

NameError: name 'count_labels' is not defined

array([6000., 6000., 6000., 6000., 6000., 6000., 6000., 6000., 6000.,
       6000.])

In [16]:
# Inspect the dataset's `data` attribute (the printed repr is long).
dataset.data

tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0,

In [17]:
# Inspect the label tensor; it is what the stratified splits below are stratified on.
dataset.targets

tensor([9, 0, 0,  ..., 3, 0, 5])

In [18]:
# Size of valid_dataset. NOTE: a later cell rebinds this name to a full
# FashionMNIST dataset, so the value shown depends on execution order.
len(valid_dataset)

10000

In [19]:
# Size of train_dataset. NOTE: a later cell rebinds this name to a full
# FashionMNIST dataset, so the value shown depends on execution order.
len(train_dataset)

50000

In [23]:
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold splitter; shuffling is seeded so the folds are repeatable.
k_folds = 10
kfold = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
# Only the first fold is taken: `a` receives the training indices and `b` the
# held-out indices of that fold.
a, b = next(kfold.split(dataset, dataset.targets))

In [29]:
# Number of training indices in the first fold.
len(a)

54000

In [30]:
# Number of held-out indices in the first fold.
len(b)

6000

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np

num_train = len(dataset)
print(num_train)
# Positional indices 0..num_train-1: the indices are split, not the samples themselves.
indices = list(range(num_train))
print(indices[:5])

# Hold out 5000 indices for validation, stratified on the labels so class
# proportions are kept in both parts; random_state makes the split repeatable.
# y_train / y_test receive the labels that go with train_idx / valid_idx.
train_idx, valid_idx, y_train, y_test = train_test_split(indices, dataset.targets, test_size=5000, stratify=dataset.targets, random_state=42)

60000
[0, 1, 2, 3, 4]


In [9]:
# Sizes of the two index lists produced by the stratified split.
print(len(train_idx))

print(len(valid_idx))

55000
5000


In [10]:
from torch.utils.data.sampler import SubsetRandomSampler

# Each sampler draws, in random order, only from its own list of indices.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

In [11]:
from torch.utils.data import DataLoader

BATCH_SIZE = 256

# Both datasets wrap the same train=True FashionMNIST data and differ only in
# the transform applied (augmenting for training, plain for validation).
# NOTE: train_dataset / valid_dataset / train_loader / valid_loader are rebound
# here; an earlier cell assigned them from get_f_mnist_loader.
train_dataset = datasets.FashionMNIST(
    root=data_dir, train=True,
    download=True, transform=transform_train,
)

valid_dataset = datasets.FashionMNIST(
    root=data_dir, train=True,
    download=True, transform=transform_test,
)

# The samplers, not the datasets, restrict each loader to its own index subset.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler,
    num_workers=data_workers,
)
valid_loader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, sampler=valid_sampler,
    num_workers=data_workers,
)

In [12]:
# Display the loader object's repr.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f9091100940>

In [13]:
# Length of the dataset object behind valid_loader; the sampler passed to the
# loader is separate from the dataset and does not change this value.
len(valid_loader.dataset)

60000

In [14]:
# Number of batches in train_loader, then per-class counts of the samples it yields.
print(len(train_loader))
count_labels(train_loader)

215


array([5500., 5500., 5500., 5500., 5500., 5500., 5500., 5500., 5500.,
       5500.])

In [16]:
# Number of batches in valid_loader, then per-class counts of the samples it yields.
print(len(valid_loader))
count_labels(valid_loader)

20


array([500., 500., 500., 500., 500., 500., 500., 500., 500., 500.])