# Setup for Kuzushiji-49 (K49), EMNIST, and FashionMNIST

In [None]:
!pip install idx2numpy

In [2]:
import os
import math

import requests
from tqdm import tqdm
import zipfile
import idx2numpy
import gzip

import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np


In [4]:
# Create the output root plus one sub-directory per dataset.
# exist_ok=True makes the cell safe to re-run.
for output_dir in (
    "./data",
    "./data/K49",
    "./data/EMNIST",
    "./data/FashionMNIST",
):
    os.makedirs(output_dir, exist_ok=True)

# Download the raw K49, EMNIST, and FashionMNIST datasets

## Download K49

In [5]:
# Kuzushiji-49 archives: train/test images and labels, one .npz file each.
url_list = ['http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz']

for url in url_list:
    # Each file is saved in the working directory under its remote basename.
    path = url.split('/')[-1]
    # The context manager releases the streamed connection even if the download
    # fails part-way; the timeout keeps a stalled server from hanging the cell.
    with requests.get(url, stream=True, timeout=60) as r:
        # Fail on HTTP errors *before* creating the file, so an error page is
        # never written to disk under the dataset's name.
        r.raise_for_status()

        # Content-Length is optional; without it, fall back to an open-ended
        # progress bar instead of crashing on int(None).
        content_length = r.headers.get('content-length')
        if content_length is None:
            total_length = None
            total_chunks = None
            print('Downloading {} - size unknown'.format(path))
        else:
            total_length = int(content_length)
            total_chunks = int(total_length / 1024) + 1
            print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))

        with open(path, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_chunks, unit="KB"):
                if chunk:
                    f.write(chunk)
print('All dataset files downloaded!')

# Keys are the basenames (without ".npz") of the downloaded K49 archives.
# The (split, tensor-file) values are not read in this cell; only the keys are iterated.
filenames = {
    'k49-train-imgs': ("train", "X.pt"),
    'k49-train-labels': ("train", "y.pt"),
    'k49-test-imgs': ("test", "X.pt"),
    'k49-test-labels': ("test", "y.pt"),
}

# The .npz files are opened as zip archives; unpack each one into its own
# "./<name>-extracted" directory.
for filename in filenames:
    archive_path = "./{}.npz".format(filename)
    extract_dir = "./{}-extracted".format(filename)
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

Downloading k49-train-imgs.npz - 64.6 MB


100%|██████████| 64569/64569 [01:23<00:00, 777.38KB/s] 


Downloading k49-train-labels.npz - 0.2 MB


100%|██████████| 161/161 [00:00<00:00, 178.99KB/s]


Downloading k49-test-imgs.npz - 10.7 MB


100%|██████████| 10715/10715 [00:17<00:00, 601.19KB/s]


Downloading k49-test-labels.npz - 0.0 MB


100%|██████████| 27/27 [00:00<00:00, 123.42KB/s]


All dataset files downloaded!


## Download EMNIST

In [6]:
# Single zip archive that bundles the gzip-compressed EMNIST idx files.
url_list = ['https://biometrics.nist.gov/cs_links/EMNIST/gzip.zip']

for url in url_list:
    # The file is saved in the working directory under its remote basename.
    path = url.split('/')[-1]
    # The context manager releases the streamed connection even if the download
    # fails part-way; the timeout keeps a stalled server from hanging the cell.
    with requests.get(url, stream=True, timeout=60) as r:
        # Fail on HTTP errors *before* creating the file, so an error page is
        # never written to disk under the dataset's name.
        r.raise_for_status()

        # Content-Length is optional; without it, fall back to an open-ended
        # progress bar instead of crashing on int(None).
        content_length = r.headers.get('content-length')
        if content_length is None:
            total_length = None
            total_chunks = None
            print('Downloading {} - size unknown'.format(path))
        else:
            total_length = int(content_length)
            total_chunks = int(total_length / 1024) + 1
            print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))

        with open(path, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_chunks, unit="KB"):
                if chunk:
                    f.write(chunk)
print('All dataset files downloaded!')


Downloading gzip.zip - 548.6 MB


100%|██████████| 548588/548588 [00:15<00:00, 35463.76KB/s]

All dataset files downloaded!





In [7]:
# Unpack the downloaded EMNIST bundle into ./EMNIST-extracted.
with zipfile.ZipFile("./gzip.zip", mode='r') as emnist_zip:
    emnist_zip.extractall(path="./EMNIST-extracted")

## Download Fashion MNIST

In [8]:
# FashionMNIST idx files (gzip-compressed): test labels/images, then train labels/images.
url_list = [
    'https://raw.githubusercontent.com/zalandoresearch/fashion-mnist/master/data/fashion/t10k-labels-idx1-ubyte.gz',
    'https://raw.githubusercontent.com/zalandoresearch/fashion-mnist/master/data/fashion/t10k-images-idx3-ubyte.gz',
    'https://raw.githubusercontent.com/zalandoresearch/fashion-mnist/master/data/fashion/train-labels-idx1-ubyte.gz',
    'https://raw.githubusercontent.com/zalandoresearch/fashion-mnist/master/data/fashion/train-images-idx3-ubyte.gz'
]

for url in url_list:
    # Each file is saved in the working directory under its remote basename.
    path = url.split('/')[-1]
    # The context manager releases the streamed connection even if the download
    # fails part-way; the timeout keeps a stalled server from hanging the cell.
    with requests.get(url, stream=True, timeout=60) as r:
        # Fail on HTTP errors *before* creating the file, so an error page is
        # never written to disk under the dataset's name.
        r.raise_for_status()

        # Content-Length is optional; without it, fall back to an open-ended
        # progress bar instead of crashing on int(None).
        content_length = r.headers.get('content-length')
        if content_length is None:
            total_length = None
            total_chunks = None
            print('Downloading {} - size unknown'.format(path))
        else:
            total_length = int(content_length)
            total_chunks = int(total_length / 1024) + 1
            print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))

        with open(path, 'wb') as f:
            for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_chunks, unit="KB"):
                if chunk:
                    f.write(chunk)
print('All dataset files downloaded!')






Downloading t10k-labels-idx1-ubyte.gz - 0.0 MB


100%|██████████| 6/6 [00:00<00:00, 8870.58KB/s]


Downloading t10k-images-idx3-ubyte.gz - 4.3 MB


100%|██████████| 4319/4319 [00:00<00:00, 40716.18KB/s]


Downloading train-labels-idx1-ubyte.gz - 0.0 MB


100%|██████████| 29/29 [00:00<00:00, 13401.81KB/s]


Downloading train-images-idx3-ubyte.gz - 25.8 MB


100%|██████████| 25803/25803 [00:00<00:00, 45313.25KB/s]

All dataset files downloaded!





# Process and Save Splits for All MNIST Datasets

In [3]:
def prepare_data(X_train, y_train, X_test, y_test, use_global_std=False):
    """Standardize images with training-set statistics and replicate them to 3 channels.

    Both image tensors are shifted and scaled with the *same* scalar mean/std,
    computed from ``X_train`` only, and the single grayscale plane is then
    copied into three identical channels.

    Args:
        X_train: Floating-point tensor of training images, shape (N, H, W).
        y_train: Training labels; returned unchanged.
        X_test: Floating-point tensor of test images, shape (M, H, W).
        y_test: Test labels; returned unchanged.
        use_global_std: Selects the scale statistic.
            ``False`` (default) keeps the historical behaviour: the *mean of
            the per-image standard deviations*. That value is not the standard
            deviation of the dataset, so the normalized data does not have unit
            std (the recorded runs of this notebook report 1.02-1.10).
            ``True`` uses the standard deviation over all training pixels, so
            the normalized training set has std ~= 1. Switching this changes
            the saved tensors, so keep the default when the output must match
            previously generated files.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where the image tensors
        are normalized and have shape (N, 3, H, W) / (M, 3, H, W).

    Raises:
        ValueError: If ``X_train`` is empty or the selected std is zero
            (normalization would divide by zero).
    """
    if X_train.shape[0] == 0:
        raise ValueError("X_train must contain at least one image")

    # Raw statistics of the training images, printed for the notebook record.
    print(X_train.shape)
    print(X_train.mean())
    print(X_train.std())

    # Mean of the per-image means (expression kept as-is so results stay
    # identical to previously generated data).
    mean = X_train.mean(axis=(1,2)).sum(0) / X_train.shape[0]
    if use_global_std:
        std = X_train.std()
    else:
        # Legacy statistic: average of the per-image stds (see docstring).
        std = X_train.std(axis=(1,2)).sum(0) / X_train.shape[0]

    if std == 0:
        raise ValueError("std evaluated to zero, normalization would divide by zero")

    def _standardize_and_replicate(images):
        # Normalize the single grayscale plane first, then copy it into three
        # channels. Every channel used the same mean/std, so this yields the
        # same values as normalizing the replicated batch while avoiding an
        # extra full-size copy of the 3-channel tensor.
        normalized = (images - mean) / std
        return normalized.unsqueeze(1).repeat(1, 3, 1, 1)

    return _standardize_and_replicate(X_train), y_train, _standardize_and_replicate(X_test), y_test

### Split training set into train and validation sets
def split_dataset(X: torch.Tensor, y: torch.Tensor, val_fraction: float = 0.1, seed: int = 0):
    """Randomly split a dataset into training and validation parts.

    The samples are shuffled with a permutation drawn from a generator seeded
    with ``seed``; the first ``floor(len(X) * val_fraction)`` shuffled samples
    form the validation set and the rest the training set.

    A private ``torch.Generator`` is used, so calling this function does not
    reseed or advance PyTorch's global random state.

    Args:
        X: Samples, indexed along dimension 0.
        y: Labels, with the same length as ``X`` along dimension 0.
        val_fraction: Fraction of samples placed in the validation set
            (default 0.1).
        seed: Seed for the shuffling permutation (default 0).

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val)``. Each tensor is a fresh copy
        (not a view of the inputs) and is detached from any autograd graph.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``val_fraction`` is
            outside [0, 1].
    """
    num_samples = X.size()[0]
    if y.size()[0] != num_samples:
        raise ValueError(
            "X and y must have the same number of samples, got {} and {}".format(
                num_samples, y.size()[0]))
    if not 0.0 <= val_fraction <= 1.0:
        raise ValueError("val_fraction must be within [0, 1], got {}".format(val_fraction))

    generator = torch.Generator().manual_seed(seed)
    splitnum = math.floor(num_samples * val_fraction)
    indices = torch.randperm(num_samples, generator=generator)
    val_indices, train_indices = indices[:splitnum], indices[splitnum:]

    # Indexing with an index tensor already returns a new tensor that owns
    # exactly its own data, so no full permuted copy and no extra clone are
    # needed before the splits are saved.
    X_val = X[val_indices].detach()
    y_val = y[val_indices].detach()
    X_train = X[train_indices].detach()
    y_train = y[train_indices].detach()
    print("trainsplit size: ", X_train.size(), " valsplit size: ", X_val.size())

    return X_train, y_train, X_val, y_val

## Process Kuzushiji-49 MNIST Splits

In [5]:
# Locations of the arrays unpacked from the downloaded K49 archives.
filenames = {
    'X_train': "./k49-train-imgs-extracted/arr_0.npy",
    'y_train': "./k49-train-labels-extracted/arr_0.npy",
    'X_test': "./k49-test-imgs-extracted/arr_0.npy",
    'y_test': "./k49-test-labels-extracted/arr_0.npy",
}

# Images are cast to float32 via .float(); labels keep the dtype stored on disk.
X_train = torch.from_numpy(np.load(filenames["X_train"])).float()
y_train = torch.from_numpy(np.load(filenames["y_train"]))
X_test = torch.from_numpy(np.load(filenames["X_test"])).float()
y_test = torch.from_numpy(np.load(filenames["y_test"]))

# Normalize with training statistics, then carve the validation split out of
# the training data.
X_train, y_train, X_test, y_test = prepare_data(X_train, y_train, X_test, y_test)
X_train, y_train, X_valid, y_valid = split_dataset(X_train, y_train)

# Report the statistics of each split after normalization.
for x in (X_train, X_valid, X_test):
    print("X resulting mean {} and std {}".format(x.mean(), x.std()))

# Write every split as an X.pt / y.pt pair inside its own directory.
for split_dir, split_images, split_labels in (
    ("./data/K49/train", X_train, y_train),
    ("./data/K49/valid", X_valid, y_valid),
    ("./data/K49/test", X_test, y_test),
):
    os.makedirs(split_dir, exist_ok=True)
    torch.save(split_images, split_dir + "/X.pt")
    torch.save(split_labels, split_dir + "/y.pt")


torch.Size([232365, 28, 28])
tensor(45.9159)
tensor(87.2451)
trainsplit size:  torch.Size([209129, 3, 28, 28])  valsplit size:  torch.Size([23236, 3, 28, 28])
X resulting mean -0.00012973950651939958 and std 1.0398712158203125
X resulting mean 0.0011669847881421447 and std 1.0409965515136719
X resulting mean -0.01778692752122879 and std 1.02264404296875


## Process the Extended MNIST - Balanced Splits

In [6]:

# Read the four EMNIST-Balanced idx files directly from their gzip containers.
with gzip.open("./EMNIST-extracted/gzip/emnist-balanced-train-images-idx3-ubyte.gz", 'rb') as f:
    X_train = idx2numpy.convert_from_file(f)
with gzip.open("./EMNIST-extracted/gzip/emnist-balanced-train-labels-idx1-ubyte.gz", 'rb') as f:
    y_train = idx2numpy.convert_from_file(f)
with gzip.open("./EMNIST-extracted/gzip/emnist-balanced-test-images-idx3-ubyte.gz", 'rb') as f:
    X_test = idx2numpy.convert_from_file(f)
with gzip.open("./EMNIST-extracted/gzip/emnist-balanced-test-labels-idx1-ubyte.gz", 'rb') as f:
    y_test = idx2numpy.convert_from_file(f)

# Copy the arrays before handing them to torch.from_numpy: the arrays returned
# by idx2numpy are presumably backed by a read-only buffer (torch.from_numpy
# warns about non-writable arrays), and a tensor must not alias such memory.
# np.array() always yields a writable array that the tensor can safely share.
X_train, y_train = torch.from_numpy(np.array(X_train)).float(), torch.from_numpy(np.array(y_train))
X_test, y_test = torch.from_numpy(np.array(X_test)).float(), torch.from_numpy(np.array(y_test))

# Normalize with training statistics, then carve the validation split out of
# the training data.
X_train, y_train, X_test, y_test = prepare_data(X_train, y_train, X_test, y_test)
X_train, y_train, X_valid, y_valid = split_dataset(X_train, y_train)

# Report the statistics of each split after normalization.
for x in X_train, X_valid, X_test:
    print("X resulting mean {} and std {}".format(x.mean(), x.std()))

# Write every split as an X.pt / y.pt pair inside its own directory.
for split_dir, split_images, split_labels in (
    ("./data/EMNIST/train", X_train, y_train),
    ("./data/EMNIST/valid", X_valid, y_valid),
    ("./data/EMNIST/test", X_test, y_test),
):
    os.makedirs(split_dir, exist_ok=True)
    torch.save(split_images, split_dir + "/X.pt")
    torch.save(split_labels, split_dir + "/y.pt")


  X_train, y_train = torch.from_numpy(X_train).float(), torch.from_numpy(y_train)


torch.Size([112800, 28, 28])
tensor(44.6516)
tensor(84.9755)
trainsplit size:  torch.Size([101520, 3, 28, 28])  valsplit size:  torch.Size([11280, 3, 28, 28])
X resulting mean 7.095682667568326e-05 and std 1.020035743713379
X resulting mean -0.0006388546316884458 and std 1.019400715827942
X resulting mean 0.001280162250623107 and std 1.0206949710845947


## Process and Save FashionMNIST Splits

In [8]:
# Read the four FashionMNIST idx files directly from their gzip containers.
with gzip.open("./train-images-idx3-ubyte.gz", 'rb') as f:
    X_train = idx2numpy.convert_from_file(f)
with gzip.open("./train-labels-idx1-ubyte.gz", 'rb') as f:
    y_train = idx2numpy.convert_from_file(f)
with gzip.open("./t10k-images-idx3-ubyte.gz", 'rb') as f:
    X_test = idx2numpy.convert_from_file(f)
with gzip.open("./t10k-labels-idx1-ubyte.gz", 'rb') as f:
    y_test = idx2numpy.convert_from_file(f)

# Copy the arrays before handing them to torch.from_numpy: the arrays returned
# by idx2numpy are presumably backed by a read-only buffer (torch.from_numpy
# warns about non-writable arrays), and a tensor must not alias such memory.
# np.array() always yields a writable array that the tensor can safely share.
X_train, y_train = torch.from_numpy(np.array(X_train)).float(), torch.from_numpy(np.array(y_train))
X_test, y_test = torch.from_numpy(np.array(X_test)).float(), torch.from_numpy(np.array(y_test))

# Normalize with training statistics, then carve the validation split out of
# the training data.
X_train, y_train, X_test, y_test = prepare_data(X_train, y_train, X_test, y_test)
X_train, y_train, X_valid, y_valid = split_dataset(X_train, y_train)

# Report the statistics of each split after normalization.
for x in X_train, X_valid, X_test:
    print("X resulting mean {} and std {}".format(x.mean(), x.std()))

# Write every split as an X.pt / y.pt pair inside its own directory.
for split_dir, split_images, split_labels in (
    ("./data/FashionMNIST/train", X_train, y_train),
    ("./data/FashionMNIST/valid", X_valid, y_valid),
    ("./data/FashionMNIST/test", X_test, y_test),
):
    os.makedirs(split_dir, exist_ok=True)
    torch.save(split_images, split_dir + "/X.pt")
    torch.save(split_labels, split_dir + "/y.pt")


torch.Size([60000, 28, 28])
tensor(72.9404)
tensor(90.0212)
trainsplit size:  torch.Size([54000, 3, 28, 28])  valsplit size:  torch.Size([6000, 3, 28, 28])
X resulting mean -0.0005339629133231938 and std 1.101234793663025
X resulting mean 0.004804421681910753 and std 1.105269432067871
X resulting mean 0.0025234385393559933 and std 1.099829912185669
