In [95]:
import numpy as np
import os, glob

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset

# Seed for NumPy's global RNG; also the default `seed` argument of
# extract_save_train_valid_test (used there as the split's random_state).
seed = 42
np.random.seed(seed)

# When True, the pre-processing code prints array shapes and save locations.
verbose = True
# When True, only the first globbed training file is loaded.
subset = True
# Passed as `test_size` (validation fraction) by extract_save_train_valid_test.
subset_fraction = 0.10

In [65]:
def load_data(file_list, v=True):
    """Load and combine image/label arrays from a list of ``.npz`` files.

    Every archive must contain the arrays ``"images"`` and ``"labels"``.

    Args:
        file_list: Sequence of paths to ``.npz`` archives.
        v: If True, print the list of files being loaded.

    Returns:
        Tuple ``(Images, Labels)``: the images of all files stacked with
        ``np.vstack`` and the labels joined with ``np.concatenate``, both in
        the order given by ``file_list``.

    Raises:
        ValueError: If ``file_list`` is empty.
    """
    if v:
        print("Loading data from:", file_list, end="\n")

    # Fail with a clear message instead of the cryptic error np.vstack
    # raises when handed an empty list.
    if len(file_list) == 0:
        raise ValueError("load_data received an empty file list; nothing to load")

    images_list, labels_list = [], []
    for path in file_list:
        # Open each archive exactly once and close its file handle as soon as
        # both arrays are read; indexing materialises the arrays in memory, so
        # they stay valid after the archive is closed.
        with np.load(path) as archive:
            images_list.append(archive["images"])
            labels_list.append(archive["labels"])

    # Stack
    Images, Labels = np.vstack(images_list), np.concatenate(labels_list)

    return Images, Labels

In [None]:
def extract_save_train_valid_test(dataDir, outDir, subset=True, subset_fraction=0.10, seed=seed):
    """Build train/validation/test ``TensorDataset`` files from raw ``.npz`` data.

    Training files matching ``train*.npz`` and test files matching
    ``test*.npz`` are read from ``dataDir``. The training data is split into
    training and validation parts, everything is converted to torch tensors,
    and the datasets are saved as ``train.pt``, ``valid.pt`` and ``test.pt``
    in ``outDir``.

    Progress messages are controlled by the module-level ``verbose`` flag.

    Args:
        dataDir: Directory with training and testing numpy array files
        outDir: Directory to save pre-processed train, validation and test
            dataset tensor files (created if it does not exist)
        subset: If True, load only the first training file (sorted by name)
        subset_fraction: Fraction of training dataset to use for validation
        seed: Seed used for training / validation split

    Returns:
        Nothing

    Raises:
        ValueError: If no training or no test files are found in ``dataDir``.
    """
    # glob returns matches in arbitrary order; sort so that the file picked
    # under `subset` (and the stacking order) is reproducible across runs.
    train_list = sorted(glob.glob(os.path.join(dataDir, "train*.npz")))
    test_list = sorted(glob.glob(os.path.join(dataDir, "test*.npz")))

    if not train_list:
        raise ValueError(f"No train*.npz files found in {dataDir}")
    if not test_list:
        raise ValueError(f"No test*.npz files found in {dataDir}")

    if subset:
        train_list = train_list[0:1]

    # Training and validation dataset
    X_train, y_train = load_data(train_list)
    # Hold out `subset_fraction` of the training data for validation
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=subset_fraction, random_state=seed
    )

    # Test dataset
    X_test, y_test = load_data(test_list)

    if verbose:
        print(f"Held out {subset_fraction} of training dataset for validation ...")
        print(f"X_train {X_train.shape}, y_train {y_train.shape}")
        print(f"X_valid {X_valid.shape}, y_valid {y_valid.shape}")
        print(f"X_test {X_test.shape}, y_test {y_test.shape}")

    # Convert to tensor
    X_train, y_train = torch.from_numpy(X_train), torch.from_numpy(y_train)
    X_valid, y_valid = torch.from_numpy(X_valid), torch.from_numpy(y_valid)
    X_test, y_test = torch.from_numpy(X_test), torch.from_numpy(y_test)

    # Save as torch dataset
    if verbose:
        print(f"Saving train.pt, valid.pt and test.pt to {outDir}")

    # torch.save does not create missing directories
    os.makedirs(outDir, exist_ok=True)
    torch.save(TensorDataset(X_train, y_train), os.path.join(outDir, "train.pt"))
    torch.save(TensorDataset(X_valid, y_valid), os.path.join(outDir, "valid.pt"))
    torch.save(TensorDataset(X_test, y_test), os.path.join(outDir, "test.pt"))

In [98]:
dataDir = "../data/raw/corruptmnist/"
outDir = "../data/processed/"
# Fraction of the loaded training data held out for validation in this cell.
# Named so that the split and the log message below cannot disagree.
valid_fraction = 0.20

# glob returns matches in arbitrary order; sort so that the file picked under
# `subset` is the same on every run.
train_list = sorted(glob.glob(dataDir + "/train*.npz"))
test_list = sorted(glob.glob(dataDir + "/test*.npz"))

if subset:
    train_list = train_list[0:1]

# Training with train and validation split
X_train, y_train = load_data(train_list)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=valid_fraction, random_state=seed
)

# Testing
X_test, y_test = load_data(test_list)

if verbose:
    print(f"Held out {valid_fraction} of training dataset for validation ...")
    print(f"X_train {X_train.shape}, y_train {y_train.shape}")
    print(f"X_valid {X_valid.shape}, y_valid {y_valid.shape}")
    print(f"X_test {X_test.shape}, y_test {y_test.shape}")

# Convert to tensor
X_train, y_train = torch.from_numpy(X_train), torch.from_numpy(y_train)
X_valid, y_valid = torch.from_numpy(X_valid), torch.from_numpy(y_valid)
X_test, y_test = torch.from_numpy(X_test), torch.from_numpy(y_test)

# Save as torch datasets
if verbose:
    print(f"Saving train.pt, valid.pt and test.pt to {outDir}")
# torch.save does not create missing directories
os.makedirs(outDir, exist_ok=True)
torch.save(TensorDataset(X_train, y_train), outDir + "/train.pt")
torch.save(TensorDataset(X_valid, y_valid), outDir + "/valid.pt")
torch.save(TensorDataset(X_test, y_test), outDir + "/test.pt")

Loading data from: ['../data/raw/corruptmnist/train_1.npz']
Loading data from: ['../data/raw/corruptmnist/test.npz']
Subsampled 0.1 of training dataset ...
X_train (4000, 28, 28), y_train (4000,)
X_valid (1000, 28, 28), y_valid (1000,)
X_test (5000, 28, 28), y_test (5000,)
Saving train.pt, valid.pt and test.pt to ../data/processed/


In [97]:
# Reload the saved test split to check that the file reads back as a TensorDataset.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
# Newer PyTorch releases default to weights_only=True and may refuse a pickled
# TensorDataset — confirm against the installed version.
torch.load(outDir + "/test.pt")

<torch.utils.data.dataset.TensorDataset at 0x7fcf4fbac5e0>

In [74]:
# Wrap the training images and labels in a TensorDataset for inspection.
ds = TensorDataset(X_train, y_train)

In [77]:
# Unpack the tensors stored in the dataset, in the order they were passed to
# the TensorDataset constructor (`a` = images, `b` = labels).
a, b = ds.tensors

In [47]:
# Display the shapes of the training and validation splits.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((4000, 28, 28), (4000,), (1000, 28, 28), (1000,))

In [87]:
# Pull the first item from `dl` to inspect it.
# NOTE(review): `dl` is not defined in any visible cell — presumably a
# DataLoader built over `ds`; define it before this cell so the notebook
# survives Restart & Run All.
next(iter(dl))

[tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 