In [17]:
import os
import zipfile

import numpy as np
import torch
import torchvision as tv

In [18]:
# Paths used throughout the notebook (all relative to the working directory).
MAIN_DIR = 'data/leukemia/'               # root folder of the leukemia data
DATA_DIR = MAIN_DIR + 'Original/'         # folder holding the extracted images
ZIP_FILE = MAIN_DIR + 'leukemia.zip'      # archive the images are extracted from

In [19]:
# Extract the archive only when DATA_DIR is missing or empty.
if os.path.isdir(DATA_DIR) and len(os.listdir(DATA_DIR)) > 0:
    print('Data already extracted!\nSkipping ....')
else:
    # zipfile raises (e.g. FileNotFoundError, BadZipFile) when extraction
    # cannot be done, so the success message below is only printed after the
    # files were really written. It also removes the dependency on an
    # external `unzip` binary and on shell string interpolation.
    with zipfile.ZipFile(ZIP_FILE) as archive:
        archive.extractall(MAIN_DIR)
    print('\n\nData extracted!')

Data already extracted!
Skipping ....


In [20]:
# Collect the path of every file found under DATA_DIR.
# os.walk returns entries in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make the image order reproducible across machines.
# With the class folders of this dataset the sorted order groups the images
# as Benign, Early, Pre, Pro, i.e. in the same order as the label codes.
DATA_PATHS = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(DATA_DIR)
    for filename in filenames  # os.walk only lists non-directory entries here
)
          
# Count the images per folder: map the last path component of every folder
# under DATA_DIR that directly contains files to its number of files.
nImages = {
    folder.split('/')[-1]: len(files)
    for folder, _, files in os.walk(DATA_DIR)
    if files
}
print(nImages)

{'Pre': 963, 'Benign': 504, 'Pro': 804, 'Early': 985}


In [21]:
# Load every file listed in DATA_PATHS as a tensor using torchvision's reader.
images = list(map(tv.io.read_image, DATA_PATHS))

In [22]:
# Assign the labels to the images (each image sits in a folder named after its class)

# code:
#  0 --> Benign
#  1 --> Early
#  2 --> Pre
#  3 --> Pro
CLASS_CODES = {'Benign': 0, 'Early': 1, 'Pre': 2, 'Pro': 3}

data_len = len(DATA_PATHS)
labels = torch.zeros(data_len)  # float tensor; Dataset casts the labels to long

for i, img in enumerate(DATA_PATHS):
    # The class is the first folder below DATA_DIR. Matching that exact
    # component (instead of searching the whole path for a substring) cannot
    # be fooled by a class name appearing elsewhere in the path.
    class_name = os.path.relpath(img, DATA_DIR).split(os.sep)[0]
    if class_name not in CLASS_CODES:
        # Fail loudly rather than silently labelling unknown files as 'Pro'.
        raise ValueError(f'Cannot infer the class of {img!r}: unexpected folder {class_name!r}')
    labels[i] = CLASS_CODES[class_name]

In [23]:
# Intermediate step: stack the per-image tensors into one numpy array, which
# is then converted to a single torch tensor.
# np.stack requires every image to have the same shape and raises a clear
# error otherwise, instead of relying on numpy's generic sequence conversion.
images_np = np.stack([img.numpy() for img in images])

In [24]:
# from_numpy wraps the existing array without copying it (torch.tensor would
# duplicate the whole image stack in memory). The tensor shares its storage
# with images_np, so images_np should not be modified in place afterwards.
images_th = torch.from_numpy(images_np)

In [25]:
# Sanity check: display the dimensions of the stacked image tensor.
images_th.size()

torch.Size([3256, 3, 224, 224])

In [26]:
class Dataset(torch.utils.data.Dataset):
    """
    In-memory dataset pairing a tensor of images with a tensor of labels.

    Args:
        images: tensor whose first dimension indexes the samples; it is
            converted to floating point with ``.float()``.
        labels: tensor with one entry per sample; it is converted to integer
            class indices with ``.long()``.

    Raises:
        ValueError: if ``images`` and ``labels`` do not contain the same
            number of samples.
    """

    def __init__(self, images, labels):
        # A length mismatch would otherwise pair images with the wrong labels
        # or only fail later with an IndexError during iteration.
        if len(images) != len(labels):
            raise ValueError(
                f'images and labels must have the same length, '
                f'got {len(images)} and {len(labels)}'
            )
        self.images = images.float()
        self.labels = labels.long()

    def __len__(self):
        """Return the number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        return self.images[idx], self.labels[idx]

In [27]:
# Wrap the full image tensor and its labels in the Dataset defined above.
my_dataset = Dataset(images=images_th, labels=labels)

In [28]:
# Persist the full dataset object next to the raw data.
# NOTE(review): this stores the whole Dataset instance, so loading the file
# presumably requires this Dataset class to be defined -- confirm in the loader.
full_dataset_path = f'{MAIN_DIR}leukemia_dataset.pt'
torch.save(my_dataset, full_dataset_path)

In [29]:
# Define a toy dataset with (up to) `size` images per class.
# Samples are selected by their label value instead of by positional offsets:
# offsets are only correct when the images are stored in Benign, Early, Pre,
# Pro order, which os.walk does not guarantee (the recorded nImages output
# lists the folders as Pre, Benign, Pro, Early).
size = 100


def _take_class(code, count):
    """Return the first `count` images and labels whose label equals `code`."""
    idx = torch.nonzero(labels == code).flatten()[:count]
    return images_th[idx], labels[idx]


# Label codes: 0 --> Benign, 1 --> Early, 2 --> Pre, 3 --> Pro
images_th_benign, labels_benign = _take_class(0, size)
images_th_early, labels_early = _take_class(1, size)
images_th_pre, labels_pre = _take_class(2, size)
images_th_pro, labels_pro = _take_class(3, size)

In [30]:
# Concatenate the per-class subsets along the sample dimension, keeping the
# images and the labels in the same class order.
image_parts = (images_th_benign, images_th_early, images_th_pre, images_th_pro)
label_parts = (labels_benign, labels_early, labels_pre, labels_pro)
images_combined = torch.cat(image_parts, dim=0)
labels_combined = torch.cat(label_parts, dim=0)

In [31]:
# Wrap the reduced image/label tensors in the Dataset defined above.
small_dataset = Dataset(images=images_combined, labels=labels_combined)

In [32]:
# Persist the reduced dataset object next to the raw data.
# NOTE(review): this stores the whole Dataset instance, so loading the file
# presumably requires this Dataset class to be defined -- confirm in the loader.
small_dataset_path = f'{MAIN_DIR}leukemia_small_dataset.pt'
torch.save(small_dataset, small_dataset_path)