In [1]:
import os
import zipfile

import numpy as np
import torch
import torchvision as tv

In [2]:
# Configuration: where the dataset archive lives and where its images are read from.
ZIP_FILE = "BrainCancer.zip"    # archive handed to the extraction step
DATA_DIR = "data/Brain Cancer"  # root folder that is walked to collect the image paths

In [3]:
# Unpack the dataset archive unless the image folder is already populated.
# Testing DATA_DIR itself does not depend on how many bookkeeping files
# (.gitkeep, .gitattributes, saved .pt datasets, ...) happen to sit in ./data.
# NOTE(review): assumes the archive unpacks into DATA_DIR -- confirm.
if os.path.isdir(DATA_DIR) and os.listdir(DATA_DIR):
    print('Data already extracted!\nskipping ....')
else:
    # zipfile raises on a missing or corrupt archive, so the success message
    # below is only printed when the extraction really completed.
    with zipfile.ZipFile(ZIP_FILE) as archive:
        archive.extractall('./data')
    print('\n\nData extracted!')

Data already extracted!
skipping ....


In [4]:
# Collect the path of every file found under DATA_DIR.
# Directory and file names are sorted so the resulting order is reproducible;
# os.walk otherwise yields entries in whatever order the file system lists them.
DATA_PATHS = []  # list of image file paths
for dirname, subdirs, filenames in os.walk(DATA_DIR):
    subdirs.sort()  # in-place sort steers the order of the top-down traversal
    # `filenames` only ever holds non-directory entries, so no isdir() filter is needed
    DATA_PATHS.extend(os.path.join(dirname, filename) for filename in sorted(filenames))

In [5]:
# Decode every file listed in DATA_PATHS into a tensor via torchvision,
# keeping the same order as DATA_PATHS.
images = list(map(tv.io.read_image, DATA_PATHS))

In [6]:
# Derive one numeric class code per image from its path
# (files are named with the correct classification).

# code:
#  0 --> no keyword found in the path (initial value, left untouched)
#  1 --> brain glioma
#  2 --> meningioma
#  3 --> tumor

# Checked in this order; the first keyword contained in the path decides the code.
CLASS_KEYWORDS = (('glioma', 1), ('menin', 2), ('tumor', 3))

data_len = len(DATA_PATHS)
labels = torch.zeros(data_len)

for position, path in enumerate(DATA_PATHS):
    for keyword, code in CLASS_KEYWORDS:
        if keyword in path:
            labels[position] = code
            break

In [7]:
# Pack the list of image tensors into one NumPy array first. The tensor
# conversion in the following cell does not work without this detour
# (presumably torch.tensor() cannot consume a list of multi-element
# tensors -- TODO confirm).
images_np = np.asarray(images)

In [8]:
# Wrap the NumPy array as a tensor without copying it. torch.tensor() always
# duplicates its input, which doubles peak memory for the whole image stack;
# torch.from_numpy() reuses the same buffer, so images_th and images_np alias
# each other -- do not modify either of them in place afterwards.
images_th = torch.from_numpy(images_np)

In [9]:
# Sanity check of the stacked tensor's dimensions.
images_th.size()

torch.Size([15000, 3, 512, 512])

In [10]:
class Dataset(torch.utils.data.Dataset):
    """
    In-memory dataset pairing each sample with its label.

    Both inputs are converted to float32 tensors once, at construction time,
    and are then indexed along their first dimension.

    Args:
        x: samples; a tensor or anything ``torch.as_tensor`` accepts.
        y: labels, one per sample; a tensor or anything ``torch.as_tensor`` accepts.

    Raises:
        ValueError: if ``x`` and ``y`` do not hold the same number of entries.
    """

    def __init__ (self, x, y):
        # enforce to use float32 or some models will complain;
        # for tensor inputs this is the same conversion as `.float()`
        self.x = torch.as_tensor(x, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)
        # fail early instead of with an IndexError (or silently ignored
        # samples) once the dataset is being iterated
        if len(self.x) != len(self.y):
            raise ValueError(
                f'x and y must have the same length, got {len(self.x)} and {len(self.y)}'
            )

    def __len__ (self):
        """Number of (sample, label) pairs."""
        return len(self.y)

    def __getitem__ (self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]

In [11]:
# Full dataset: every loaded image together with its class code.
my_dataset = Dataset(x=images_th, y=labels)

In [15]:
# Persist the full dataset object to disk.
torch.save(obj=my_dataset, f='data/brain_cancer_dataset.pt')

In [16]:
# Define a toy_dataset with smaller numbers:
# take the first TOY_SAMPLES_PER_CLASS images of each class, located through
# their label instead of through fixed offsets (0 / 5000 / 10000). Fixed
# offsets only hit the intended class when every class holds exactly 5000
# images and the files were listed in glioma, meningioma, tumor order.
TOY_SAMPLES_PER_CLASS = 100


def first_indices_of_class(class_code, count=TOY_SAMPLES_PER_CLASS):
    """Return the indices of the first `count` entries of `labels` equal to `class_code`.

    Fewer than `count` indices are returned when the class has fewer entries.
    """
    return torch.nonzero(labels == class_code).flatten()[:count]


glioma_idx = first_indices_of_class(1)      # 1 --> brain glioma
meningioma_idx = first_indices_of_class(2)  # 2 --> meningioma
tumor_idx = first_indices_of_class(3)       # 3 --> tumor

images_th_glioma = images_th[glioma_idx]
images_th_meningioma = images_th[meningioma_idx]
images_th_tumor = images_th[tumor_idx]
labels_glioma = labels[glioma_idx]
labels_meningioma = labels[meningioma_idx]
labels_tumor = labels[tumor_idx]

In [17]:
# Join the three per-class subsets along the first dimension
# (glioma, then meningioma, then tumor), images and labels in the same order.
images_combined = torch.cat([images_th_glioma, images_th_meningioma, images_th_tumor])
labels_combined = torch.cat([labels_glioma, labels_meningioma, labels_tumor])

In [18]:
# Toy dataset built from the combined per-class subsets.
small_dataset = Dataset(x=images_combined, y=labels_combined)

In [19]:
# Persist the toy dataset object to disk.
torch.save(obj=small_dataset, f='data/brain_cancer_dataset_small.pt')