In [1]:
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torchvision import transforms
import torchvision.models as models
import numpy as np
import pandas as pd
import os
from skimage import io
from skimage import color
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score
from scipy.special import softmax
import matplotlib.pyplot as plt
import itertools
import time
import pdb
import re
from zipfile import ZipFile

#device = torch.device('cuda')

# One seed value shared by every random number generator used in this notebook.
SEED = 1117

# Seed numpy's and torch's global generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [2]:
def make_df(root, path):
    """Index the labelled images stored under one split folder of a zip archive.

    Args:
        root (string): Path to the zip archive.
        path (string): Prefix, inside the archive, of the split folder that
            holds the class sub-folders (including the trailing slash).

    Returns:
        pandas.DataFrame: One row per archive member whose name starts with
        ``path`` followed by a class-folder name. Columns are 'filename'
        (base name of the member) and 'label' (0 = EOSINOPHIL,
        1 = LYMPHOCYTE, 2 = MONOCYTE, 3 = NEUTROPHIL). The frame is empty
        (but still has both columns) when nothing matches.
    """
    # The position in this tuple is the integer label assigned to the class.
    class_folders = ('EOSINOPHIL', 'LYMPHOCYTE', 'MONOCYTE', 'NEUTROPHIL')

    rows = []
    with ZipFile(root, 'r') as zfolder:
        for member in zfolder.namelist():
            basename = os.path.basename(member)
            # Directory entries end with '/', so their base name is empty.
            # They are not images and must not become rows.
            if not basename:
                continue
            for label, folder in enumerate(class_folders):
                if member.startswith(path + folder):
                    rows.append([basename, label])
                    break
    return pd.DataFrame(rows, columns = ['filename', 'label'])

# Source 1: https://thispointer.com/python-how-to-get-the-list-of-all-files-in-a-zip-archive/
# Source 2: https://stackoverflow.com/questions/49625350/list-all-files-inside-a-folder-in-a-zip-file-in-python
# Source 3: https://stackoverflow.com/questions/16091904/python-zip-how-to-eliminate-absolute-path-in-zip-archive-if-absolute-paths-for

In [3]:
# The test set is ~20% of all images, so 25% of TRAIN (also ~20% of all images) is held out for validation
def train_validate_split(df, train_size=0.75):
    """Split ``df`` into train and validate frames, one class at a time.

    The rows of each label (0, 1, 2, 3, in that order) are split separately
    with ``train_test_split`` using a fixed ``random_state`` of 1117, and the
    per-class pieces are then concatenated.

    Args:
        df (pandas.DataFrame): Frame with a 'label' column.
        train_size: Forwarded to ``train_test_split`` for every class.

    Returns:
        tuple: ``(train, validate)`` DataFrames.
    """
    train_parts = []
    validate_parts = []

    for label in (0, 1, 2, 3):
        class_rows = df.loc[df['label'] == label]
        kept, held_out = train_test_split(class_rows, train_size = train_size, random_state = 1117)
        train_parts.append(kept)
        validate_parts.append(held_out)

    return pd.concat(train_parts, axis = 0), pd.concat(validate_parts, axis = 0)

In [4]:
class BloodDataset(Dataset):
    """Blood Cell Images from https://www.kaggle.com/paultimothymooney/blood-cells."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with the filename information.
                It must provide a 'filename' and a 'label' column.
            root_dir (string): Directory with all the images, organised in one
                sub-folder per class.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        # Indexed by label: label i lives in self.subfolder[i].
        self.subfolder = ['EOSINOPHIL/', 'LYMPHOCYTE/', 'MONOCYTE/', 'NEUTROPHIL/']
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the csv file."""
        return len(self.data_frame)

    @staticmethod
    def _standardize(image):
        """Shift ``image`` to zero mean and scale it to unit standard deviation.

        A uniform image has a standard deviation of zero; dividing by it would
        turn the whole sample into NaNs, so in that case only the centring is
        applied (which yields an all-zero image).
        """
        centered = image - image.mean()
        spread = image.std()
        if spread == 0:
            return centered
        return centered / spread

    def __getitem__(self, idx):
        """Load, transpose and standardise the image at row ``idx``.

        Args:
            idx (int or one-element tensor): Positional row index.

        Returns:
            dict: ``{'x': image, 'y': label}`` where ``image`` is
            channel-first and ``label`` is the value of the 'label' column.
        """
        # Some samplers hand over tensor indices; positional lookup needs a plain int.
        if torch.is_tensor(idx):
            idx = idx.item()

        image_class = self.data_frame['label'].iloc[idx]
        img_name = os.path.join(self.root_dir,
                                self.subfolder[image_class],
                                self.data_frame['filename'].iloc[idx])

        image = io.imread(img_name)

        # Move the channel axis first, e.g. (240, 320, 3) -> (3, 240, 320).
        # A transpose is used rather than np.repeat, which would not be a clean transpose.
        image = np.transpose(image, (2, 0, 1))

        image = self._standardize(image)

        if self.transform:
            image = self.transform(image)

        return {'x': image, 'y': image_class}

In [5]:
# Both locations refer to the data exactly as downloaded from the source (unmodified):
# `root` is the archive, `partial_path` the image folder inside it.
root = './blood-cells.zip'
partial_path = 'dataset2-master/dataset2-master/images/'
full_path = '/'.join([root, partial_path])

# Index the three split folders found in the archive.
train_raw = make_df(root, partial_path + 'TRAIN/')
test = make_df(root, partial_path + 'TEST/')
test_simple = make_df(root, partial_path + 'TEST_SIMPLE/')

# Carve the validation set out of the TRAIN folder.
train, validate = train_validate_split(train_raw)

train_raw.shape, train.shape, validate.shape, test.shape, test_simple.shape

((9957, 2), (7466, 2), (2491, 2), (2487, 2), (71, 2))

In [6]:
# Write every split to its own .csv file (the DataFrame index is written as well).
for split_frame, csv_name in ((train, 'train.csv'),
                              (validate, 'validate.csv'),
                              (test, 'test.csv'),
                              (test_simple, 'test_simple.csv')):
    split_frame.to_csv(csv_name, sep=',', encoding='utf-8')

In [7]:
# Disabled class-weighting / optimizer setup, kept for reference.
# NOTE(review): three weights are listed here, but BloodDataset has four class folders.
# weights = [(654 / 78), 1.0, (654/149)]
# class_weights = torch.tensor(weights).to(device, dtype = torch.float)

# optimizer = optim.Adam(model.parameters(), lr=0.0001)
# loss = torch.nn.CrossEntropyLoss(weight = class_weights)

num_epochs = 20
batch_size = 4

# One dataset per split; every image folder is given with a trailing slash.
dataset = {'train': BloodDataset('train.csv', full_path + 'TRAIN/'),
           'validate': BloodDataset('validate.csv', full_path + 'TRAIN/'),
           'test': BloodDataset('test.csv', full_path + 'TEST/'),
           'test_simple': BloodDataset('test_simple.csv', full_path + 'TEST_SIMPLE/')}

# Only the training split is shuffled. The evaluation splits are read in file
# order so their batches are reproducible and do not consume random numbers.
dataloader = {split: DataLoader(dataset[split], batch_size = batch_size,
                                shuffle = (split == 'train'), num_workers = 0)
              for split in dataset}
# lambda_func = lambda epoch: 0.5 ** epoch
# scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda_func)

In [8]:
# Ensure that our dataloader works as intended -- it does!
# A single batch is drawn so that both shapes describe the same samples
# and the images are only loaded once.
sample_batch = next(iter(dataloader['train']))
sample_batch['x'].shape, sample_batch['y'].shape

(torch.Size([4, 3, 240, 320]), torch.Size([4]))