## Load relevant packages

In [None]:
# Import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2

import tarfile
from os.path import isdir, isfile
from os import remove

import scipy.ndimage
from sklearn.preprocessing import LabelBinarizer
from tqdm import tqdm

from joblib import Parallel, delayed
import multiprocessing

%matplotlib inline

### Find path names of files and prepare one-hot encoder

We begin by listing all image-file paths to prepare for training. We also create a one-hot encoder for later use.

In [None]:
# Folders that are searched (including their subfolders) for training images.
# The commented-out list holds further folders that can be appended.
training_folders = ["./Data/train"] #+ ["./Data/Type_1", "./Data/Type_2", "./Data/Type_3"]
# Folder that is searched for the testing images
testing_folder = "./Data/test"

def all_image_paths(folderpath):
    """
    Walks folderpath (including all subfolders) and collects the path of every
    file whose name contains 'jpg'.

    Returns a list of lists: one sublist per visited folder that holds at least one
    matching file. Each entry is the folder path joined to the filename with "/".
    Folders without any matching file contribute no sublist.
    """
    paths_per_folder = []
    for directory, _subdirectories, filenames in os.walk(folderpath):
        matching_paths = [directory + "/" + filename for filename in filenames if "jpg" in filename]
        # A folder that yields no image paths is left out entirely
        if matching_paths:
            paths_per_folder.append(matching_paths)
    return paths_per_folder


# Collect the per-folder lists of image paths: the training lists come from every
# entry of training_folders (concatenated in order), the testing lists from testing_folder.
training_pathnames = [
    paths_in_folder
    for folder in training_folders
    for paths_in_folder in all_image_paths(folder)
]
testing_pathnames = all_image_paths(testing_folder)

def get_Type(filepath): #formerly get_letter
    """
    Returns the type corresponding to an image found in filepath.

    The type is the name of the folder that directly contains the image, e.g.
    "./Data/train/Type_1/12.jpg" gives "Type_1". The complete folder name is
    returned whatever its length. If filepath contains no "/" (so there is no
    containing folder), an empty string is returned.
    """
    # Split off the filename; what is left is the path of the containing folder
    folder, separator, _filename = filepath.rpartition("/")
    if not separator:
        return ""
    # The last path component of the folder path is the folder's own name
    return folder.rpartition("/")[2]

# Every folder holds images of a single cervical type, so the first image of each
# folder is enough to list the types that occur in the training data.
all_Types = np.sort([get_Type(paths_in_folder[0]) for paths_in_folder in training_pathnames])

# Fit the encoder that is later used to one-hot-encode Types into arrays
enc = LabelBinarizer().fit(all_Types)

def one_hot_encode(list_of_types):
    """
    Encodes a list of Types with the fitted label binarizer `enc` and returns
    the encoded labels, one row per Type in list_of_types.
    """
    encoded_types = enc.transform(list_of_types)
    return encoded_types

# Flatten the per-folder lists into one flat array of path names each
training_pathnames = np.array([path for paths_in_folder in training_pathnames for path in paths_in_folder])
testing_pathnames = np.array([path for paths_in_folder in testing_pathnames for path in paths_in_folder])

# The training images should not be visited folder by folder, so their order is
# shuffled. The fixed seed makes the shuffle repeatable for the same input order.
np.random.seed(42)
training_pathnames = np.random.permutation(training_pathnames)
print("We are training on {} images".format(len(training_pathnames)))
print("We are testing on {} images".format(len(testing_pathnames)))

### Load images and labels into arrays, save to disk

We normalize the pixel-values to lie between 0 and 1. We also resize each image to a common shape, so that every image is a 3-dimensional array: (x_length, y_length, color_channels).

Finally, we save the arrays to disk.

#### Choose an appropriate resizing of the images

In [None]:
# The input is a list of matrices describing the pixels. The matrices are not required to all have the same resolution.
def drawSlices(list_of_image_matrices, figsizes=(16,4)):
    """
    Draws the given images in a grid with 6 columns and as many rows as needed.

    figsizes gives (figure width, height per row). Returns the figure and the
    array of axes created by plt.subplots.
    """
    image_count = len(list_of_image_matrices)
    row_count = int(np.ceil(image_count / 6.))
    figure_size = (figsizes[0], figsizes[1] * row_count)
    fig, axes = plt.subplots(nrows=row_count, ncols=6, figsize=figure_size)
    flat_axes = axes.ravel()
    # One image per axis, shown without the (uninformative) ticks
    for axis, image in zip(flat_axes, list_of_image_matrices):
        axis.imshow(image, cmap="gray")
        axis.set_xticks([])
        axis.set_yticks([])
    # Axes beyond the last image have nothing to show, so they are switched off
    for unused_axis in flat_axes[image_count:]:
        unused_axis.axis("off")
    return fig, axes

In [None]:
# Preview a handful of the (shuffled) training images at their original resolution.
# The trailing semicolon keeps the notebook from echoing the returned (fig, axes).
drawSlices([scipy.ndimage.imread(impath) for impath in training_pathnames[7:13]]);

In [None]:
# Target shape used when resizing images
resize_shape = (150, 150, 3)

# Show one training image next to its resized version to judge how much detail is lost
imagearray = scipy.ndimage.imread(training_pathnames[17])
resized_imagearray = scipy.misc.imresize(imagearray, resize_shape)
drawSlices([imagearray, resized_imagearray]);

Resizing to between 150 and 300 pixels on the x and y axes seems appropriate: we don't lose too much relevant info.

In [None]:
def load_normalize_image(path, resize_shape=(150, 150, 3)):
    """
    Takes the directory path of an image and returns a normalized
    3-dimensional array representing that image.

    The image is read from path, resized to resize_shape and divided by 255.
    Returns None when the image cannot be read or resized (some images in the
    database are broken), so that callers can filter these out.
    """
    # The SciPy helpers are looked up outside the try-block on purpose: imread and
    # imresize no longer exist in newer SciPy releases, and a missing function must
    # fail loudly instead of being mistaken for a broken image (which would silently
    # drop every single file).
    imread = scipy.ndimage.imread
    imresize = scipy.misc.imresize
    try:
        # There is no need to reshape the image to be three-dimensional; they already are.
        # We do want to resize it however.
        imagearray = imresize(imread(path), resize_shape)
    except Exception:
        # Broken image. Exception (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit working during long loading runs.
        return None
    # Now we normalize it
    return imagearray / 255.
    
def array_all_images(list_of_path_names, parallelize=False):
    """
    Takes a list of directory paths of images and returns a list with the
    normalized pixel-data of those images, as produced by load_normalize_image.
    Images that could not be loaded (load_normalize_image returned None) are left out.

    With parallelize=True the images are loaded through joblib, using as many
    jobs as multiprocessing.cpu_count() reports.
    """
    if not parallelize:
        loaded_images = [load_normalize_image(image_path) for image_path in list_of_path_names]
    else:
        worker_count = multiprocessing.cpu_count()
        loading_jobs = (delayed(load_normalize_image)(image_path) for image_path in list_of_path_names)
        loaded_images = Parallel(n_jobs=worker_count)(loading_jobs)
    # Broken images come back as None and are dropped here
    return [image for image in loaded_images if image is not None]

def load_Type(path, resize_shape=(150, 150, 3)): #formerly load_letter
    """
    Takes the directory path of an image and returns the label of the image.

    The image is read and resized to resize_shape first, only to find out whether
    it can be loaded at all. Returns None when that fails (some images in the
    database are broken); otherwise returns get_Type(path).
    """
    # The SciPy helpers are looked up outside the try-block on purpose: imread and
    # imresize no longer exist in newer SciPy releases, and a missing function must
    # fail loudly instead of silently turning every label into None.
    imread = scipy.ndimage.imread
    imresize = scipy.misc.imresize
    try:
        imresize(imread(path), resize_shape)
    except Exception:
        # Broken image. Exception (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit working during long loading runs.
        return None
    # The image loaded fine, so its label is the name of its folder
    return get_Type(path)

def array_all_labels(list_of_path_names, parallelize=False):
    """
    Takes a list of directory paths of images and returns the encoded labels of
    those images, as produced by one_hot_encode. Images whose label could not be
    loaded (load_Type returned None) are left out.

    With parallelize=True the labels are loaded through joblib, using as many
    jobs as multiprocessing.cpu_count() reports.
    """
    if not parallelize:
        loaded_types = [load_Type(image_path) for image_path in list_of_path_names]
    else:
        worker_count = multiprocessing.cpu_count()
        loading_jobs = (delayed(load_Type)(image_path) for image_path in list_of_path_names)
        loaded_types = Parallel(n_jobs=worker_count)(loading_jobs)
    # Broken images come back as None and are dropped before encoding
    readable_types = [loaded_type for loaded_type in loaded_types if loaded_type is not None]
    return one_hot_encode(readable_types)

def batch_list(inputlist, batch_size):
    """
    Splits inputlist into consecutive batches of at most batch_size elements.
    Returns a list of the batches; each batch is a slice of inputlist, and only
    the last batch can be shorter than batch_size.
    """
    batches = []
    for batch_start in range(0, len(inputlist), batch_size):
        batch_end = batch_start + batch_size
        batches.append(inputlist[batch_start:batch_end])
    return batches

# Root folder for the preprocessed arrays, and the subfolders (appended to the
# root by plain string concatenation, hence the leading "/") for each data set
tensorflow_folder = "./TensorFlow_data"
training_subfolder = "/training_data"
testing_subfolder = "/testing_data"

# We store all the data in a training and testing folder
training_folder = tensorflow_folder + training_subfolder
# NOTE: this rebinds testing_folder, which until here named the folder holding the
# raw testing images; from this point on it names the output folder instead.
testing_folder = tensorflow_folder + testing_subfolder
# Create the output folders on the first run
if not os.path.exists(training_folder):
    os.makedirs(training_folder)
if not os.path.exists(testing_folder):
    os.makedirs(testing_folder)

# Make the input data for the testing set (only images are saved here, no labels),
# unless the file already exists
testingarrayimage_path = "/testing_images.npy"
testing_images_file = testing_folder + testingarrayimage_path
if not isfile(testing_images_file):
    # NOTE(review): array_all_images drops images that fail to load, so the saved
    # rows need not line up one-to-one with testing_pathnames - confirm this is intended.
    testing_images = array_all_images(testing_pathnames, parallelize=True)
    np.save(testing_images_file, testing_images)

# Here we specify the size of each batch
batch_size = 2**7

# Now we save the batch-data, unless it already exists
training_pathnames_batches = batch_list(training_pathnames, batch_size)
# Only the files directly inside training_folder matter; they are the third item
# of the first entry that os.walk yields, so the rest of the tree is not walked.
saved_filenames = next(os.walk(training_folder))[2]
num_saved_batches = sum("training_images_batch" in filename
                        for filename in saved_filenames)
# The label files are counted as well: a run that was interrupted between saving
# the images and the labels of a batch would otherwise never be repaired.
num_saved_label_batches = sum("training_labels_batch" in filename
                              for filename in saved_filenames)
expected_batches = len(training_pathnames_batches)

# If we have a different number of batches saved compared to what we want,
# the batches are wrong and need recomputing.
if num_saved_batches != expected_batches or num_saved_label_batches != expected_batches:
    # We could delete the old files, but this is dangerous, since a
    # typo could remove all files on the computer. We simply overwrite the files we have
    for ii, batch in enumerate(tqdm(training_pathnames_batches)):
        # NOTE(review): images and labels are loaded in two separate passes that each
        # drop unreadable images on their own; their rows only line up as long as
        # both passes reject exactly the same files.
        training_images_batch = array_all_images(batch, parallelize=True)
        np.save(training_folder + "/training_images_batch" + str(ii) + ".npy", training_images_batch)

        training_labels_batch = array_all_labels(batch, parallelize=True)
        np.save(training_folder + "/training_labels_batch" + str(ii) + ".npy", training_labels_batch)