***DATA PREPROCESSING***

Before loading the images and the labels (bounding boxes) into the model we need to ensure the data is in the right format. Also, we need to make sure we have enough images to train the model properly (data augmentation).





In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os

#http://imageio.readthedocs.io/en/latest/userapi.html#module-imageio.core.functions

In [2]:
#the images are saved in two folders so we need to make sure we load all images. Let's start with the images and not the countour
path_name = ['C:\\Users\\martar\\OneDrive\\Documents\\Python course\\ADS\\Final Project\\melanoma','C:\\Users\\martar\\OneDrive\\Documents\\Python course\\ADS\\Final Project\\notmelanoma']

#We want to have two arrays melanoma and notmelanoma that inclure all images, so the shape of this array is going to be = number of images, image_dimension x, image_dimension y, number of channels

list_files = os.listdir(path_name[1])

melanoma_images_list = [name for name in os.listdir(path_name[0]) if name.endswith('orig.jpg')]
melanoma_contour_list = [name for name in os.listdir(path_name[0]) if name.endswith('contour.png')]

notmelanoma_images_list = [name for name in os.listdir(path_name[1]) if name.endswith('orig.jpg')]
notmelanoma_contour_list = [name for name in os.listdir(path_name[1]) if name.endswith('contour.png')]



***STEP 0: SPLIT DATA INTO TRAIN AND TEST BEFORE WE DO ANYTHING ELSE TO AVOID LEAKAGE***

In [3]:

from sklearn.model_selection import train_test_split
melanoma_images_train, melanoma_images_test = train_test_split(melanoma_images_list,test_size=0.2, random_state=123)
notmelanoma_images_train, notmelanoma_images_test = train_test_split(notmelanoma_images_list,test_size=0.2, random_state=123)


***STEP 1: DATA AUGMENTATION AND RESHAPE FUNCTION***

Neural networks are normally trained with datasets of thousands of images. So...what we do to overcome that problem when we have limited number of images?? We can apply transformations to our images: rotation, flip, etc so we can generate more samples for the exiting one. There are many techniques to do this and keras has a module that already does this. However, because we need to apply the same transformation to the shape and the contour, we are going to implement our own pipeline

https://medium.com/nanonets/how-to-use-deep-learning-when-you-have-limited-data-part-2-data-augmentation-c26971dc8ced

https://www.researchgate.net/figure/Geometric-transformation-functions-for-image-registration-Note-that-affine_fig11_236125496

Great tutorial on data augmentation

https://github.com/aleju/imgaug



In [93]:
#Function to augment images

from skimage.transform import rotate, resize
from imgaug import augmenters as iaa
#This is a package built for image augmentation

def image_aug(input_image, image_size):
    
    origin_reshaped = resize(input_image,image_size)
    flip_image = np.fliplr(origin_reshaped)
    fliphor_image = np.flipud(origin_reshaped)
    rot45_image =  rotate(origin_reshaped, angle=45, mode='reflect')
    rot90_image =  rotate(origin_reshaped, angle=90, mode='reflect')
    crop = iaa.Crop(percent=(0, 0.3))
    crop_image = crop.augment_image(origin_reshaped)
    affine =  iaa.Affine(scale=1.5)
    affine_image = affine.augment_image(origin_reshaped)
    return origin_reshaped, flip_image, fliphor_image, rot45_image, rot90_image, crop_image, affine_image


In [94]:
#Testing random crop
from skimage.io import imread
seq = iaa.Affine(scale=1.5)
image_test = 'C:\\Users\\martar\\OneDrive\\Documents\\Python course\\ADS\\Final Project\\notmelanoma\\10_33_orig.jpg'

image = seq.augment_image(imread(image_test))

image_size_test = (192,192)
augmented_matrix = image_aug(imread(image_test), image_size_test)
augmented_matrix

  warn("The default mode, 'constant', will be changed to 'reflect' in "


(array([[[0.77107928, 0.7373466 , 0.69892961],
         [0.70521472, 0.65639936, 0.6080059 ],
         [0.72773033, 0.66498523, 0.60616171],
         ...,
         [0.74302364, 0.68812168, 0.64498443],
         [0.72143204, 0.65476537, 0.62004315],
         [0.76185555, 0.69518889, 0.66046666]],
 
        [[0.62476894, 0.58334738, 0.54192581],
         [0.59113626, 0.53562155, 0.48231081],
         [0.54899089, 0.48086512, 0.41739047],
         ...,
         [0.58021663, 0.51431143, 0.46729665],
         [0.56282744, 0.49726371, 0.4547392 ],
         [0.61593073, 0.55036701, 0.5078425 ]],
 
        [[0.64089946, 0.58695172, 0.53471563],
         [0.56116919, 0.49434487, 0.4318653 ],
         [0.57683441, 0.5035052 , 0.42904837],
         ...,
         [0.61426462, 0.53975482, 0.48432181],
         [0.5635506 , 0.5008055 , 0.44876302],
         [0.57006293, 0.50731784, 0.45527535]],
 
        ...,
 
        [[0.62393216, 0.53373609, 0.47152203],
         [0.52973856, 0.44738562, 0.36747

***STEP 2: AUGMENT AND SAVE IMAGES AND COUNTOURS IN THE FOLDER***

In [95]:
import imageio as io
from skimage.io import imread

def aug_and_save_image(input_image, path_name, image_size):
    input_image_name = input_image
    new_image_path = path_name+'\\preprocessed_data'+'\\'+input_image
    input_image = imread(path_name+'\\'+input_image)
    input_image_array = np.asarray(input_image)
    augmented_array = image_aug(input_image_array, image_size)
    file_name = new_image_path[:-4]
    file_format = input_image_name[-4:]
    augmentation_list = ['origin_reshaped', 'flip_image', 'fliphor_image', 'rot45_image', 'rot90_image', 'crop_image', 'affine_image']
    for index, augmentation in enumerate(augmentation_list):
        #save image
        if augmentation == 'origin_reshaped':
            io.imsave(file_name + file_format,augmented_array[index])
        else:
            io.imsave(file_name + augmentation+ file_format,augmented_array[index])
    

In [96]:
def get_bounding_box(input_image,path_name):
    input_image_name = input_image
    input_image = imread(path_name+'\\'+input_image)
    tmp = np.argwhere(input_image)
    top_left = np.amin(tmp, axis=0)
    bottom_right = np.amax(tmp, axis=0)
    output_matrix  = [top_left,bottom_right]
    file_name = path_name+'\\'+input_image_name[:-4]+'_bounding_box.txt'
    np.savetxt(file_name, output_matrix)

In [97]:
#Create a new folder for the augmented and reshaped images
os.makedirs(path_name[0]+'\\preprocessed_data')
os.makedirs(path_name[1]+'\\preprocessed_data')


In [98]:
#Saving melanoma test images 
image_size = (192,192)
for name in melanoma_images_test:
    io.imsave(path_name[0]+'\\preprocessed_data\\'+'test'+name,resize(imread(path_name[0]+'\\'+name),image_size))
    contour = name.replace("orig.jpg","contour.png")
    io.imsave(path_name[0]+'\\preprocessed_data\\'+'test'+contour,resize(imread(path_name[0]+'\\'+contour),image_size))
    
#Saving notmelanoma test images
for name in notmelanoma_images_test:
    io.imsave(path_name[1]+'\\preprocessed_data\\'+'test'+name,resize(imread(path_name[1]+'\\'+name),image_size))
    contour = name.replace("orig.jpg","contour.png")
    io.imsave(path_name[1]+'\\preprocessed_data\\'+'test'+contour,resize(imread(path_name[1]+'\\'+contour),image_size))
    
    

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  dtype_str, out_type.__name__))


In [99]:
melanoma_images_test[1]

'75_SSM30_2_orig.jpg'

In [100]:
image_size = (192,192)
#augmenting and saving traininig melanoma images
for name in melanoma_images_train:
    aug_and_save_image(name, path_name[0], image_size)
    contour = name.replace("orig.jpg","contour.png")
    aug_and_save_image(contour, path_name[0], image_size)
#augmenting and saving traininig notmelanoma images
for name in notmelanoma_images_train:
    aug_and_save_image(name, path_name[1], image_size)
    contour = name.replace("orig.jpg","contour.png")
    aug_and_save_image(contour, path_name[1], image_size)

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  dtype_str, out_type.__name__))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.format(dtype_str, out_type.__name__, mi, ma))
  'range [{2}, {3}]'.for

In [101]:
#Get the bounding boxes for all images:
new_path_name = [path +'\\preprocessed_data' for path in path_name]

for path in new_path_name:
    new_list_files = os.listdir(path)
    list_augmented = [name for name in new_list_files if 'contour' in name]
    for name2 in list_augmented:
        get_bounding_box(name2,path)
    


***STEP 3: GET THE DATA IN THE RIGHT FORMAT ***

[filename,width,height, class, xmin, ymin, xmax,ymax]

raccoon-17.jpg	259	194	raccoon	95	60	167	118


In [103]:
entry_df = pd.DataFrame()

for index_path, path in enumerate(new_path_name):
   
    
    #filename = name of the original image
    file_name_list = [name for name in os.listdir(path) if 'orig' in name]
    
    #As we reshaped all the images to be 200*200
    image_width = np.zeros(shape = len(file_name_list))
    image_width.fill(192)
    image_height = np.zeros(shape = len(file_name_list))
    image_height.fill(192)

    #for now, we are only looking at melanoma 
    image_class = [np.empty(shape = len(file_name_list))]
    
    if index_path == 0:
        image_class = "melanoma"
    else:
        image_class = "notmelanoma"


    #xmin, ymin = top_left from the bounding box function
    #xmax, ymax = bottom_right from the bounding box funtion

    x_min_list = np.zeros(shape = len(file_name_list))
    y_min_list = np.zeros(shape = len(file_name_list))
    x_max_list = np.zeros(shape = len(file_name_list))
    y_max_list = np.zeros(shape = len(file_name_list))

    for index,file_name in enumerate(file_name_list):
        file_name_search = file_name.replace('orig','contour')[:-4]+'_bounding_box.txt'
        found_file_name = [name for name in os.listdir(path) if file_name_search in name]
        box_array = np.loadtxt(path+"\\"+ found_file_name[0])
    
        x_min_list[index] = box_array[0][0]
        y_min_list[index] = box_array[0][1]
        x_max_list[index] = box_array[1][0]
        y_max_list[index] = box_array[1][1]
        
    entry = {"filename":file_name_list, 
             "width": image_width, 
             "height": image_height, 
             "class": image_class, 
             "xmin": x_min_list, 
             "ymin": y_min_list, 
             "xmax": x_max_list, 
             "ymax": y_max_list}
    
    entry_aux_df= pd.DataFrame.from_dict(entry)
    entry_aux_df["path_name"] = path
    entry_aux_df.set_index("filename",inplace=True)
    
    entry_df = pd.concat([entry_df, entry_aux_df])
    



In [104]:
entry_df["xmin_norm"] = entry_df["xmin"]/entry_df["width"]
entry_df["ymin_norm"] = entry_df["ymin"]/entry_df["height"]
entry_df["box_width"] = (entry_df["xmax"] - entry_df["xmin"])/entry_df["width"]
entry_df["box_height"] = (entry_df["ymax"] - entry_df["ymin"])/entry_df["height"]


In [105]:
entry_df.head()

Unnamed: 0_level_0,class,height,width,xmax,xmin,ymax,ymin,path_name,xmin_norm,ymin_norm,box_width,box_height
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100_SSM65_orig.jpg,melanoma,192.0,192.0,132.0,66.0,108.0,69.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.34375,0.359375,0.34375,0.203125
100_SSM65_origaffine_image.jpg,melanoma,192.0,192.0,151.0,50.0,115.0,55.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.260417,0.286458,0.526042,0.3125
100_SSM65_origcrop_image.jpg,melanoma,192.0,192.0,133.0,42.0,142.0,79.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.21875,0.411458,0.473958,0.328125
100_SSM65_origfliphor_image.jpg,melanoma,192.0,192.0,125.0,59.0,108.0,69.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.307292,0.359375,0.34375,0.203125
100_SSM65_origflip_image.jpg,melanoma,192.0,192.0,132.0,66.0,122.0,83.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.34375,0.432292,0.34375,0.203125


***STEP 4: Shuffle and then save the train and test files***
We need to shuffle to prevent the network from learning 

In [106]:
entry_df.reindex(np.random.permutation(entry_df.index))


Unnamed: 0_level_0,class,height,width,xmax,xmin,ymax,ymin,path_name,xmin_norm,ymin_norm,box_width,box_height
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
44_D24_origrot90_image.jpg,notmelanoma,192.0,192.0,111.0,90.0,120.0,73.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.468750,0.380208,0.109375,0.244792
19_48_origcrop_image.jpg,notmelanoma,192.0,192.0,191.0,0.0,191.0,40.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.000000,0.208333,0.994792,0.786458
11_SSM10_origflip_image.jpg,melanoma,192.0,192.0,149.0,27.0,149.0,32.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.140625,0.166667,0.635417,0.609375
5_25_orig.jpg,notmelanoma,192.0,192.0,152.0,39.0,111.0,58.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.203125,0.302083,0.588542,0.276042
18_46_origaffine_image.jpg,notmelanoma,192.0,192.0,162.0,61.0,136.0,55.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.317708,0.286458,0.526042,0.421875
80_D57_2_orig.jpg,notmelanoma,192.0,192.0,140.0,56.0,115.0,88.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.291667,0.458333,0.437500,0.140625
85_SSM40_origflip_image.jpg,melanoma,192.0,192.0,137.0,77.0,115.0,71.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.401042,0.369792,0.312500,0.229167
118_SSM9_origrot90_image.jpg,melanoma,192.0,192.0,111.0,83.0,122.0,74.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.432292,0.385417,0.145833,0.250000
40_SSM5_origrot45_image.jpg,melanoma,192.0,192.0,137.0,0.0,191.0,0.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.000000,0.000000,0.713542,0.994792
52_NM61_origflip_image.jpg,melanoma,192.0,192.0,160.0,59.0,109.0,79.0,C:\Users\martar\OneDrive\Documents\Python cour...,0.307292,0.411458,0.526042,0.156250


In [107]:
test_list = [filename for filename in entry_df.index.values if 'test' in filename]
train_list = [filename for filename in entry_df.index.values if 'test' not in filename]

In [108]:
X_train = entry_df.loc[train_list]
X_test = entry_df.loc[test_list]


In [109]:
X_train.shape, X_test.shape

((1148, 12), (42, 12))

In [110]:
#now we have to save those files in a folder call training and ensure we are loading the files into that directory
X_train.to_csv('C:\\Users\\martar\\OneDrive\\Documents\\Python course\\ADS\\Final Project\\preprocessed_data\\train_labels.csv')
X_test.to_csv('C:\\Users\\martar\\OneDrive\\Documents\\Python course\\ADS\\Final Project\\preprocessed_data\\test_labels.csv')