In [1]:
import cv2
import numpy as np
import os

In [2]:
base_dir = "../../assets/datasets/odir5k"

# Walk the dataset tree and print the path of every directory found below
# the root (files are ignored).
for parent, child_dirs, _ in os.walk(base_dir):
    for child in child_dirs:
        print(os.path.join(parent, child))

../../assets/datasets/odir5k\ODIR-5K
../../assets/datasets/odir5k\preprocessed_images
../../assets/datasets/odir5k\ODIR-5K\ODIR-5K
../../assets/datasets/odir5k\ODIR-5K\ODIR-5K\Testing Images
../../assets/datasets/odir5k\ODIR-5K\ODIR-5K\Training Images


In [3]:
# Locations of the raw images and of the CSV files shipped with the dataset.
train_path = "../../assets/datasets/odir5k/ODIR-5K/ODIR-5K/Training Images"
test_path = "../../assets/datasets/odir5k/ODIR-5K/ODIR-5K/Testing Images"
disc_path = "../../assets/datasets/odir5k/DiscardedImages.csv"
annot_path = "../../assets/datasets/odir5k/full_df.csv"

# Alphabetically sorted directory listings of both image folders.
train_files, test_files = (
    sorted(os.listdir(images_dir)) for images_dir in (train_path, test_path)
)

print('train: ', len(train_files))
print('test:', len(test_files))

train:  7000
test: 1000


In [4]:
# helper function

import shutil

def clear_content(directory):
    """Delete everything inside ``directory`` while keeping the directory itself.

    Regular files and symbolic links are unlinked; sub-directories are removed
    recursively.

    Args:
        directory: path of the folder to empty.

    Raises:
        RuntimeError: if an entry cannot be removed. The underlying error is
            chained as ``__cause__`` and the message names the offending path.
    """
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        try:
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except Exception as exc:
            # Raising a bare string is itself a TypeError in Python 3, which
            # would hide the real failure; raise a proper exception instead.
            raise RuntimeError("Failed to remove: %s %s" % (entry_path, exc)) from exc

In [5]:
# helper classes
import PIL
from PIL import Image
import tensorflow as tf
from skimage import exposure

class ImageCrop:
    '''Crop the all-black border away from a single image file.

    Args:
        source_folder: folder the image is read from.
        destination_folder: folder the cropped image is written to.
        file_name: name of the image file (used in both folders).
    '''
    def __init__(self, source_folder, destination_folder, file_name):
        self.source_folder = source_folder
        self.destination_folder = destination_folder
        self.file_name = file_name

    def remove_black_pixels(self):
        """Write a copy of the image cropped to the bounding box of its non-black pixels.

        A pixel counts as non-black when any of its channels is greater than 0.

        Raises:
            FileNotFoundError: if the source image cannot be read.
            ValueError: if the image has no non-black pixel (nothing to crop to).
        """
        file = os.path.join(self.source_folder, self.file_name)
        image = cv2.imread(file)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError("Could not read image: %s" % file)

        # mask of colored pixels
        mask = image > 0

        # coordinates (row, column, channel) of colored pixels
        coordinates = np.argwhere(mask)
        if coordinates.size == 0:
            raise ValueError("Image contains only black pixels: %s" % file)

        # bounding box of non-black pixels; only rows and columns matter,
        # the channel index is ignored
        row_start, col_start = coordinates.min(axis=0)[:2]
        row_stop, col_stop = coordinates.max(axis=0)[:2] + 1

        # crop contents of bounding box
        cropped = image[row_start:row_stop, col_start:col_stop]

        file_cropped = os.path.join(self.destination_folder, self.file_name)
        cv2.imwrite(file_cropped, cropped)
    
    
class ImageResizer:
    '''Resize a single image and mirror it when its file name contains "right".

    Args:
        image_width: target width in pixels (also the target height when
            ``keep_aspect_ratio`` is false).
        quality: ``quality`` value forwarded to ``Image.save``.
        source_folder: folder the image is read from.
        destination_folder: folder the treated image is written to.
        file_name: name of the image file (used in both folders).
        keep_aspect_ratio: when true the height is scaled by the same factor
            as the width; otherwise the output is ``image_width`` square.
    '''
    def __init__(self, image_width, quality, source_folder,
                 destination_folder, file_name, keep_aspect_ratio):
        self.image_width = image_width
        self.quality = quality
        self.source_folder = source_folder
        self.destination_folder = destination_folder
        self.file_name = file_name
        self.keep_aspect_ratio = keep_aspect_ratio

    def run(self):
        """Runs the resize using the constructor arguments.
        Args:
            No arguments required.
        Returns:
            None. Saves the treated image into the destination folder.
        """

        # We load the original file, we resize it to a smaller width and corresponding height and
        # also mirror the image when we find a right eye image so they are all left eyes.

        file = os.path.join(self.source_folder, self.file_name)
        target = os.path.join(self.destination_folder, self.file_name)

        # The context manager closes the source file once the resized copy exists.
        with Image.open(file) as img:
            if self.keep_aspect_ratio:
                # it will have the exact same width-to-height ratio as the original
                width_perc = self.image_width / float(img.size[0])
                height_size = int(float(img.size[1]) * width_perc)
                new_size = (self.image_width, height_size)
            else:
                # This will force the image to be square
                new_size = (self.image_width, self.image_width)
            # LANCZOS is the filter formerly exposed as ANTIALIAS (that alias
            # was removed in Pillow 10).
            resized = img.resize(new_size, Image.LANCZOS)

        if "right" in self.file_name:
            resized = resized.transpose(Image.FLIP_LEFT_RIGHT)

        # Use the same save options for every image; previously only mirrored
        # images honoured ``quality``.
        resized.save(target, optimize=True, quality=self.quality)
            


**Image Cropping - Training**

In [7]:
# image cropping job - training
def crop_all_images(source_folder,destination_folder):
    """Crop every file of ``source_folder`` that is not yet in ``destination_folder``.

    Args:
        source_folder: folder holding the images to crop; sub-directories are ignored.
        destination_folder: existing folder receiving the cropped images. A file
            whose name is already present there is skipped.
    """
    # List the destination once instead of once per source file.
    already_cropped = set(os.listdir(destination_folder))
    for file in os.listdir(source_folder):
        if file in already_cropped:
            continue
        if os.path.isfile(os.path.join(source_folder, file)):
            ImageCrop(source_folder, destination_folder, file).remove_black_pixels()

source_folder = train_path
destination_folder = "ODIR-5K_Training_Dataset_cropped"

# Create the output folder when it is missing; when it exists but holds fewer
# than 500 entries, empty it before cropping. Otherwise keep what is there.
if not os.path.exists(destination_folder):
    os.mkdir(destination_folder)
elif len(os.listdir(destination_folder)) < 500:
    clear_content(destination_folder)

crop_all_images(source_folder, destination_folder)

**Image Cropping - Testing**

In [29]:
# image cropping job - testing
def crop_all_images(source_folder,destination_folder):
    """Crop every file of ``source_folder`` that is not yet in ``destination_folder``.

    Args:
        source_folder: folder holding the images to crop; sub-directories are ignored.
        destination_folder: existing folder receiving the cropped images. A file
            whose name is already present there is skipped.
    """
    # List the destination once instead of once per source file.
    already_cropped = set(os.listdir(destination_folder))
    for file in os.listdir(source_folder):
        if file in already_cropped:
            continue
        if os.path.isfile(os.path.join(source_folder, file)):
            ImageCrop(source_folder, destination_folder, file).remove_black_pixels()

source_folder = test_path
destination_folder = "ODIR-5K_Testing_Dataset_cropped"

# Create the output folder when it is missing; when it exists but holds fewer
# than 500 entries, empty it before cropping. Otherwise keep what is there.
if not os.path.exists(destination_folder):
    os.mkdir(destination_folder)
elif len(os.listdir(destination_folder)) < 500:
    clear_content(destination_folder)

crop_all_images(source_folder, destination_folder)

**Image Treatment Training**

In [9]:
# image treatment training
def resize_all_images(source_folder, destination_folder, image_width, quality, keep_aspect_ratio):
    """Resize every file of ``source_folder`` that is not yet in ``destination_folder``.

    Args:
        source_folder: folder holding the images to resize; sub-directories are ignored.
        destination_folder: existing folder receiving the resized images. A file
            whose name is already present there is skipped.
        image_width: forwarded to ``ImageResizer``.
        quality: forwarded to ``ImageResizer``.
        keep_aspect_ratio: forwarded to ``ImageResizer``.
    """
    # List the destination once instead of once per source file.
    already_resized = set(os.listdir(destination_folder))
    for file in os.listdir(source_folder):
        if file in already_resized:
            continue
        if os.path.isfile(os.path.join(source_folder, file)):
            ImageResizer(image_width, quality, source_folder, destination_folder, file, keep_aspect_ratio).run()
    
# Settings for the resize pass over the cropped training images.
image_width = 224
keep_aspect_ratio = False
quality = 100
source_folder = "ODIR-5K_Training_Dataset_cropped"
destination_folder = "ODIR-5K_Training_Dataset_treated" + '_' + str(image_width)

# Create the output folder when it is missing; when it exists but holds fewer
# than 500 entries, empty it before resizing. Otherwise keep what is there.
if not os.path.exists(destination_folder):
    os.mkdir(destination_folder)
elif len(os.listdir(destination_folder)) < 500:
    clear_content(destination_folder)

resize_all_images(source_folder, destination_folder, image_width, quality, keep_aspect_ratio)

**Image Treatment Testing**

In [31]:
# image treatment testing
def resize_all_images(source_folder, destination_folder, image_width, quality, keep_aspect_ratio):
    """Resize every file of ``source_folder`` that is not yet in ``destination_folder``.

    Args:
        source_folder: folder holding the images to resize; sub-directories are ignored.
        destination_folder: existing folder receiving the resized images. A file
            whose name is already present there is skipped.
        image_width: forwarded to ``ImageResizer``.
        quality: forwarded to ``ImageResizer``.
        keep_aspect_ratio: forwarded to ``ImageResizer``.
    """
    # List the destination once instead of once per source file.
    already_resized = set(os.listdir(destination_folder))
    for file in os.listdir(source_folder):
        if file in already_resized:
            continue
        if os.path.isfile(os.path.join(source_folder, file)):
            ImageResizer(image_width, quality, source_folder, destination_folder, file, keep_aspect_ratio).run()
    
# Settings for the resize pass over the cropped testing images.
image_width = 224
keep_aspect_ratio = False
quality = 100
source_folder = "ODIR-5K_Testing_Dataset_cropped"
destination_folder = "ODIR-5K_Testing_Dataset_treated" + '_' + str(image_width)

# Create the output folder when it is missing; when it exists but holds fewer
# than 500 entries, empty it before resizing. Otherwise keep what is there.
if not os.path.exists(destination_folder):
    os.mkdir(destination_folder)
elif len(os.listdir(destination_folder)) < 500:
    clear_content(destination_folder)

resize_all_images(source_folder, destination_folder, image_width, quality, keep_aspect_ratio)

**Data Augmentation**

In [6]:
import tensorflow as tf
import numpy as np
from skimage import exposure

class ImageTreatment:
    """Single-image augmentation helpers built on ``tf.image`` and ``skimage.exposure``.

    The ``tf.image`` based methods convert their result to a ``uint8`` NumPy
    array; the ``skimage.exposure`` based methods return whatever the library
    returns.

    Args:
        image_size: side length in pixels used by ``scaling`` (reshape and
            output size) and ``crop_to_bounding_box`` (output size).
    """
    def __init__(self, image_size):
        self.image_size = image_size

    def scaling(self, image, scale_vector):
        """Return one centred zoom of ``image`` per entry of ``scale_vector``.

        Each scale selects a centred box covering that fraction of the width
        and height; every box is resized back to ``image_size`` square. The
        result is indexed by scale along its first axis.
        """
        # Reshape to a 4-D batch holding the single input image
        image = np.reshape(image, (1, self.image_size, self.image_size, 3))
        boxes = np.zeros((len(scale_vector), 4), dtype=np.float32)
        for index, scale in enumerate(scale_vector):
            x1 = y1 = 0.5 - 0.5 * scale
            x2 = y2 = 0.5 + 0.5 * scale
            boxes[index] = np.array([y1, x1, y2, x2], dtype=np.float32)
        # every box refers to image 0 of the batch
        box_ind = np.zeros((len(scale_vector)), dtype=np.int32)
        crop_size = np.array([self.image_size, self.image_size], dtype=np.int32)

        output = tf.image.crop_and_resize(image, boxes, box_ind, crop_size)
        output = np.array(output, dtype=np.uint8)
        return output

    def brightness(self, image, delta):
        """Apply ``tf.image.adjust_brightness`` with ``delta``."""
        output = tf.image.adjust_brightness(image, delta)
        output = np.array(output, dtype=np.uint8)
        return output

    def contrast(self, image, contrast_factor):
        """Apply ``tf.image.adjust_contrast`` with ``contrast_factor``."""
        output = tf.image.adjust_contrast(image, contrast_factor)
        output = np.array(output, dtype=np.uint8)
        return output

    def saturation(self, image, saturation_factor):
        """Apply ``tf.image.adjust_saturation`` with ``saturation_factor``."""
        output = tf.image.adjust_saturation(image, saturation_factor)
        output = np.array(output, dtype=np.uint8)
        return output

    def hue(self, image, delta):
        """Apply ``tf.image.adjust_hue`` with ``delta``."""
        output = tf.image.adjust_hue(image, delta)
        output = np.array(output, dtype=np.uint8)
        return output

    def central_crop(self, image, central_fraction):
        """Apply ``tf.image.central_crop``; the result is not resized."""
        output = tf.image.central_crop(image, central_fraction)
        output = np.array(output, dtype=np.uint8)
        return output

    def crop_to_bounding_box(self, image, offset_height, offset_width, target_height, target_width):
        """Crop the given box out of ``image`` and resize it to ``image_size`` square."""
        output = tf.image.crop_to_bounding_box(image, offset_height, offset_width, target_height, target_width)
        output = tf.image.resize(output, (self.image_size, self.image_size))
        output = np.array(output, dtype=np.uint8)
        return output

    def gamma(self, image, gamma):
        """Apply ``tf.image.adjust_gamma`` with ``gamma``."""
        output = tf.image.adjust_gamma(image, gamma)
        output = np.array(output, dtype=np.uint8)
        return output

    def rot90(self, image, k):
        """Rotate ``image`` by ``k`` quarter turns via ``tf.image.rot90``."""
        output = tf.image.rot90(image, k)
        output = np.array(output, dtype=np.uint8)
        return output

    def rescale_intensity(self, image):
        """Stretch intensities so the 2nd..98th percentile range fills the output range."""
        p2, p98 = np.percentile(image, (2, 98))
        img_rescale = exposure.rescale_intensity(image, in_range=(p2, p98))
        return img_rescale

    def equalize_histogram(self, image):
        """Apply ``skimage.exposure.equalize_hist``."""
        img_eq = exposure.equalize_hist(image)
        return img_eq

    def equalize_adapthist(self, image):
        """Apply ``skimage.exposure.equalize_adapthist`` with a clip limit of 0.03."""
        # scikit-image names this keyword ``clip_limit``.
        img_adapted = exposure.equalize_adapthist(image, clip_limit=0.03)
        return img_adapted

In [20]:
from tqdm import tqdm

class DataAugmentationStrategy:
    """Augmentation Strategy - Manual Aug

    Generates a fixed, ordered series of augmented copies of one treated
    training image and appends one ground-truth CSV row per copy.

    Args:
        image_size: side length in pixels of the treated images; selects the
            input/output folders and sizes the quadrant crops.
        file_name: name of the image file inside the treated folder.
    """
    def __init__(self, image_size, file_name):
        self.base_image = file_name
        self.treatment = ImageTreatment(image_size)
        # "5K" (upper case) matches the folder name produced by the resize
        # step; the lower-case spelling only resolved on case-insensitive
        # file systems.
        self.file_path = "ODIR-5K_Training_Dataset_treated_" + str(image_size)
        self.saving_path = "ODIR-5k_Training_Dataset_augmented_" + str(image_size)
        self.file_id = file_name.replace('.jpg', '')

    def save_image(self, original_vector, image, sample):
        """Write one augmented image and append its row to the augmented CSV.

        Args:
            original_vector: ground-truth row of the source image; items 1..8
                are copied into the CSV row.
            image: RGB image array to save.
            sample: number appended to the file id to build the output name.

        Raises:
            IOError: if the image cannot be written.
        """
        # Imported here so the method does not depend on a later notebook cell.
        import csv

        # Swap the channel order back for cv2.imwrite.
        central = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        file = self.file_id + '_' + str(sample) + '.jpg'

        # cv2.imwrite does not create missing folders.
        os.makedirs(self.saving_path, exist_ok=True)
        file_name = os.path.join(self.saving_path, file)
        if os.path.isfile(file_name):
            print('duplicate file found: ' + file_name)

        # Do not record a ground-truth row for an image that was not written.
        if not cv2.imwrite(file_name, central):
            raise IOError('Failed to write image: ' + file_name)

        os.makedirs('ground_truth', exist_ok=True)
        csv_file_name = os.path.join('ground_truth', 'odir_augmented.csv')
        with open(csv_file_name, 'a', newline='') as csv_file:
            file_writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            file_writer.writerow([file] + [original_vector[index] for index in range(1, 9)])

    def generate_images(self, number_samples, original_vector, weights):
        """Generate up to 13 augmented copies of the base image.

        Args:
            number_samples: how many copies to produce, taken in a fixed order
                (four quadrant crops, four centred zooms, intensity rescale,
                contrast, saturation, gamma, hue). The special value 14
                produces a single 90-degree rotation instead.
            original_vector: ground-truth row forwarded to ``save_image``.
            weights: repetition marker for this image. 20 rotates the base
                image by two quarter turns first, 400 by three; the value is
                also the offset added to each sample number in the file name.

        Returns:
            Number of images saved.

        Raises:
            FileNotFoundError: if the base image cannot be read.
            ValueError: if ``weights`` is greater than 401.
        """
        eye_image = os.path.join(self.file_path, self.base_image)
        image = cv2.imread(eye_image)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError('Could not read image: ' + eye_image)
        original_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # For any repeating elements, just give the other output
        # We are only expecting up to 3 repetitions
        if weights == 20:
            original_image = self.treatment.rot90(original_image, 2)
        if weights == 400:
            original_image = self.treatment.rot90(original_image, 3)
        if weights > 401:
            print(str(self.file_id) + ' samples:' + str(number_samples))
            raise ValueError('this cannot happen')

        # for the sample type 14, just generate 1 image and leave the method
        if number_samples == 14:
            central = self.treatment.rot90(original_image, 1)
            self.save_image(original_vector, central, weights + 14)
            return 1

        treatment = self.treatment
        # Quadrant size: half the image side (112 for 224-pixel images).
        half = treatment.image_size // 2

        # Position in this list is the sample number; recipe i runs when
        # number_samples > i.
        recipes = [
            lambda img: treatment.crop_to_bounding_box(img, 0, 0, half, half),
            lambda img: treatment.crop_to_bounding_box(img, half, 0, half, half),
            lambda img: treatment.crop_to_bounding_box(img, 0, half, half, half),
            lambda img: treatment.crop_to_bounding_box(img, half, half, half, half),
            lambda img: treatment.scaling(img, [0.50])[0],
            lambda img: treatment.scaling(img, [0.70])[0],
            lambda img: treatment.scaling(img, [0.80])[0],
            lambda img: treatment.scaling(img, [0.90])[0],
            lambda img: treatment.rescale_intensity(img),
            lambda img: treatment.contrast(img, 2),
            lambda img: treatment.saturation(img, 0.5),
            lambda img: treatment.gamma(img, 0.5),
            lambda img: treatment.hue(img, 0.2),
        ]

        saved = 0
        for sample, recipe in enumerate(recipes):
            if number_samples > sample:
                self.save_image(original_vector, recipe(original_image), weights + sample)
                saved = saved + 1

        return saved

class GroundTruthFiles:
    """Groups the rows of a ground-truth CSV by the disease columns set to '1'.

    Each attribute is a list of rows of the form
    ``[id, normal, diabetes, glaucoma, cataract, amd, hypertension, myopia, others]``.
    A row with several columns set to '1' appears in several lists. There is
    no list for the ``normal`` column.
    """
    def __init__(self):
        self.amd = []
        self.cataract = []
        self.diabetes = []
        self.glaucoma = []
        self.hypertension = []
        self.myopia = []
        self.others = []

    def populate_vectors(self, ground_truth_file):
        """Read ``ground_truth_file`` and append its rows to the per-disease lists.

        Blank lines and the header row (first column equal to ``"ID"``) are
        skipped. Only the first nine columns of each row are used.

        Args:
            ground_truth_file: path of the CSV file to read.

        Raises:
            IndexError: if a non-blank row has fewer than nine columns.
        """
        # Imported here so the method does not depend on a later notebook cell.
        import csv

        # CSV column index -> list collecting the rows flagged in that column.
        buckets = (
            (2, self.diabetes),
            (3, self.glaucoma),
            (4, self.cataract),
            (5, self.amd),
            (6, self.hypertension),
            (7, self.myopia),
            (8, self.others),
        )

        with open(ground_truth_file, newline='') as csvDataFile:
            for row in csv.reader(csvDataFile):
                # skip blank lines and discard the header row
                if not row or row[0] == "ID":
                    continue
                record = [row[index] for index in range(9)]
                for column, bucket in buckets:
                    if record[column] == '1':
                        # each list gets its own copy of the row
                        bucket.append(list(record))


In [22]:
# run augmentation strategy
from absl import app
import csv

def write_header():
    """Create (or truncate) the augmented ground-truth CSV and write its header row.

    The file is ``odir_augmented.csv`` inside the ``ground_truth`` folder,
    which is created when missing.

    Returns:
        The ``csv.writer`` used for the header. The underlying file is already
        closed when this function returns, so the writer cannot be used to
        add rows; it is returned only for backward compatibility.
    """
    os.makedirs('ground_truth', exist_ok=True)
    # os.path.join instead of a hard-coded backslash so the path also works
    # outside Windows.
    header_path = os.path.join('ground_truth', 'odir_augmented.csv')
    with open(header_path, 'w', newline='') as csv_file:
        file_writer = csv.writer(csv_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        file_writer.writerow(['ID', 'Normal', 'Diabetes', 'Glaucoma', 'Cataract', 'AMD', 'Hypertension',
                              'Myopia', 'Others'])
        return file_writer


def process_files(images, cache, files):
    """Run every augmentation strategy of ``images`` over the leading rows of ``files``.

    Args:
        images: list of strategies; item 0 of a strategy is how many rows of
            ``files`` to process, item 1 the sample count handed to
            ``generate_images``.
        cache: dict shared between calls, mapping file name to a weight. During
            the first strategy a new file gets weight 1 and a known file has
            its weight multiplied by 20.
        files: ground-truth rows; item 0 of a row is the image file name.

    Returns:
        Total number of images reported by ``generate_images``.
    """
    generated = 0
    for strategy_index, plan in enumerate(images):
        image_count, samples_per_image = plan[0], plan[1]
        for position in range(image_count):
            image_vector = files[position]
            file_name = image_vector[0]

            # The weight is only updated during the first strategy
            if strategy_index == 0:
                cache[file_name] = cache[file_name] * 20 if file_name in cache else 1

            # image_size is the module-level setting of the notebook
            augmenter = DataAugmentationStrategy(image_size, file_name)
            generated = generated + augmenter.generate_images(
                samples_per_image, image_vector, cache[file_name])
    return generated


def main(csv_path):
    """Run the manual augmentation over every disease class of a ground-truth CSV.

    Prints the number of rows per class, resets the augmented CSV, runs the
    per-class strategies through ``process_files`` and prints how many images
    each class produced.

    Args:
        csv_path: path of the ground-truth CSV to load.
    """
    # load the ground truth file
    files = GroundTruthFiles()
    files.populate_vectors(csv_path)

    # (class name, rows of that class, strategies); a strategy is
    # [number of rows to process, samples per image]. The fixed row counts
    # are taken as-is from the original notebook.
    plan = [
        ('hypertension', files.hypertension, [[len(files.hypertension), 13], [128, 14]]),
        ('myopia', files.myopia, [[len(files.myopia), 9], [196, 14]]),
        ('cataract', files.cataract, [[len(files.cataract), 9], [66, 14]]),
        ('amd', files.amd, [[len(files.amd), 9], [16, 14]]),
        ('glaucoma', files.glaucoma, [[len(files.glaucoma), 7], [312, 14]]),
        ('others', files.others, [[len(files.others), 1], [568, 14]]),
        ('diabetes', files.diabetes, [[1038, 1]]),
    ]

    print('files record count order by size ASC')
    for name, records, _ in plan:
        print(name + ' ' + str(len(records)))

    # Delete previous file. os.path.join instead of a hard-coded backslash so
    # this targets the file inside the ground_truth folder on every platform.
    augmented_csv = os.path.join('ground_truth', 'odir_augmented.csv')
    if os.path.isfile(augmented_csv):
        os.remove(augmented_csv)

    write_header()

    # Shared across classes so an image that belongs to several classes is
    # recognised as a repetition by process_files.
    images_processed = {}

    totals = [(name, process_files(strategies, images_processed, records))
              for name, records, strategies in plan]

    for name, total in totals:
        print("total generated " + name + ": " + str(total))

# Module-level setting read by process_files when it builds each
# DataAugmentationStrategy.
image_size = 224
# NOTE(review): save_path is not referenced by any other visible cell - confirm
# whether it is still needed.
save_path = "ODIR-5k_Training_Dataset_augmented_" + str(image_size)

# Ground-truth CSV handed to main().
csv_path = 'ground_truth/odir.csv'
main(csv_path)

files record count order by size ASC
hypertension 192
myopia 262
cataract 275
amd 280
glaucoma 313
others 1124
diabetes 1778
total generated hypertension: 2624
total generated myopia: 2554
total generated cataract: 2541
total generated amd: 2536
total generated glaucoma: 2503
total generated others: 1692
total generated diabetes: 1038


In [41]:
# odir_patients_to_numpy.py

from absl import app
import logging
import logging.config
import time
import csv
import cv2
import os
import numpy as np
import glob


class NumpyDataGenerator:
    def __init__(self, training_path, testing_path, csv_path, csv_testing_path, augmented_path, csv_augmented_file):
        self.training_path = training_path
        self.testing_path = testing_path
        self.csv_path = csv_path
        self.csv_testing_path = csv_testing_path
#         self.logger = logging.getLogger('odir')
        self.total_records_training = 0
        self.total_records_testing = 0
        self.csv_augmented_path = csv_augmented_file
        self.augmented_path = augmented_path

    def npy_training_files(self, file_name_training, file_name_training_labels):
        training = []
        training_labels = []

#         self.logger.debug("Opening CSV file")
        with open(self.csv_path) as csvDataFile:
            csv_reader = csv.reader(csvDataFile)
            self.total_records_training = 0
            for row in csv_reader:
                column_id = row[0]
                normal = row[1]
                diabetes = row[2]
                glaucoma = row[3]
                cataract = row[4]
                amd = row[5]
                hypertension = row[6]
                myopia = row[7]
                others = row[8]
                # just discard the first row
                if column_id != "ID":
#                     self.logger.debug("Processing image: " + column_id)
                    # load first the image from the folder
                    eye_image = os.path.join(self.training_path, column_id)
                    image = cv2.imread(eye_image)
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    training.append(image)
                    training_labels.append([normal, diabetes, glaucoma, cataract, amd, hypertension, myopia, others])
                    self.total_records_training = self.total_records_training + 1

        training = np.array(training, dtype='uint8')
        training_labels = np.array(training_labels, dtype='uint8')
        # convert (number of images x height x width x number of channels) to (number of images x (height * width *3))
        # for example (6069 * 28 * 28 * 3)-> (6069 x 2352) (14,274,288)
        training = np.reshape(training, [training.shape[0], training.shape[1], training.shape[2], training.shape[3]])

        # save numpy array as .npy formats
        np.save(file_name_training, training)
#         self.logger.debug("Saving NPY File: " + file_name_training)
        np.save(file_name_training_labels, training_labels)
#         self.logger.debug("Saving NPY File: " + file_name_training_labels)
#         self.logger.debug("Closing CSV file")

    def npy_testing_files(self, file_name_testing, file_name_testing_labels):
        testing = []
        testing_labels = []

#         self.logger.debug("Opening CSV file")
        with open(self.csv_testing_path) as csvDataFile:
            csv_reader = csv.reader(csvDataFile)
            self.total_records_testing = 0
            for row in csv_reader:
                column_id = row[0]
                normal = row[1]
                diabetes = row[2]
                glaucoma = row[3]
                cataract = row[4]
                amd = row[5]
                hypertension = row[6]
                myopia = row[7]
                others = row[8]
                # just discard the first row
                if column_id != "ID":
#                     self.logger.debug("Processing image: " + column_id + "_left.jpg")
                    # load first the image from the folder
                    eye_image = os.path.join(self.testing_path, column_id + "_left.jpg")
                    image = cv2.imread(eye_image)
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    testing.append(image)
                    testing_labels.append([normal, diabetes, glaucoma, cataract, amd, hypertension, myopia, others])
                    self.total_records_testing = self.total_records_testing + 1

#                     self.logger.debug("Processing image: " + column_id + "_right.jpg")
                    eye_image = os.path.join(self.testing_path, column_id + "_right.jpg")
                    image = cv2.imread(eye_image)
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    testing.append(image)
                    testing_labels.append([normal, diabetes, glaucoma, cataract, amd, hypertension, myopia, others])
                    self.total_records_testing = self.total_records_testing + 1

        testing = np.array(testing, dtype='uint8')
        training_labels = np.array(testing_labels, dtype='uint8')
        # convert (number of images x height x width x number of channels) to (number of images x (height * width *3))
        # for example (6069 * 28 * 28 * 3)-> (6069 x 2352) (14,274,288)
        testing = np.reshape(testing, [testing.shape[0], testing.shape[1], testing.shape[2], testing.shape[3]])

        # save numpy array as .npy formats
        np.save(file_name_testing, testing)
#         self.logger.debug("Saving NPY File: " + file_name_testing)
        np.save(file_name_testing_labels, training_labels)
#         self.logger.debug("Saving NPY File: " + file_name_testing_labels)
#         self.logger.debug("Closing CSV file")

    def npy_training_files_split(self, split_number, file_name_training, file_name_training_labels, file_name_testing,
                                 file_name_testing_labels):
        training = []
        training_labels = []
        testing = []
        testing_labels = []

#         self.logger.debug("Opening CSV file")
        count = 0
        with open(self.csv_path) as csvDataFile:
            csv_reader = csv.reader(csvDataFile)
            self.total_records_training = 0
            self.total_records_testing = 0
            for row in csv_reader:
                column_id = row[0]
                label = row[1]
                # just discard the first row
                if column_id != "ID":
#                     self.logger.debug("Processing image: " + column_id)
                    # load first the image from the folder
                    eye_image = os.path.join(self.training_path, column_id)
                    image = cv2.imread(eye_image)
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    if count < split_number:
                        testing.append(image)
                        testing_labels.append(label)
                        self.total_records_testing = self.total_records_testing + 1
                    else:
                        training.append(image)
                        training_labels.append(label)
                        self.total_records_training = self.total_records_training + 1
                    count = count + 1

        testing = np.array(testing, dtype='uint8')
        testing_labels = np.array(testing_labels, dtype='uint8')
        testing = np.reshape(testing, [testing.shape[0], testing.shape[1], testing.shape[2], testing.shape[3]])

        # save numpy array as .npy formats
        np.save(file_name_testing, testing)
        np.save(file_name_testing_labels, testing_labels)

        training = np.array(training, dtype='uint8')
        training_labels = np.array(training_labels, dtype='uint8')
        # convert (number of images x height x width x number of channels) to (number of images x (height * width *3))
        # for example (6069 * 28 * 28 * 3)-> (6069 x 2352) (14,274,288)
        training = np.reshape(training, [training.shape[0], training.shape[1], training.shape[2], training.shape[3]])

        # save numpy array as .npy formats
        np.save(file_name_training, training)
#         self.logger.debug("Saving NPY File: " + file_name_training)
        np.save(file_name_training_labels, training_labels)
#         self.logger.debug("Saving NPY File: " + file_name_training_labels)
#         self.logger.debug("Closing CSV file")

    def is_sickness(self, row, sickness):
        switcher = {
            "normal": row[1] == '1' and row[2] == '0' and row[3] == '0' and row[4] == '0' and row[5] == '0' and row[
                6] == '0' and row[7] == '0' and row[8] == '0',
            "diabetes": row[1] == '0' and row[2] == '1' and row[3] == '0' and row[4] == '0' and row[5] == '0' and row[
                6] == '0' and row[7] == '0' and row[8] == '0',
            "glaucoma": row[1] == '0' and row[2] == '0' and row[3] == '1' and row[4] == '0' and row[5] == '0' and row[
                6] == '0' and row[7] == '0' and row[8] == '0',
            "cataract": row[1] == '0' and row[2] == '0' and row[3] == '0' and row[4] == '1' and row[5] == '0' and row[
                6] == '0' and row[7] == '0' and row[8] == '0',
            "amd": row[1] == '0' and row[2] == '0' and row[3] == '0' and row[4] == '0' and row[5] == '1' and row[
                6] == '0' and row[7] == '0' and row[8] == '0',
            "hypertension": row[1] == '0' and row[2] == '0' and row[3] == '0' and row[4] == '0' and row[5] == '0' and
                            row[6] == '1' and row[7] == '0' and row[8] == '0',
            "myopia": row[1] == '0' and row[2] == '0' and row[3] == '0' and row[4] == '0' and row[5] == '0' and row[
                6] == '0' and row[7] == '1' and row[8] == '0',
            "others": row[1] == '0' and row[2] == '0' and row[3] == '0' and row[4] == '0' and row[5] == '0' and row[
                6] == '0' and row[7] == '0' and row[8] == '1'
        }
        return switcher.get(sickness, False)

    def npy_training_files_split_all(self, split_number, file_name_training, file_name_training_labels,
                                     file_name_testing,
                                     file_name_testing_labels, include_augmented, split_factor=10820):
        """Split the labelled images into a per-class hold-out set and training arrays saved as .npy.

        Every data row of ``self.csv_path`` is read together with its image from
        ``self.training_path``. For each of the eight classes, the first rows
        labelled with exactly that single class go to the testing set until the
        class holds ``split_number / 8`` images; every other row goes to the
        training set. When ``include_augmented`` is true, the rows of
        ``self.csv_augmented_path`` (images in ``self.augmented_path``) are added
        to the training data: to part 1 while fewer than ``split_factor``
        training images have been collected, to part 2 afterwards.

        Images and labels are stacked into ``uint8`` arrays and saved as they
        are (no flattening is applied).

        Side effects:
            * resets and recounts ``self.total_records_training`` and
              ``self.total_records_testing``;
            * saves ``file_name_testing`` and ``file_name_testing_labels``;
            * saves ``file_name_training + '_1'`` and
              ``file_name_training_labels + '_1'``, plus the ``'_2'`` pair when
              ``include_augmented`` is true;
            * writes ``files_used.csv`` in the current directory with one
              ``image id, class, index within class`` line per held-out image.

        Args:
            split_number: total size of the hold-out set, shared evenly by the eight classes.
            file_name_training: base name of the training image arrays.
            file_name_training_labels: base name of the training label arrays.
            file_name_testing: name of the testing image array.
            file_name_testing_labels: name of the testing label array.
            include_augmented: whether the augmented CSV/images are added to the training data.
            split_factor: number of training images after which augmented images
                are stored in part 2 instead of part 1.

        Raises:
            OSError: if an image referenced by a CSV row cannot be read.
        """
        class_names = ['normal', 'diabetes', 'glaucoma', 'cataract', 'amd',
                       'hypertension', 'myopia', 'others']
        class_count = dict.fromkeys(class_names, 0)
        # Per-class quota of the hold-out set (true division, compared with `<`).
        split_pocket = split_number / len(class_names)

        training, training_labels = [], []
        training_2, training_labels_2 = [], []
        testing, testing_labels = [], []
        images_used = []
        count_images = 0
        self.total_records_training = 0
        self.total_records_testing = 0

        for row, labels, image in self._read_labelled_images(self.csv_path, self.training_path):
            # The classes are mutually exclusive (exactly-one-hot), so at most one name matches.
            sickness = next((name for name in class_names if self.is_sickness(row, name)), None)
            if sickness is not None and class_count[sickness] < split_pocket:
                testing.append(image)
                testing_labels.append(labels)
                # The index is recorded before the counter is incremented.
                images_used.append([row[0], sickness, str(class_count[sickness])])
                class_count[sickness] += 1
                self.total_records_testing += 1
            else:
                training.append(image)
                training_labels.append(labels)
                self.total_records_training += 1
                count_images += 1

        if include_augmented:
            for row, labels, image in self._read_labelled_images(self.csv_augmented_path,
                                                                 self.augmented_path):
                if count_images >= split_factor:
                    training_2.append(image)
                    training_labels_2.append(labels)
                else:
                    training.append(image)
                    training_labels.append(labels)
                self.total_records_training += 1
                count_images += 1

        # Stack the lists into uint8 arrays. No reshape is needed: the arrays
        # are stored with the shape numpy gives them, and an empty part (for
        # example no image spilling into part 2) is saved as an empty array
        # instead of failing on a missing dimension.
        np.save(file_name_testing, np.array(testing, dtype='uint8'))
        np.save(file_name_testing_labels, np.array(testing_labels, dtype='uint8'))

        training = np.array(training, dtype='uint8')
        training_labels = np.array(training_labels, dtype='uint8')
        if include_augmented:
            training_2 = np.array(training_2, dtype='uint8')
            training_labels_2 = np.array(training_labels_2, dtype='uint8')

        np.save(file_name_training + '_1', training)
        np.save(file_name_training_labels + '_1', training_labels)
        if include_augmented:
            np.save(file_name_training + '_2', training_2)
            np.save(file_name_training_labels + '_2', training_labels_2)

        # The context manager guarantees the report is flushed and closed.
        with open("files_used.csv", 'w', newline='') as used_file:
            csv.writer(used_file).writerows(images_used)

    @staticmethod
    def _read_labelled_images(csv_path, image_dir):
        """Yield ``(row, labels, rgb_image)`` for every data row of ``csv_path``.

        ``labels`` are columns 1..8 of the row (still strings); the image is
        read from ``image_dir`` using column 0 as file name and converted from
        BGR to RGB.
        """
        with open(csv_path, newline='') as csv_data_file:
            for row in csv.reader(csv_data_file):
                # Blank lines are returned as empty rows; a row whose id is "ID" is a header.
                if not row or row[0] == "ID":
                    continue
                labels = [row[column] for column in range(1, 9)]
                image_path = os.path.join(image_dir, row[0])
                image = cv2.imread(image_path)
                if image is None:
                    # imread gives None instead of raising when the file is missing or unreadable.
                    raise OSError("Could not read image: %s" % image_path)
                yield row, labels, cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [43]:
def main():
    """Generate every .npy dataset file for the 224-pixel ODIR-5K images.

    Builds a ``NumpyDataGenerator`` from the image folders and ground-truth
    CSV files, then produces, in order: the challenge testing arrays, the full
    training arrays, a 1000-image train/test split, and a 400-image split that
    also includes the augmented images.

    Returns:
        float: wall-clock seconds spent generating the files.
    """
    start = time.time()
    image_width = 224
    width_suffix = '_' + str(image_width)
    training_path = 'ODIR-5K_Training_Dataset_treated' + width_suffix
    testing_path = 'ODIR-5K_Testing_Dataset_treated' + width_suffix
    augmented_path = 'ODIR-5K_Training_Dataset_augmented' + width_suffix
    # os.path.join keeps these paths valid on every platform; a hard-coded
    # backslash separator only resolves on Windows.
    csv_file = os.path.join('ground_truth', 'odir.csv')
    csv_augmented_file = os.path.join('ground_truth', 'odir_augmented.csv')
    training_file = os.path.join('ground_truth', 'testing_default_value.csv')
    generator = NumpyDataGenerator(training_path, testing_path, csv_file, training_file, augmented_path,
                                   csv_augmented_file)

    # Generate testing file
    generator.npy_testing_files('odir_testing_challenge' + width_suffix,
                                'odir_testing_labels_challenge' + width_suffix)

    # Generate training file
    generator.npy_training_files('odir_training', 'odir_training_labels')
    generator.npy_training_files_split(1000, 'odir_training',
                                       'odir_training_labels', 'odir_testing', 'odir_testing_labels')

    generator.npy_training_files_split_all(400, 'odir_training' + width_suffix,
                                           'odir_training_labels' + width_suffix,
                                           'odir_testing' + width_suffix,
                                           'odir_testing_labels' + width_suffix,
                                           True)
    end = time.time()
    # The timing was measured but previously discarded; hand it to the caller.
    return end - start
    
# Run the whole dataset-generation pipeline when this cell executes, then
# print a marker so the cell output shows that it ran to completion.
main()
print("done")

done
