In [None]:
import os
import gc

import glob

import numpy as np
import pandas as pd
from skimage import io
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [None]:

# Class categories of interest; a name's position in this list is its integer id
LABELS = [
    'Branching',
    'Fish',
    'Massive',
    'Not Massive',
    'Substrate',
    'Target',
    'Water',
]

# Lookup from class name to its position in LABELS ('Branching' -> 0 ... 'Water' -> 6)
labels = {name: index for index, name in enumerate(LABELS)}

In [None]:
# Collects every .bmp patch located exactly one folder below Patches/Manual
# (the name of that folder is used later as the class label); change the root
# as needed.
# The pattern is assembled with os.path.join so it does not depend on the
# Windows "\\" separator. "*" is used for the class folder because "**" only
# recurses when recursive = True; with recursive = False it matches the same
# single directory level as "*".
# Sorting gives a stable file order regardless of how the OS lists directories.
data = sorted(glob.glob(os.path.join("Patches", "Manual", "*", "*.bmp"), recursive = False))
len(data)

In [None]:
# Keeps a random 25% of the patches, drawn without replacement.
# A dedicated, seeded RandomState makes the draw repeatable between runs (for
# the same input order) and leaves NumPy's global random state untouched.
# NOTE: this overwrites `data`, so running this cell twice in a row samples
# from the already reduced set; re-run the cell that collects the patches first.
subsample_rng = np.random.RandomState(42)
data = subsample_rng.choice(data, size = int(len(data) * .25), replace = False)

In [None]:
# Takes all of the data and splits it into training, validation and test sets,
# and provides the corresponding label for each patch based on the folder it is
# located in.
# 10% of the data is held out from training, and that held-out part is split
# again 90/10, i.e. roughly 90% train / 9% validation / 1% test overall.
# A fixed random_state keeps the split identical from run to run.
def _label_from_path(path):
    """Returns the name of the folder that directly contains `path`."""
    # os.path handles the separator of the running platform, unlike a
    # hard-coded split on "\\"
    return os.path.basename(os.path.dirname(str(path)))


training_files, held_out_files = train_test_split(data, test_size = .1, random_state = 42)
validation_files, test_files = train_test_split(held_out_files, test_size = .1, random_state = 42)

training_labels = [_label_from_path(file) for file in training_files]
validation_labels = [_label_from_path(file) for file in validation_files]
test_labels = [_label_from_path(file) for file in test_files]


# One row per patch: the path to the image and the name of its class
train = pd.DataFrame(data = list(zip(training_files, training_labels)), columns = ['images', 'labels'])
valid = pd.DataFrame(data = list(zip(validation_files, validation_labels)), columns = ['images', 'labels'])
test = pd.DataFrame(data = list(zip(test_files, test_labels)), columns = ['images', 'labels'])

len(train), len(valid), len(test)

In [None]:
from imgaug import augmenters as iaa

# Augmentation methods for the training patches, grouped by what they do.
# The augmenters are created in the same order in which the pipeline applies them.

# Resize to 224, then random flips, 90-degree rotations and a slight zoom
geometric_augs = [
    iaa.Resize(224, interpolation = 'linear'),
    iaa.Fliplr(0.5),
    iaa.Flipud(0.5),
    iaa.Rot90([1, 2, 3, 4], keep_size = True),
    iaa.Sometimes(.3, iaa.Affine(scale = (.95, 1.05))),
]

# Inverts the image in 10% of the cases
invert_aug = iaa.Sometimes(.1, iaa.Invert(1.0))

# In 50% of the cases, applies at most one of the filters below
filter_aug = iaa.Sometimes(.5, iaa.SomeOf((0, 1), [
    iaa.MedianBlur(3),
    iaa.ChannelShuffle(.7),
    iaa.EdgeDetect(.5),
]))

# In 50% of the cases, applies at most one of the noise types below
noise_aug = iaa.Sometimes(.5, iaa.SomeOf((0, 1), [
    iaa.Dropout(.2),
    iaa.ImpulseNoise(.2),
    iaa.SaltAndPepper(.2),
]))

augs_for_train = iaa.Sequential(geometric_augs + [invert_aug, filter_aug, noise_aug])


# Validation (and test) patches are only resized, never augmented
augs_for_valid = iaa.Sequential([
    iaa.Resize(224, interpolation = 'linear'),
])

In [None]:
# Data generators that take the files listed in the dataframes created earlier
# and turn them into a pipeline: patches are passed through the imgaug pipeline
# (preprocessing_function), rescaled by 1/255 and then fed directly to the
# model in batches during training and validation.
#
# Batch size is dependent on the amount of memory available on your machine
import tensorflow as tf
import keras

from keras.preprocessing.image import ImageDataGenerator

batch_size = 32

# Settings shared by the training and validation generators.
# `classes = LABELS` pins the class -> index mapping to the order of LABELS, so
# every split one-hot encodes its labels the same way (and with the same width)
# even when a class happens to be missing from that split. When `classes` is
# omitted, each generator infers the mapping from its own dataframe.
flow_options = dict(directory = None,
                    x_col = 'images', y_col = 'labels', classes = LABELS,
                    target_size = (224, 224), color_mode = "rgb",
                    class_mode = 'categorical',
                    batch_size = batch_size, shuffle = True, seed = 42)

# Training images are augmented, and then lightly pre-processed
train_augmentor = ImageDataGenerator(preprocessing_function = augs_for_train.augment_image,
                                     rescale = 1.0/255.0)

# Reading from dataframe, can save augmented images if needed
train_generator = train_augmentor.flow_from_dataframe(dataframe = train, **flow_options)

# Only pre-process images, no augmentation
validate_augmentor = ImageDataGenerator(preprocessing_function = augs_for_valid.augment_image,
                                        rescale = 1.0/255.0)

# Reading from dataframe
validation_generator = validate_augmentor.flow_from_dataframe(dataframe = valid, **flow_options)

In [None]:
# Defines the length of an epoch as the number of batches needed to use every
# image once. Step counts must be whole numbers, so the division is rounded up
# to include the final, partially filled batch instead of being left as a float.
steps_per_epoch_train = int(np.ceil(len(train) / batch_size))

# Same for the validation set
steps_per_epoch_valid = int(np.ceil(len(valid) / batch_size))

In [None]:
# Creates the model: an EfficientNetB0 backbone that starts from the
# noisy-student weights, followed by dropout and a softmax classification layer.
# Find the efficientnet repo here:
# https://github.com/qubvel/efficientnet
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.applications.nasnet import NASNetMobile  # not used in this cell
import efficientnet.keras as efn 

# Pre-trained feature extractor without its original classification head
backbone = efn.EfficientNetB0(weights = 'noisy-student', include_top = False,  pooling = 'max')

model = Sequential([
        backbone,
        Dropout(.80),
        # One output per entry in LABELS, so the layer width follows the label
        # list instead of a hard-coded 7
        Dense(len(LABELS)),
        Activation('softmax')
])

In [None]:
# Prints a layer-by-layer overview of the model defined above
model.summary()

In [None]:
# Defining some metrics
from keras import optimizers, losses, metrics
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall metric for Keras: true positives / actual positives.

    Both counts are taken after clipping to [0, 1] and rounding, so an entry
    only counts as a true positive when y_true * y_pred rounds to 1.

    Args:
        y_true: tensor of ground-truth values passed in by Keras.
        y_pred: tensor of model outputs passed in by Keras.

    Returns:
        The recall as a tensor summed over all entries; K.epsilon() is added
        to the denominator to avoid dividing by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric for Keras: true positives / predicted positives.

    Both counts are taken after clipping to [0, 1] and rounding, so an output
    only counts as a predicted positive when it rounds to 1.

    Args:
        y_true: tensor of ground-truth values passed in by Keras.
        y_pred: tensor of model outputs passed in by Keras.

    Returns:
        The precision as a tensor summed over all entries; K.epsilon() is
        added to the denominator to avoid dividing by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

In [None]:
# Callbacks used during training:
#  - ReduceLROnPlateau multiplies the learning rate by .65 once the validation
#    loss has gone 2 epochs without decreasing.
#  - ModelCheckpoint only writes the weights when the validation loss improves,
#    so the file ends up holding the weights of the epoch with the lowest
#    validation loss.
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

lr_on_plateau = ReduceLROnPlateau(monitor = 'val_loss',
                                  factor = .65,
                                  patience = 2,
                                  verbose = 1)

best_weights_saver = ModelCheckpoint(filepath = 'path_to_weights.h5',
                                     monitor = 'val_loss',
                                     save_best_only = True,
                                     save_weights_only = True,
                                     verbose = 1)

holla = [lr_on_plateau, best_weights_saver]


In [None]:
# Sets the loss function, optimizer and metrics. These probably don't need to
# change, except maybe the learning rate.

adam = optimizers.Adam(lr = .0001)

model.compile(optimizer = adam,
              loss = 'categorical_crossentropy',
              metrics = ['acc', precision_m, recall_m])

In [None]:
# Trains the model; the results of the training are logged in history
fit_options = dict(epochs = 100,
                   steps_per_epoch = steps_per_epoch_train,
                   validation_data = validation_generator,
                   validation_steps = steps_per_epoch_valid,
                   callbacks = holla,
                   verbose = 1)

history = model.fit_generator(train_generator, **fit_options)

In [None]:
# After training, loads the best weights. The file name must match the filepath
# given to the ModelCheckpoint callback, which is the only place in this
# notebook where the weights are written.
model.load_weights('path_to_weights.h5')

In [None]:
# Reads from dataframe for the test set. Shuffling stays off so the order of the
# predictions matches the order of test_generator.classes.
# `classes = LABELS` pins the class -> index mapping to the order of LABELS;
# the test split is small, and when `classes` is omitted a class missing from
# it would change the indices the generator assigns to the remaining classes.
test_generator = validate_augmentor.flow_from_dataframe(dataframe = test, directory = None,
                                                 x_col = 'images', y_col = 'labels', classes = LABELS,
                                                 target_size = (224, 224), 
                                                 color_mode = "rgb",  class_mode = 'categorical', 
                                                 batch_size = batch_size, shuffle = False, seed = 42)
# Defines the length of an epoch: a whole number of batches, rounded up so the
# final, partially filled batch is included and every test image is predicted
steps_per_epoch_test = int(np.ceil(len(test) / batch_size))

In [None]:
# Evaluates the model on the test set: reports the number of images, the
# classification accuracy (should be above 97%) and the confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import cross_val_score

# Class scores for every test image; kept for the thresholding done afterwards.
# Shuffling needs to stay off for the test generator so rows line up with test_y.
predictions = model.predict_generator(test_generator, steps = steps_per_epoch_test)

# Highest-scoring class per image
predict_classes = np.argmax(predictions, axis = 1)

# Ground-truth class index per image, as assigned by the generator
test_y = test_generator.classes

test_accuracy = accuracy_score(y_true = test_y, y_pred = predict_classes)
test_confusion = confusion_matrix(y_true = test_y, y_pred = predict_classes)

print("# of images:", len(predict_classes))
print(test_accuracy)
print(test_confusion)

In [None]:
# Higher values represent more sure/confident predictions
# .1 unsure -> .5 pretty sure -> .9 very sure
#
# How sure a prediction is gets measured as the margin between its highest and
# its second highest class score. For each threshold, only the predictions whose
# margin exceeds the threshold are kept and their accuracy is computed.
#
# The resulting graph of threshold values against accuracy is useful for
# determining how sure the model is when making predictions.

threshold_values = np.arange(0.0, 1.0, 0.05)

# Margin per image, computed once instead of re-sorting every row for every
# threshold
ranked_scores = np.sort(predictions, axis = 1)
margins = ranked_scores[:, -1] - ranked_scores[:, -2]

# True where the predicted class equals the ground-truth class
is_correct = np.asarray(test_y) == np.asarray(predict_classes)

class_ACC = []

for threshold in threshold_values:
    sure_mask = margins > threshold

    if sure_mask.any():
        class_ACC.append(float(is_correct[sure_mask].mean()))
    else:
        # No prediction is this sure: the accuracy is undefined, so record NaN
        # (drawn as a gap in the plot) rather than failing on an empty selection
        class_ACC.append(np.nan)

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(threshold_values, class_ACC)
plt.xlabel('Threshold Values')
plt.xlim([0, 1])
plt.xticks(ticks = np.arange(0, 1.05, 0.1))
plt.ylabel('Classification Accuracy')
plt.title('Identifying the ideal threshold value')
plt.show()