In [None]:
import os
import glob

import warnings
warnings.filterwarnings("ignore")

import math
import numpy as np
import pandas as pd
from skimage import io
import matplotlib.pyplot as plt

import tensorflow
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers, losses, metrics
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import *

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

from imgaug import augmenters as iaa


In [None]:
def compute_class_weights(df, mu=0.15):
    """Derive a loss weight for every label found in ``df['Label']``.

    A label that occurs ``count`` times out of ``total`` counted rows gets the
    weight ``log(mu * total / count)``, raised to 1.0 whenever the logarithm is
    not strictly greater than 1.0.  Rare labels therefore receive weights
    above 1.0 while common labels stay at 1.0.

    Args:
        df: DataFrame with a ``'Label'`` column.
        mu: Factor applied to ``total / count`` before taking the logarithm.

    Returns:
        dict mapping each label to its weight, in ``value_counts()`` order.
    """
    counts_by_label = df['Label'].value_counts().to_dict()
    n_counted = sum(counts_by_label.values())

    # max(1.0, score) keeps 1.0 unless the score is strictly larger.
    return {
        label: max(1.0, math.log(mu*n_counted/float(count)))
        for label, count in counts_by_label.items()
    }

In [None]:
# Fine-grained annotation codes, in the order the rest of the notebook uses.
# Every code has the form "<prefix>_<suffix>".
short_codes = [
    'LC_Fol', 'LC_Branch', 'Ar_TA', 'LC_Por', 'UDC_CCA', 'Ro_TA', 'Pa_FA',
    'Mu_Ba', 'CR_TA', 'Pa_Cy', 'LC_Encr', 'Sa_Ba', 'Pa_H', 'Pa_TA', 'UDC_H',
    'CR_FA', 'Ro_CCA', 'Mu_Cy', 'UDC_TA', 'Sa_Cy', 'UDC_FA', 'Ro_H', 'Sa_FA',
    'Pa_CCA', 'Ro_Ba', 'Ro_Cy', 'Sa_TA', 'Ar_Ba', 'UDC_Cy', 'Ar_FA', 'CR_CCA',
]

# The coarse class of a code is fully determined by its prefix.
_COARSE_BY_PREFIX = {
    'LC': 'Live_Coral',
    'Ar': 'Artificial',
    'UDC': 'Dead_Coral',
    'Ro': 'Rock',
    'Pa': 'Pavement',
    'Mu': 'Mud',
    'CR': 'Coral_Rubble',
    'Sa': 'Sand',
}

# short code -> coarse class (keys follow short_codes order)
coarse_classes = {
    code: _COARSE_BY_PREFIX[code.split('_')[0]]
    for code in short_codes
}

# short code -> functional group (keys follow short_codes order):
# every "LC_*" code is Hard_Coral, every "*_Cy" code is Other, the rest Algae.
functional_groups = {
    code: (
        'Hard_Coral' if code.startswith('LC_')
        else 'Other' if code.endswith('_Cy')
        else 'Algae'
    )
    for code in short_codes
}

In [None]:
# Folder holding the annotation CSV that is read later in the notebook.
DATA_PATH = "/home/azureuser/cloudfiles/code/Users/jordan.pierce/Data/Guam_Saipan/3653/"
assert os.path.exists(DATA_PATH)

# Output layout: Experiments/<EXP_NAME>/{Weights,Logs}/
EXP_DIR = "Experiments/"
EXP_NAME = "Short_Codes_With_Weights_Even"
EXP_FOLDER = f"{EXP_DIR}{EXP_NAME}/"
WEIGHTS_DIR = f"{EXP_FOLDER}Weights/"
LOGS_DIR = f"{EXP_FOLDER}Logs/"

# Class weighting stays on unless the experiment name contains "No_Weights".
weighted = "No_Weights" not in EXP_NAME

for out_dir in (EXP_DIR, EXP_FOLDER, WEIGHTS_DIR, LOGS_DIR):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Load the annotations; the first CSV column becomes the index.
data = pd.read_csv(DATA_PATH + "Updated_CNet_Annotations.csv", index_col=0)

# List of Image Names
image_names = data['Image'].unique().tolist()
print("Number Images: ", len(image_names))

# Keywords in the experiment name select an optional remapping of the
# fine-grained codes onto a smaller label set.
if "Coarse" in EXP_NAME:
    label_map = coarse_classes
elif "Functional" in EXP_NAME:
    label_map = functional_groups
else:
    label_map = None

if label_map is not None:
    # Assign the result back to the column: an in-place replace on
    # data["Label"] acts on a selection and does not reliably update `data`.
    data["Label"] = data["Label"].replace(label_map)
    # Later cells size the model and label the plots from `short_codes`, so
    # it must list the remapped classes instead of the fine-grained codes.
    short_codes = sorted(data["Label"].dropna().unique().tolist())

# All class categories in the dataset
print("Class Categories: ", len(short_codes))

# Fail with a readable message if a class has no annotation at all
# (sampling from an empty selection raises a far less helpful ValueError).
present_labels = set(data["Label"].dropna().unique())
missing_labels = [c for c in short_codes if c not in present_labels]
if missing_labels:
    raise ValueError(
        "No annotations found for class(es): " + ", ".join(map(str, missing_labels))
    )

# One row per class category.  The fixed random_state keeps the pick -- and
# therefore the dataset split built from it -- identical between runs.
sample = pd.concat(
    [data[data['Label'] == category].sample(n=1, random_state=0)
     for category in short_codes]
)

In [None]:
def _label_percentages(frame):
    """Return each label's share of ``frame`` in percent, indexed by label."""
    return frame.groupby("Label").size() / len(frame) * 100


threshold_met = False
seed = 0

# Try successive seeds until the label distributions of the three splits are
# mutually close (see the threshold test at the bottom of the loop).
while not threshold_met:

    # Images, not patches, are split: 65% for training, and the remaining 35%
    # halved into validation and test.
    training_images, holdout_images = train_test_split(image_names, test_size=0.35, random_state=seed)
    validation_images, testing_images = train_test_split(holdout_images, test_size=0.5, random_state=seed)

    # Each split also receives the one-row-per-class `sample`, so every class
    # occurs in every split.
    train, valid, test = (
        pd.concat((data[data['Image'].isin(images)], sample))
        for images in (training_images, validation_images, testing_images)
    )

    # Label shares (in percent) of the pooled data and of each split
    class_percentages_combined = _label_percentages(pd.concat([train, valid, test]))
    class_percentages_train = _label_percentages(train)
    class_percentages_valid = _label_percentages(valid)
    class_percentages_test = _label_percentages(test)

    # Pairwise mean squared error between the splits' label shares
    mse_train_valid = mean_squared_error(class_percentages_train, class_percentages_valid)
    mse_train_test = mean_squared_error(class_percentages_train, class_percentages_test)
    mse_valid_test = mean_squared_error(class_percentages_valid, class_percentages_test)

    # The threshold is 5% above the mean of the three MSE values, so a seed is
    # accepted only when no pair is much worse than the other two.
    threshold = 1.05 * (mse_train_valid + mse_train_test + mse_valid_test) / 3

    if max(mse_train_valid, mse_train_test, mse_valid_test) <= threshold:

        threshold_met = True
        print("Class distributions are similar between all three dataframes.")

        # Print the MSE values
        print("MSE between train and valid: {:.4f}".format(mse_train_valid))
        print("MSE between train and test: {:.4f}".format(mse_train_test))
        print("MSE between valid and test: {:.4f}".format(mse_valid_test))

    else:
        seed += 1



# Bar chart of the label counts in each split, saved next to the other
# experiment outputs.
fig, axes = plt.subplots(1, 3, figsize=(20, 10))

for ax, (split_name, split_df) in zip(axes, (("Train", train), ("Valid", valid), ("Test", test))):
    # value_counts() orders the bars by frequency and pandas labels each bar
    # with its own class name.  Those labels must not be overwritten with
    # short_codes, whose order is unrelated to the bar order.
    split_df['Label'].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(split_name + ": " + str(len(split_df)))

fig.savefig(EXP_FOLDER + "DatasetSplit.png")
plt.show()

# Number of distinct classes actually present in each split
print("Train Class Categories: ", len(train['Label'].unique().tolist()))
print("Validation Class Categories: ", len(valid['Label'].unique().tolist()))
print("Test Class Categories: ", len(test['Label'].unique().tolist()))

In [None]:
# imgaug pipelines used by the data generators.  Training patches are resized
# and then randomly flipped, rotated, scaled, inverted, filtered and corrupted
# with noise; validation (and test) patches are only resized.

# Dropout rate of the classifier head that is built later in the notebook
dropout_rate = 0.80

# Zero or one of these filters is picked whenever the group is applied
filter_augs = iaa.SomeOf((0, 1), [
    iaa.MedianBlur(3),
    iaa.ChannelShuffle(.7),
    iaa.EdgeDetect(.5),
])

# Zero or one of these noise sources is picked whenever the group is applied
noise_augs = iaa.SomeOf((0, 1), [
    iaa.Dropout(.2),
    iaa.ImpulseNoise(.2),
    iaa.SaltAndPepper(.2),
])

augs_for_train = iaa.Sequential([
    iaa.Resize(224, interpolation='linear'),
    iaa.Fliplr(0.5),
    iaa.Flipud(0.5),
    iaa.Rot90([1, 2, 3, 4], True),
    iaa.Sometimes(.3, iaa.Affine(scale=(.95, 1.05))),
    iaa.Sometimes(.1, iaa.Invert(1.0)),
    iaa.Sometimes(.5, filter_augs),
    iaa.Sometimes(.5, noise_augs),
])

augs_for_valid = iaa.Sequential([iaa.Resize(224, interpolation='linear')])

In [None]:
# The generators below read every patch from the path stored in the dataframe
# and run it through an imgaug pipeline on the fly while training.  No
# rescaling happens here; the ConvNeXt backbone is created with
# include_preprocessing=True.

# Number of epochs to train for
num_epochs = 15

# Batch size is dependent on the amount of memory available on your machine
batch_size = 32

# Length of an epoch in batches.  Keras expects integer step counts, so round
# up: the final, partially filled batch still counts as a step and every row
# is used.
steps_per_epoch_train = math.ceil(len(train) / batch_size)
steps_per_epoch_valid = math.ceil(len(valid) / batch_size)

# Learning rate
# NOTE(review): the compile cell passes its own literal (0.00001) to Adam and
# does not read this value -- confirm which one is intended.
lr = .0001

# Arguments shared by the training and validation generators
flow_kwargs = dict(directory = None,
                   x_col = 'Patch_Name',
                   y_col = 'Label',
                   target_size = (224, 224),
                   color_mode = "rgb",
                   class_mode = 'categorical',
                   batch_size = batch_size,
                   shuffle = True,
                   seed = 42)

# Training patches go through the full augmentation pipeline
train_augmentor = ImageDataGenerator(preprocessing_function = augs_for_train.augment_image)
train_generator = train_augmentor.flow_from_dataframe(dataframe = train, **flow_kwargs)

# Validation patches are only resized
validate_augmentor = ImageDataGenerator(preprocessing_function = augs_for_valid.augment_image)
validation_generator = validate_augmentor.flow_from_dataframe(dataframe = valid, **flow_kwargs)

In [None]:
# Build the classifier: an ImageNet-initialised ConvNeXt-Tiny backbone without
# its classification head, followed by dropout and a softmax output with one
# unit per entry of short_codes.
convnet = tensorflow.keras.applications.convnext.ConvNeXtTiny(
    model_name='convnext_tiny',
    weights='imagenet',
    input_shape=(224, 224, 3),
    include_top=False,
    include_preprocessing=True,
    pooling='max',
    # NOTE(review): per the Keras docs `classes` and `classifier_activation`
    # only apply when include_top=True -- presumably ignored here; confirm.
    classes=len(short_codes),
    classifier_activation='softmax',
)

model = Sequential()
for layer in (convnet,
              Dropout(dropout_rate),
              Dense(len(short_codes)),
              Activation('softmax')):
    model.add(layer)

# Display the model architecture
model.summary()



In [None]:
# Defining the Recall and Precision metric functions

def recall_m(y_true, y_pred):
    """Recall pooled over every element of the two tensors.

    An element counts as positive when its value, clipped to [0, 1], rounds
    to 1.  The result is the number of elements that are positive in
    ``y_true * y_pred`` divided by the number that are positive in ``y_true``;
    ``K.epsilon()`` in the denominator avoids a division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision pooled over every element of the two tensors.

    An element counts as positive when its value, clipped to [0, 1], rounds
    to 1.  The result is the number of elements that are positive in
    ``y_true * y_pred`` divided by the number that are positive in ``y_pred``;
    ``K.epsilon()`` in the denominator avoids a division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_positives = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted_positives) + K.epsilon())

In [None]:
# Training callbacks, all driven by the validation loss:
#   * ReduceLROnPlateau: factor .65 with a patience of 5 epochs.
#   * ModelCheckpoint: weights only, written for every epoch
#     (save_best_only is False); the file name carries the epoch number,
#     acc and val_acc.
#   * EarlyStopping: patience of 10 epochs, restore_best_weights enabled.

reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = .65, patience = 5, verbose = 1)

checkpoint = ModelCheckpoint(filepath = WEIGHTS_DIR + 'model-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5',
                             monitor = 'val_loss',
                             save_weights_only = True,
                             save_best_only = False,
                             verbose = 1)

early_stopping = EarlyStopping(monitor = "val_loss",
                               min_delta = 0,
                               patience = 10,
                               verbose = 0,
                               mode = "auto",
                               baseline = None,
                               restore_best_weights = True,
                               start_from_epoch = 0)

callbacks = [reduce_lr, checkpoint, early_stopping]

In [None]:
# Compile the model: categorical cross-entropy loss, the Adam optimizer, and
# accuracy plus the custom precision/recall metrics.
# NOTE(review): the learning rate is the literal 0.00001 here, not the `lr`
# value (.0001) defined in the generator cell -- confirm which is intended.
adam = optimizers.Adam(learning_rate=0.00001)

model.compile(optimizer = adam,
              loss = 'categorical_crossentropy',
              metrics = ['acc', precision_m, recall_m])

In [None]:
# Per-class loss weights: frequency based when `weighted`, uniform otherwise.
# The generator's class_indices maps every class name to the integer index
# used for its one-hot column.
class_indices = train_generator.class_indices

if weighted:
    class_weight = compute_class_weights(train)
else:
    class_weight = {label: 1.0 for label in class_indices}

# Plot and save the weights, still keyed by class name
plt.figure(figsize=(30,5))
plt.bar(list(class_weight.keys()), list(class_weight.values()))
plt.title("ClassWeight")
plt.savefig(EXP_FOLDER + "ClassWeight.png")
plt.show()

# Reformat for model.fit(), which wants the weights keyed by class index.
# The index must come from the generator: it numbers the classes in its own
# (alphanumeric) order, which is not the order of short_codes, so
# short_codes.index(...) would attach the weights to the wrong classes.
class_weight = {class_indices[label]: weight for (label, weight) in class_weight.items()}

In [None]:
# Fit the model on the training generator, validating on the validation
# generator after each epoch; the returned History object is kept in `history`.
history = model.fit(
    train_generator,
    validation_data = validation_generator,
    epochs = num_epochs,
    steps_per_epoch = steps_per_epoch_train,
    validation_steps = steps_per_epoch_valid,
    class_weight = class_weight,
    callbacks = callbacks,
    verbose = 1,
)

In [None]:
# List every saved checkpoint, oldest first, together with its position in
# the list (nothing is loaded here).
weights = sorted(glob.glob(WEIGHTS_DIR + "*.h5"), key=os.path.getmtime)

for position, weight_path in enumerate(weights):
    print(weight_path, position)

In [None]:
# Load one checkpoint from the `weights` list into the model.
# NOTE(review): the index 3 is hard-coded, i.e. the 4th-oldest .h5 file in
# WEIGHTS_DIR.  It raises IndexError when fewer than four checkpoints exist
# and is not tied to the validation loss -- confirm the choice after each run.
best_weights = weights[3]
print("Best Weights: ", best_weights)
model.load_weights(best_weights)

In [None]:
# Test-set generator: same resize-only pipeline as validation.  Shuffling is
# off so the batches come in the same order as test_generator.classes.
test_generator = validate_augmentor.flow_from_dataframe(
    dataframe = test,
    x_col = 'Patch_Name',
    y_col = 'Label',
    class_mode = 'categorical',
    color_mode = "rgb",
    target_size = (224, 224),
    batch_size = batch_size,
    shuffle = False,
    seed = 42,
)

# Ground-truth class index of every test patch
test_y = test_generator.classes

In [None]:
# Number of batches needed for exactly one pass over the test set.  A step is
# a whole batch, not a single image, so the generator's own length is used
# rather than the number of rows.
steps_per_epoch_test = len(test_generator)

# Use the model to predict on all of the test set.  Model.predict accepts
# generators directly; predict_generator is deprecated.
predictions = model.predict(test_generator, steps=steps_per_epoch_test)

# Collapse the probability distribution to the most likely category
predict_classes = np.argmax(predictions, axis = 1)

In [None]:
# Sanity check: the number of ground-truth labels and the number of
# predictions are displayed side by side and should be equal.
len(test_y), len(predict_classes)

In [None]:
print("# of images:", len(predict_classes))

# Class names ordered by the integer index the generator assigned to them.
# test_y and predict_classes hold those indices, so this -- not the order of
# short_codes -- is the order of the confusion-matrix rows and columns.
class_labels = sorted(test_generator.class_indices, key=test_generator.class_indices.get)
class_ids = list(range(len(class_labels)))

# Create the confusion matrix between the ground-truth and predicted.
# Passing labels= keeps one row/column per class even when a class never
# occurs, so the matrix always lines up with class_labels.
cm = confusion_matrix(y_true=test_y, y_pred=predict_classes, labels=class_ids)

# Create a display for the confusion matrix, providing the labels
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=class_labels)

# Calculate the overall accuracy
overall_accuracy = accuracy_score(y_true=test_y,
                                  y_pred=predict_classes)

# Accuracy per class category: correct predictions divided by the number of
# ground-truth rows of that class; NaN for a class without ground-truth rows.
row_totals = cm.sum(axis=1)
per_class = np.divide(cm.diagonal(), row_totals,
                      out=np.full(len(class_ids), np.nan),
                      where=row_totals > 0)
class_accuracy = dict(zip(class_labels, per_class))

# Write the accuracy per class category to a .csv file
df = pd.DataFrame(list(class_accuracy.items()),
                  columns=['Class', 'Accuracy'])

df.to_csv(EXP_FOLDER + "ClassAccuracy.csv")

# Plot the results
fig, ax = plt.subplots(figsize=(30, 30))
plt.title("Overall Accuracy :" + str(overall_accuracy))
disp.plot(ax=ax)
plt.savefig(EXP_FOLDER + "ConfusionMatrix.png")
print("Class Accuracy: ", df)

In [None]:
# Accuracy as a function of prediction confidence.  Confidence is measured as
# the margin between the two highest class probabilities of a sample; a
# sample counts as "sure" for a threshold when its margin is strictly larger.
# Higher thresholds keep only the more confident predictions
# (.1 unsure -> .5 pretty sure -> .9 very sure).

threshold_values = np.arange(0.0, 1.0, 0.05)

# The margin does not depend on the threshold, so compute it once per sample
# instead of re-sorting every row for every threshold.
top_two = np.sort(predictions, axis=1)[:, -2:]
margins = (top_two[:, 1] - top_two[:, 0]).astype(np.float64)

class_ACC = []

for threshold in threshold_values:
    sure_index = np.flatnonzero(margins > threshold)

    if len(sure_index) == 0:
        # No sample clears this threshold: record a gap instead of failing.
        class_ACC.append(np.nan)
        continue

    sure_test_y = np.take(test_y, sure_index, axis = 0)
    sure_pred_y = np.take(predict_classes, sure_index)

    class_ACC.append(accuracy_score(sure_test_y, sure_pred_y))

plt.figure(figsize=(10, 5))
plt.plot(threshold_values, class_ACC)
plt.xlabel('Threshold Values')
plt.xlim([0, 1])
plt.xticks(ticks = np.arange(0, 1.05, 0.1))
plt.ylabel('Classification Accuracy')
plt.title('Identifying the ideal threshold value')
plt.savefig(EXP_FOLDER + "AccuracyThreshold.png")
plt.show()

In [None]:
# Persist the full model, then the weights of its first layer (the ConvNeXt
# backbone) on their own.
# NOTE(review): the backbone file name ends in "3361" while DATA_PATH points
# at ".../3653/" -- confirm the suffix is still the intended one.
full_model_path = WEIGHTS_DIR + "Best_Model_and_Weights.h5"
backbone_weights_path = WEIGHTS_DIR + 'pre_trained_convnet_weights_3361.h5'

model.save(full_model_path)
backbone = model.layers[0]
backbone.save_weights(backbone_weights_path)