In [None]:
# IMPORTS
import tensorflow as tf 
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve 

In [None]:
# PREPROCESSING
# Loads the class-name table and the train/val/test file lists, collapses the
# original class indices into two classes (0 = no rover, 1 = rover) and builds
# one Keras image generator per split.

# DEFINING DIRECTORIES
# NOTE(review): absolute, machine-specific path -- point this at the local copy of the dataset.
common_path = "/Users/michaelpatsais/Documents/Uni_work/machine_learning/deep-learning-2024/Project/msl-images/"
# Figures are saved to save_directory + "/<name>". An empty string would make
# that path start at the filesystem root, so default to the working directory.
save_directory = "."

idg = ImageDataGenerator(rescale = 1./255.)

# RELABELING CLASSES into those with/without rover
no_rover = [5, 8, 9, 22] # original class indices without rover


def binary_rover_labels(class_ids):
    """Map a Series of original class indices to binary class strings.

    Parameters
    ----------
    class_ids : pandas.Series
        Values convertible with ``int`` (here: strings read from the text files).

    Returns
    -------
    pandas.Series
        "0" where the index is listed in ``no_rover``, "1" otherwise, on the
        same index as ``class_ids``. Strings are used because the generators
        below are given string class names.
    """
    is_no_rover = class_ids.map(int).isin(no_rover)
    return pd.Series(np.where(is_no_rover, "0", "1"), index = class_ids.index, dtype = object)


def load_split(list_file, shuffle):
    """Read one "<filename> <class>" list file and build its image generator.

    Parameters
    ----------
    list_file : str
        File name, relative to ``common_path``, of a space-delimited list.
    shuffle : bool
        Whether the generator shuffles the sample order.

    Returns
    -------
    tuple
        ``(raw_array, dataframe, generator)`` where the dataframe's "class"
        column has already been relabelled to "0"/"1".
    """
    # ndmin = 2 keeps a one-line file as a single row instead of a flat array
    split = np.loadtxt(common_path + list_file, dtype = str, delimiter = " ", ndmin = 2)
    split_df = pd.DataFrame(split, columns = ["filename", "class"])
    split_df["class"] = binary_rover_labels(split_df["class"])
    # class_mode = "binary" yields one 0/1 target per image, which is what the
    # single-sigmoid-output model with BinaryCrossentropy in this notebook expects
    # (the default "categorical" mode would yield one-hot targets instead).
    split_ds = idg.flow_from_dataframe(dataframe = split_df, directory = common_path, classes = classes,
                                       class_mode = "binary", batch_size = 100, shuffle = shuffle)
    return split, split_df, split_ds


# LABELS DF
# NOTE: the column names are historical -- in labels_df, "filename" holds the
# class index and "class" holds the class name.
image_classes = np.loadtxt(common_path + "msl_synset_words-indexed.txt", dtype = str, delimiter = ",", ndmin = 2)
labels_df = pd.DataFrame(image_classes, columns = ["filename", "class"])
labels_df["filename"] = binary_rover_labels(labels_df["filename"])
labels_df["class"] = np.where(labels_df["filename"] == "0", "no_rover", "rover")

# Order the names by class index so that labels[i] is the name of class str(i)
labels = labels_df.drop_duplicates("filename").sort_values("filename")["class"].tolist()
labels = [x.strip() for x in labels]
print(labels)

classes = np.arange(len(labels))
classes = [str(x) for x in classes]
print(classes)

# train dataset (shuffled so that batches mix both classes)
train, train_df, train_ds = load_split("train-calibrated-shuffled.txt", shuffle = True)

# VAL dataset (fixed order: shuffling does not help evaluation)
val, val_df, val_ds = load_split("val-calibrated-shuffled.txt", shuffle = False)

# TEST dataset (fixed order so predictions can be matched to labels)
test, test_df, test_ds = load_split("test-calibrated-shuffled.txt", shuffle = False)

In [None]:
#  MODEL CONFIGURATION
# Builds, compiles and trains a small CNN with a single sigmoid output.

# NOTE: train_ds / val_ds already yield batches of the size they were created
# with, so batch_size is not passed to fit(); the name is kept for reference.
batch_size = 100
epochs = 10
dropout_val = 0.4

# Read a single batch once and take the per-image dimensions from it
# (axis 0 of the batch is the sample axis, the remaining three describe one image).
xpix, ypix, zpix = train_ds[0][0].shape[1:]

model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(64, (5, 5), activation='relu', input_shape=(xpix, ypix, zpix)),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(dropout_val),
    tf.keras.layers.Dense(32, activation='relu'),

    # one output node: sigmoid score for class "1"
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

loss_fn = tf.keras.losses.BinaryCrossentropy() # CategoricalCrossentropy() would mean 2 output nodes

model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

# batch_size is deliberately omitted: Keras asks that it not be specified when
# the data comes from a generator / Sequence, which produces its own batches.
history  = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

In [None]:
# Model Accuracy & Loss
# Prints and plots the per-epoch training and validation curves stored in `history`.

def plot_training_curve(metric, legend_loc, file_stem):
    """Print, plot and save the train/validation curve of one training metric.

    Parameters
    ----------
    metric : str
        Key in ``history.history``; the validation series is read from
        ``'val_' + metric``.
    legend_loc : str
        Matplotlib legend location.
    file_stem : str
        File name (without directory) the figure is saved under.
    """
    train_values = history.history[metric]
    val_values = history.history['val_' + metric]

    # The second series comes from validation_data, so it is labelled "val"
    # rather than "test".
    print(f"  {metric + ' (train)':<16} = ", train_values)
    print(f"  {metric + ' (val)':<16} = ", val_values)

    fig, ax = plt.subplots()
    ax.plot(train_values)
    ax.plot(val_values)
    ax.set_title('model ' + metric)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'validate'], loc=legend_loc)
    # An empty save_directory would turn the path into "/<name>" (filesystem
    # root); fall back to the current working directory in that case.
    fig.savefig((save_directory or ".") + "/" + file_stem)
    plt.show()


print(f"  {'N(Epochs)':<16} = ", epochs)

plot_training_curve('accuracy', 'lower right', "3a_Binary_Classification_model_accuracy")

# summarize history for loss
plot_training_curve('loss', 'upper right', "3a_Binary_Classification_model_loss")

In [None]:
# Evaluating Accuracy
# Scores every test image and draws the ROC curve of the classifier.

# Checking for imbalance in number of images per class in train and test data 
# class_amounts_train = train_df["class"].value_counts().to_dict()
# class_amounts_test = test_df["class"].value_counts().to_dict()
# print(class_amounts_train)
# print(class_amounts_test)

# Predict batch by batch so that images, labels and scores all come from the
# same generator read; this keeps them aligned even if test_ds shuffles.
image_batches, label_batches, prediction_batches = [], [], []
for batch in range(len(test_ds)):
    batch_images, batch_labels = test_ds[batch]
    image_batches.append(batch_images)
    label_batches.append(batch_labels)
    prediction_batches.append(model.predict_on_batch(batch_images))

# combining batches
all_images = np.concatenate(image_batches)
all_images_data = np.concatenate(label_batches)
predictions = np.concatenate(prediction_batches)

# Labels may be one-hot (2-D: column 1 marks class "1") or already a 1-D 0/1 vector.
true_labels = all_images_data[:, 1] if all_images_data.ndim > 1 else all_images_data
# Take the last output column as the class-"1" score: for the single sigmoid
# output that is the only column (indexing column 1 there would be out of bounds).
positive_scores = predictions.reshape(len(predictions), -1)[:, -1]

FPR, TPR, thresholds = roc_curve(true_labels, positive_scores)

fig, ax = plt.subplots()
ax.plot(FPR, TPR)
ax.plot([0,1], [0,1], linestyle = 'dashed') # chance-level diagonal
ax.set_title('Binary Classification Model ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# An empty save_directory would turn the path into "/3b.png" (filesystem
# root); fall back to the current working directory in that case.
fig.savefig((save_directory or ".") + "/3b.png")
plt.show()