In [None]:
import os

# Later cells build every data/results path from os.getcwd(), so make sure the
# notebook is running from the lesson directory.
LESSON_DIR = '/home/jovyan/MLMONDAYS/4_UnsupImageRecog'

# Compare by value (!=). An identity test (`is not`) against a string literal is
# true for any string returned by os.getcwd(), and newer Python versions emit a
# SyntaxWarning for it.
if os.getcwd() != LESSON_DIR:
    os.chdir(LESSON_DIR)
    print(os.getcwd())

In [None]:
# Fetch the dataset only when the class-definition JSON is not already present
# under the current working directory. The download itself is delegated to the
# download_data.py script, run through the IPython `!` shell escape (so this cell
# only works inside IPython/Jupyter, not as plain Python).
if not os.path.exists(os.getcwd()+'/data/tamucc/subset_12class/tamucc_subset_12classes.json'):
    !python download_data.py

In [None]:
# fraction of the tfrecord shards held out for validation
# start with a high validation split. If model poor on train data, can decrease
VALIDATION_SPLIT = 0.6

# image height/width in pixels
# start small - can increase later with larger hardware
TARGET_SIZE = 400

# the batch size is tied to the image size; fail here, with a clear message,
# rather than leaving BATCH_SIZE undefined and hitting a NameError in a later cell
if TARGET_SIZE == 400:
    BATCH_SIZE = 6
elif TARGET_SIZE == 224:
    BATCH_SIZE = 16
else:
    raise ValueError(
        'No BATCH_SIZE configured for TARGET_SIZE=%s (supported: 400, 224)' % TARGET_SIZE
    )

# number of label classes
num_classes = 12

# number of images assumed per tfrecord shard (used to estimate dataset size)
ims_per_shard = 200

# early-stopping patience, in epochs
patience = 10

# the number of embedding dims
num_embed_dim = 16

max_epochs = 100 #400 - this is more like the number you'll actually need (or more), but time in class is limited
lr = 1e-4

# number of neighbours for the first kNN classifier
n_neighbors = 3


In [None]:
from imports import *

In [None]:
###############################################################
### DATA FUNCTIONS
###############################################################
#-----------------------------------
def get_training_dataset():
    """
    "get_training_dataset"
    This function builds the batched dataset used for model training by
    passing the training shard list to get_batched_dataset
    INPUTS: None
    OPTIONAL INPUTS: None
    GLOBAL INPUTS: training_filenames
    OUTPUTS: the dataset object returned by get_batched_dataset
    """
    training_batches = get_batched_dataset(training_filenames)
    return training_batches

def get_validation_dataset():
    """
    "get_validation_dataset"
    This function builds the batched dataset over the validation shards by
    passing the validation shard list to get_batched_dataset
    INPUTS: None
    OPTIONAL INPUTS: None
    GLOBAL INPUTS: validation_filenames
    OUTPUTS: the dataset object returned by get_batched_dataset
    """
    validation_batches = get_batched_dataset(validation_filenames)
    return validation_batches

def get_validation_eval_dataset():
    """
    "get_validation_eval_dataset"
    This function builds the evaluation dataset over the validation shards by
    passing the validation shard list to get_eval_dataset
    (get_eval_dataset is not defined in this notebook section - presumably it
    comes from the imports module; TODO confirm its batching/shuffling)
    INPUTS: None
    OPTIONAL INPUTS: None
    GLOBAL INPUTS: validation_filenames
    OUTPUTS: the dataset object returned by get_eval_dataset
    """
    validation_eval = get_eval_dataset(validation_filenames)
    return validation_eval

#-----------------------------------
def get_batched_dataset(filenames):
    """
    "get_batched_dataset"
    This function defines the workflow for reading model inputs from tfrecord
    files: parallel interleaved reads, record parsing with read_tfrecord,
    caching, shuffling, batching and pre-fetching
    INPUTS:
        * filenames [list]: tfrecord files (or patterns) to read
    OPTIONAL INPUTS: None
    GLOBAL INPUTS: BATCH_SIZE, AUTO
    OUTPUTS: tf.data.Dataset object
    """
    # dataset options: the deterministic flag is explicitly switched on
    pipeline_options = tf.data.Options()
    pipeline_options.experimental_deterministic = True

    # stages, in order: list the files, read up to 16 of them interleaved,
    # parse each record, cache the parsed records (the original author notes the
    # dataset fits in RAM), shuffle with a 2048-element buffer, batch
    # (drop_remainder will be needed on TPU) and prefetch. No .repeat() is applied.
    return (
        tf.data.Dataset.list_files(filenames)
        .with_options(pipeline_options)
        .interleave(tf.data.TFRecordDataset, cycle_length=16, num_parallel_calls=AUTO)
        .map(read_tfrecord, num_parallel_calls=AUTO)
        .cache()
        .shuffle(2048)
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(AUTO)
    )

In [None]:
#-----------------------------------
def get_train_stuff(num_batches):
    """
    "get_train_stuff"
    This function takes num_batches batches from the training dataset and
    returns their images and labels as arrays, plus an index of the example
    positions belonging to each label
    INPUTS:
        * num_batches [int]: number of batches to take from the training dataset
    OPTIONAL INPUTS: None
    GLOBAL INPUTS: None (the dataset is obtained from get_training_dataset)
    OUTPUTS:
        * X_train [ndarray]: float32 images, stacked along the first axis
        * ytrain [ndarray]: labels, one per image
        * class_idx_to_train_idxs [defaultdict of list]: label -> indices into X_train
    RAISES:
        * ValueError if no batches could be read
    """
    image_batches = []
    label_batches = []
    for imgs, lbls in get_training_dataset().take(num_batches):
        # keep whole batches; they are joined once below instead of being
        # unpacked image-by-image in Python
        image_batches.append(imgs.numpy())
        label_batches.append(lbls.numpy())

    if not image_batches:
        raise ValueError(
            "get_train_stuff: no batches were read (check num_batches and the training files)"
        )

    # get X_train, y_train arrays
    X_train = np.concatenate(image_batches, axis=0).astype("float32")
    ytrain = np.squeeze(np.hstack(label_batches))

    # code repurposed from https://keras.io/examples/vision/metric_learning/
    class_idx_to_train_idxs = defaultdict(list)
    for y_train_idx, y in enumerate(ytrain):
        class_idx_to_train_idxs[y].append(y_train_idx)

    return X_train, ytrain, class_idx_to_train_idxs

#-----------------------------------
def get_test_stuff(num_batches):
    """
    "get_test_stuff"
    This function takes num_batches batches from the validation dataset and
    returns their images and labels as arrays, plus an index of the example
    positions belonging to each label
    INPUTS:
        * num_batches [int]: number of batches to take from the validation dataset
    OPTIONAL INPUTS: None
    GLOBAL INPUTS: None (the dataset is obtained from get_validation_dataset)
    OUTPUTS:
        * X_test [ndarray]: float32 images, stacked along the first axis
        * ytest [ndarray]: labels, one per image
        * class_idx_to_test_idxs [defaultdict of list]: label -> indices into X_test
    RAISES:
        * ValueError if no batches could be read
    """
    image_batches = []
    label_batches = []
    for imgs, lbls in get_validation_dataset().take(num_batches):
        # keep whole batches; they are joined once below instead of being
        # unpacked image-by-image in Python
        image_batches.append(imgs.numpy())
        label_batches.append(lbls.numpy())

    if not image_batches:
        raise ValueError(
            "get_test_stuff: no batches were read (check num_batches and the validation files)"
        )

    # get X_test, y_test arrays
    X_test = np.concatenate(image_batches, axis=0).astype("float32")
    ytest = np.squeeze(np.hstack(label_batches))

    # code repurposed from https://keras.io/examples/vision/metric_learning/
    class_idx_to_test_idxs = defaultdict(list)
    for y_test_idx, y in enumerate(ytest):
        class_idx_to_test_idxs[y].append(y_test_idx)

    return X_test, ytest, class_idx_to_test_idxs

In [None]:
class AnchorPositivePairs(tf.keras.utils.Sequence):
    """
    # code modified from https://keras.io/examples/vision/metric_learning/
    "AnchorPositivePairs"
    This Class selects an anchor and a positive example image from each label class
    INPUTS:
        * num_batchs [int]: number of batches the sequence reports via len()
    OPTIONAL INPUTS: None
    GLOBAL INPUTS: num_classes, TARGET_SIZE, class_idx_to_train_idxs, X_train
    OUTPUTS:
        * x [ndarray]: a pair of example images of each class, (2, num_classes, TARGET_SIZE, TARGET_SIZE, 3)
    RAISES:
        * ValueError if a class has fewer than two training examples
    """
    def __init__(self, num_batchs):
        # initialise the Keras base class (newer Keras versions expect this call)
        super().__init__()
        self.num_batchs = num_batchs

    def __len__(self):
        return self.num_batchs

    def __getitem__(self, _idx):
        # the batch index is ignored: every batch is a fresh random draw
        x = np.empty((2, num_classes, TARGET_SIZE, TARGET_SIZE, 3), dtype=np.float32)
        for class_idx in range(num_classes):
            examples_for_class = class_idx_to_train_idxs[class_idx]
            if len(examples_for_class) < 2:
                # a retry-until-different loop would never terminate for a
                # single-example class, so fail with an explicit message instead
                raise ValueError(
                    "class %d has %d training example(s); at least 2 are needed "
                    "to form an anchor/positive pair" % (class_idx, len(examples_for_class))
                )
            # sample two distinct examples in one draw
            anchor_idx, positive_idx = np.random.choice(
                examples_for_class, size=2, replace=False
            )
            x[0, class_idx] = X_train[anchor_idx]
            x[1, class_idx] = X_train[positive_idx]
        return x

In [None]:
## model inputs
# every path is the current working directory joined to a fixed relative path

# --- inputs: class definitions, tfrecord shards, sample imagery
json_file = os.sep.join([os.getcwd(), 'data/tamucc/subset_12class/tamucc_subset_12classes.json'])
data_path = os.sep.join([os.getcwd(), "data/tamucc/subset_12class/400"])
sample_data_path = os.sep.join([os.getcwd(), "data/tamucc/subset_12class/sample"])

# --- outputs for model 1: best weights and figures
filepath = os.sep.join([os.getcwd(), 'results/tamucc_subset_12class_best_weights_model1.h5'])
hist_fig = os.sep.join([os.getcwd(), 'results/tamucc_subset_12class_custom_model1.png'])
test_samples_fig = os.sep.join([os.getcwd(), 'results/tamucc_sample_12class_model1_est36samples.png'])
cm_filename = os.sep.join([os.getcwd(), 'results/tamucc_sample_12class_model1_cm_val.png'])
cm_fig = os.sep.join([os.getcwd(), 'results/tamucc_subset_12class_cm_test.png'])

In [None]:
# list the tfrecord shards in a stable (sorted) order so the split is reproducible
filenames = sorted(tf.io.gfile.glob(data_path+os.sep+'*.tfrec'))

# fail early with a clear message instead of a ZeroDivisionError further down
if not filenames:
    raise FileNotFoundError('No .tfrec files found in ' + data_path)

# estimated dataset size: assumes every shard holds ims_per_shard images
nb_images = ims_per_shard * len(filenames)
print(nb_images)

# the first `split` shards are used for validation, the rest for training
split = int(len(filenames) * VALIDATION_SPLIT)

training_filenames = filenames[split:]
validation_filenames = filenames[:split]

# whole batches per pass over each subset
# (nb_images // len(filenames) is exactly ims_per_shard, so use it directly)
validation_steps = (ims_per_shard * len(validation_filenames)) // BATCH_SIZE
steps_per_epoch = (ims_per_shard * len(training_filenames)) // BATCH_SIZE

print(steps_per_epoch)
print(validation_steps)

CLASSES = read_classes_from_json(json_file)

print(CLASSES)

In [None]:
# build the training pipeline first, then the validation pipeline
train_ds, val_ds = get_training_dataset(), get_validation_dataset()


### Model fine-tuning

Can we do better? At this point, it is common to try to ***fine-tune*** the model. This usually involves training for longer at a lower learning rate, in the hope that the optimizer will settle into a better minimum of the loss landscape.

We could also freeze some model layers at this point, so that the lower layers can no longer learn while the higher layers remain free to. However, in the spirit of experimentation (i.e. only varying one variable at a time), we'll leave the model as is, increase the `patience` to 20, lower the learning rate to `5e-5` (i.e. half of the previous learning rate of `1e-4`), and train again, starting from the best weights of the previous model training.

In [None]:
# Release the evaluation arrays left over from the previous section before the
# training arrays are loaded again. Popping from the namespace (instead of a bare
# `del`) keeps this cell runnable on a fresh kernel and when it is re-run, where
# those names do not exist.
for stale_name in ("ytest", "class_idx_to_test_idxs"):
    globals().pop(stale_name, None)

# NOTE(review): num_batches is not defined in this part of the notebook; it must
# already have been set by an earlier cell.
X_train, ytrain, class_idx_to_train_idxs = get_train_stuff(num_batches)

In [None]:
# paths for the fine-tuning run (model 2)
# every path is the current working directory joined to a fixed relative path

# --- inputs: class definitions, tfrecord shards, sample imagery
json_file = os.sep.join([os.getcwd(), 'data/tamucc/subset_12class/tamucc_subset_12classes.json'])
data_path = os.sep.join([os.getcwd(), "data/tamucc/subset_12class/400"])
sample_data_path = os.sep.join([os.getcwd(), "data/tamucc/subset_12class/sample"])

# --- starting point: copy of the best weights from model 1
initial_filepath = os.sep.join([os.getcwd(), 'results/weights_copy/tamucc_subset_12class_best_weights_model1.h5'])

# --- outputs for model 2: best weights and figures
filepath = os.sep.join([os.getcwd(), 'results/tamucc_subset_12class_best_weights_model2.h5'])
hist_fig = os.sep.join([os.getcwd(), 'results/tamucc_subset_12class_custom_model2.png'])
test_samples_fig = os.sep.join([os.getcwd(), 'results/tamucc_sample_12class_model2_est36samples.png'])
cm_filename = os.sep.join([os.getcwd(), 'results/tamucc_sample_12class_model2_cm_val.png'])
cm_fig = os.sep.join([os.getcwd(), 'results/tamucc_subset_12class_cm_test_model2.png'])


In [None]:
# fine-tuning settings: a smaller learning rate and a longer early-stopping
# patience than the values set in the configuration cell (these rebind the
# notebook-level `lr` and `patience`)
lr = 5e-5
patience = 20

model2 = get_large_embedding_model(TARGET_SIZE, num_classes, num_embed_dim)

model2.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    metrics=['accuracy'],
)

# write weights only, and only when the training loss improves
model_checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only = True,
    verbose=0,
)

# stop once the training loss has not improved for `patience` epochs
earlystop = EarlyStopping(monitor="loss", mode="min", patience=patience)

callbacks = [model_checkpoint, earlystop]

In [None]:
# Initialise model2 from the weights file at initial_filepath (the copy of the
# model1 best weights), so that fine-tuning continues from the previous training
# run instead of starting from freshly initialised weights.
model2.load_weights(initial_filepath)

In [None]:
# switch: set to True to fine-tune here, leave False to reuse saved weights
do_train = False #True

if not do_train:
    # skip training and restore the weights stored at `filepath`
    model2.load_weights(filepath)
else:
    history1 = model2.fit(
        AnchorPositivePairs(num_batchs=num_batches),
        epochs=max_epochs,
        callbacks=callbacks,
    )

    # training curves: loss in the top-left panel, accuracy in the top-right
    plt.figure(figsize = (10,10))
    for panel, metric, ylabel in (
        (221, "loss", 'Loss (soft cosine distance)'),
        (222, "accuracy", 'Accuracy'),
    ):
        plt.subplot(panel)
        plt.plot(history1.history[metric])
        plt.xlabel('Model training epoch number')
        plt.ylabel(ylabel)
    # the figure is left open for inline display; saving it to hist_fig is disabled:
    # plt.savefig(hist_fig, dpi=200, bbox_inches='tight')
    # plt.close('all')

In [None]:
# number of leading embedding dimensions used by the kNN cells below
num_dim_use = num_embed_dim #2

# fit three kNN models on the training embeddings, with n_neighbors, 5 and 7
# neighbours respectively (fitted in that order)
knn3, knn5, knn7 = [
    fit_knn_to_embeddings(model2, X_train, ytrain, k)
    for k in (n_neighbors, 5, 7)
]

# the training arrays are no longer needed once the kNN models are fitted
del X_train, ytrain

In [None]:
X_test, ytest, class_idx_to_test_idxs = get_test_stuff(num_batches)

# number of test images to embed; by default all of them (e.g. set to 300 for a quicker run)
touse = len(X_test)

# embed the test images and L2-normalise each embedding along its last axis
embeddings_test = tf.nn.l2_normalize(model2.predict(X_test[:touse]), axis=-1)

# the images themselves are not needed after embedding
del X_test

In [None]:
# predicted labels from each kNN model, using the first num_dim_use embedding dims
y_pred1, y_pred2, y_pred3 = [
    clf.predict(embeddings_test[:,:num_dim_use]) for clf in (knn3, knn5, knn7)
]

# per-class probabilities from each kNN model on the same inputs
y_prob1, y_prob2, y_prob3 = [
    clf.predict_proba(embeddings_test[:,:num_dim_use]) for clf in (knn3, knn5, knn7)
]

In [None]:
# score each kNN model on the test embeddings against the test labels
score1, score2, score3 = [
    clf.score(embeddings_test[:,:num_dim_use], ytest[:touse])
    for clf in (knn3, knn5, knn7)
]

print('3-NN score: %f' % score1)
print('5-NN score: %f' % score2)
print('7-NN score: %f' % score3)

In [None]:
# one column of predicted labels per kNN model
mask = np.c_[y_pred1, y_pred2, y_pred3]

# confidence of each model in its own prediction = its largest class probability.
# The 0.9 threshold has to be applied to these probabilities, not to the predicted
# labels: thresholding the labels themselves only tests "label >= 1", which
# discards exactly the samples that every model assigns to class 0.
confidence = np.c_[y_prob1.max(axis=1), y_prob2.max(axis=1), y_prob3.max(axis=1)]

# keep samples where at least one model is more than 90% confident
# (use np.all instead of np.any to require every model to be that confident)
use = np.any(confidence>.9, axis=1)
mask = mask[use,:]

# weighted average - you might decide to use this based on each model's average scores
# y_en = np.round(np.average(mask, axis=1, weights=[.1, .1, .5]))

# ensemble label: the median of three votes is always one of the three labels,
# and equals the majority label whenever two or more models agree
y_en = np.median(mask, axis=1)

In [None]:
# true labels and ensemble predictions for the retained samples, as integer
# class indices (the ensemble medians are floats holding whole numbers)
labs = np.asarray(ytest[:touse][use]).astype(int)
preds = np.asarray(y_en).astype(int)

# pass the full label set so the matrix always has one row/column per class,
# even when a class is missing from the retained samples; otherwise the matrix
# shrinks and the tick labels below no longer line up with its rows/columns
cm = confusion_matrix(labs, preds, labels=np.arange(len(CLASSES)))

# normalise each row (true class) to fractions; rows with no samples stay at
# zero instead of producing a divide-by-zero NaN
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm = cm.astype('float') / np.maximum(row_totals, 1)

# hide small fractions to de-clutter the plot
thres=0.1
cm[cm<thres] = 0

plt.figure(figsize=(15,15))
sns.heatmap(cm,
  annot=True,
  cmap = sns.cubehelix_palette(dark=0, light=1, as_cmap=True))

# centre the class names on the heatmap cells
tick_marks = np.arange(len(CLASSES))+.5
plt.xticks(tick_marks, [c.decode() for c in CLASSES], rotation=90,fontsize=12)
plt.yticks(tick_marks, [c.decode() for c in CLASSES],rotation=0, fontsize=12)

# rows are true classes, columns are predicted classes
plt.xlabel('Estimated label')
plt.ylabel('True label')