In [None]:
import tensorflow as tf
import os
import collections
import time
import random
import re
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
import math

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from glob import glob
from PIL import Image
from tqdm import tqdm

#Custom Classes
from prep import Preparation
from gru_decoder import GRU_Decoder
from gru_encoder import GRU_Encoder
from cnn_encoder import CNN_Encoder

In [None]:
# Project helper from prep.py; its map_func_oracle method is handed to
# tf.numpy_function when the training dataset is built.
prep = Preparation()

In [None]:
# ResNet50 without its classification head (ImageNet weights), re-wrapped as a
# model that maps the network input to the output of its last layer.
image_model = tf.keras.applications.ResNet50(weights='imagenet', include_top=False)
image_input, hidden_layer = image_model.input, image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(inputs=image_input, outputs=hidden_layer)

In [None]:
# Read the best-caption file and pull out its three collections.
# They are sliced with the same indices later on, so they are presumably
# index-aligned (target i / distractor i / caption i) -- TODO confirm.
with open("../dataset/best_captions.json", "r") as captions_file:
    data = json.load(captions_file)

target_data, distractor_data, caption_data = (
    data[key] for key in ('target_paths', 'distractor_paths', 'best_captions')
)

In [None]:
# Surround every caption with the <start>/<end> marker tokens.
edited_caption_data = [f'<start> {caption} <end>' for caption in caption_data]

In [None]:
# Split into train, val, and test set: the first 80% of the entries, the next
# 10%, and the remainder, taken in file order (nothing is shuffled here).
val_slice_index = int(len(target_data)*0.8)
test_slice_index = int(len(target_data)*0.9)


def _three_way_split(items):
    """Cut `items` at the two slice indices into (train, val, test) pieces."""
    return (items[:val_slice_index],
            items[val_slice_index:test_slice_index],
            items[test_slice_index:])


targ_name_train, targ_name_val, targ_name_test = _three_way_split(target_data)

dis_name_train, dis_name_val, dis_name_test = _three_way_split(distractor_data)

captions_train, captions_val, captions_test = _three_way_split(edited_caption_data)

In [None]:
# Sanity check: print the caption of the first test example and show its
# target and distractor images side by side.
img_A = mpimg.imread(f"../dataset/prep_data/{targ_name_test[0]}.jpg")
img_B = mpimg.imread(f"../dataset/prep_data/{dis_name_test[0]}.jpg")
print(captions_test[0])

# display images
fig, ax = plt.subplots(1,2)
for panel, heading, picture in zip(ax, ("Target", "Distractor"), (img_A, img_B)):
    panel.title.set_text(heading)
    panel.imshow(picture)

In [None]:
#Tokenize the captions
# top_v is the vocabulary cap handed to the tokenizer as num_words.
top_v = 45

# The filter list is written as a raw string so that the backslash in "[\]"
# is a literal character instead of the invalid escape sequence "\]".
# The resulting string is unchanged.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_v,
                                                  oov_token="<unk>",
                                                  filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
# The vocabulary is fitted on the training captions only.
tokenizer.fit_on_texts(captions_train)
# Register index 0 as the padding token in both lookup directions.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Create the tokenized vectors
train_seqs = tokenizer.texts_to_sequences(captions_train)

# Right-pad ('post') every sequence to a common length.
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

In [None]:
# Training hyperparameters.
BATCH_SIZE = 32
# Size of the shuffle buffer used when the tf.data pipelines are built.
BUFFER_SIZE = 1000
embedding_dim = 512
units = 512
# One more than the num_words cap given to the tokenizer.
vocab_size = top_v + 1
# Floor division: a trailing partial batch is not counted in num_steps.
num_steps = len(targ_name_train) // BATCH_SIZE 

In [None]:
# Training pipeline over (target name, distractor name, padded caption) triples.
dataset = tf.data.Dataset.from_tensor_slices((targ_name_train, dis_name_train, cap_vector))

# Use map to load the numpy files in parallel.
# num_parallel_calls is what actually makes the map concurrent; without it
# the elements are processed one at a time. Element order is still preserved.
# NOTE(review): assumes prep.map_func_oracle is safe to call concurrently
# (i.e. it only reads files) -- confirm against prep.py.
dataset = dataset.map(
    lambda item1, item2, item3: tf.numpy_function(
        prep.map_func_oracle, [item1, item2, item3], [tf.float32, tf.float32, tf.int32]),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

In [None]:
# Caption-side and image-side encoders (project classes imported from
# gru_encoder.py and cnn_encoder.py).
gru_encoder = GRU_Encoder(embedding_dim, units, vocab_size)
encoder = CNN_Encoder(embedding_dim)

# Adam with default settings. The loss uses its defaults as well; train_step
# applies a softmax before calling it.
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.CategoricalCrossentropy()

# The training loop appends one mean-loss value per epoch to this list.
loss_plot = []

In [None]:
# Checkpointing: track both encoders and the optimizer, and keep at most the
# 30 most recent checkpoints under checkpoint_path.
# Nothing is restored here; this only prepares the manager used for saving.
checkpoint_path = "./checkpointslistener/train"
ckpt = tf.train.Checkpoint(encoder=encoder, gru_encoder=gru_encoder, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=30,
)

In [None]:
# @tf.function
def train_step(m, targ, dist):
    """Run one optimisation step on a single batch and return its loss.

    The caption encoding is compared with the features of both images; the two
    similarity scores are turned into a 2-way softmax and trained with
    `loss_object` to point at the slot holding the target image. Which slot
    (left or right) the target occupies is drawn once per call with
    `random.random()`, so it is the same for every example in the batch.

    Args:
        m: caption batch fed to `gru_encoder` (the training loop passes the
            one-hot encoded token ids).
        targ: batch of target-image inputs for `encoder`.
        dist: batch of distractor-image inputs for `encoder`.

    Returns:
        The loss value produced by `loss_object` for this batch.
    """
    batch_size = targ.shape[0]

    def _as_column(scores):
        # One score per example -> a (batch, 1) column, ready for concat.
        return tf.reshape(scores, (scores.shape[0], 1))

    # A fresh hidden state per batch: captions of different images are unrelated.
    hidden = gru_encoder.reset_state(batch_size=batch_size)

    with tf.GradientTape() as tape:
        features_t = encoder(targ)
        features_d = encoder(dist)

        v = gru_encoder(m, hidden)

        # Slot assignment and label construction are kept off the tape.
        with tape.stop_recording():
            target_on_left = random.random() > 0.5
            if target_on_left:
                left, right, label_row = features_t, features_d, [1,0]
            else:
                left, right, label_row = features_d, features_t, [0,1]
            y_t = tf.convert_to_tensor([label_row]*batch_size, dtype=tf.float32)

        # Score of each slot: norm of the normalized dot product between the
        # image features and the caption encoding.
        left_score = tf.norm(
            tf.keras.layers.dot([left, v], axes=2, normalize=True), axis=(1,2))
        right_score = tf.norm(
            tf.keras.layers.dot([right, v], axes=2, normalize=True), axis=(1,2))

        z = tf.concat([_as_column(left_score), _as_column(right_score)], axis=1)
        y_p = tf.nn.softmax(z)
        loss = loss_object(y_t, y_p)

    # Both encoders are updated together.
    trainable_variables = encoder.trainable_variables + gru_encoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss

In [None]:
EPOCHS = 20

for epoch in range(0, EPOCHS):
    start = time.time()
    total_loss = 0
    # Count the batches actually consumed: the dataset keeps its final partial
    # batch, so len(...) // BATCH_SIZE can undercount (or even be zero).
    batches_seen = 0

    for (batch, (targ_tensor, dis_tensor, target)) in enumerate(dataset):
        # Caption token ids -> one-hot vectors expected by train_step.
        target = tf.one_hot(target, vocab_size)
        loss = train_step(target, targ_tensor, dis_tensor)
        total_loss += loss
        batches_seen += 1

        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, loss.numpy()))

    # Mean loss over the batches of this epoch (guarded against an empty dataset).
    epoch_loss = total_loss / max(batches_seen, 1)

    # storing the epoch end loss value to plot later
    loss_plot.append(epoch_loss)

    # Save every fifth epoch, and always after the last one so that the fully
    # trained weights are not lost.
    if epoch % 5 == 0 or epoch == EPOCHS - 1:
        ckpt_manager.save()

    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, epoch_loss))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
# Per-epoch training loss curve.
# NOTE(review): the title says "Speaker" although the checkpoints of this
# notebook are written to a "checkpointslistener" directory -- confirm the label.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(loss_plot)
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Loss Plot Speaker')
plt.show()

In [None]:
# Load the complete caption file, discard its 'lstm_labels' entry and keep the
# remaining keys as the list of image identifiers.
with open("../dataset/captions.json", 'r') as all_captions_file:
    data_all = json.load(all_captions_file)
data_all.pop('lstm_labels')
img_paths = [key for key in data_all]

In [None]:
def map_func_test(img_name, img_name_2):
    """Load the cached arrays of an image pair and return the target id.

    Meant to be called through tf.numpy_function, which passes the names as
    bytes.

    Args:
        img_name: target image name as UTF-8 bytes; must be parseable as an
            integer.
        img_name_2: distractor image name as UTF-8 bytes.

    Returns:
        (target array, distractor array, target id). The id is returned as
        np.int64 so that it matches the tf.int64 output type declared by the
        callers regardless of the platform's default integer width.

    Raises:
        ValueError: if `img_name` is not an integer string.
    """
    target_id = img_name.decode('utf-8')
    distractor_id = img_name_2.decode('utf-8')

    img_tensor = np.load('../dataset/prep_data/' + target_id + '.jpg.npy')
    img_tensor_2 = np.load('../dataset/prep_data/' + distractor_id + '.jpg.npy')
    uno = np.int64(int(target_id))

    return img_tensor, img_tensor_2, uno

def map_func_disc(img_name, img_name_2, cap):
    """Load the cached arrays of an image pair and pass the caption through.

    Args:
        img_name: target image name as UTF-8 bytes.
        img_name_2: distractor image name as UTF-8 bytes.
        cap: caption value, returned unchanged.

    Returns:
        (target array, distractor array, cap).
    """
    loaded = [
        np.load(f"../dataset/prep_data/{raw_name.decode('utf-8')}.jpg.npy")
        for raw_name in (img_name, img_name_2)
    ]
    return loaded[0], loaded[1], cap

In [None]:
def _load_test_example(item1, item2):
    """tf.data adapter around map_func_test: two image arrays plus the target id."""
    return tf.numpy_function(
        map_func_test, [item1, item2], [tf.float32, tf.float32, tf.int64])


# --- Test pairs from the best-caption file (last 10% of the entries) ---
with open("../dataset/best_captions.json", "r") as best_file:
    data = json.load(best_file)

target_data, distractor_data = data['target_paths'], data['distractor_paths']

val_slice_index = int(len(target_data)*0.8)
test_slice_index = int(len(target_data)*0.9)

targ_test = target_data[test_slice_index:]
dis_test = distractor_data[test_slice_index:]

testset = (
    tf.data.Dataset.from_tensor_slices((targ_test, dis_test))
    .map(_load_test_example)
    .shuffle(BUFFER_SIZE)
    .batch(1)
)

# --- Same construction for the easy-caption file ---
# Note that the two slice indices are recomputed from the easy data here.
with open("../dataset/easy_captions.json", "r") as easy_file:
    easy_data = json.load(easy_file)

target_easy_data, distractor_easy_data = easy_data['target_paths'], easy_data['distractor_paths']

val_slice_index = int(len(target_easy_data)*0.8)
test_slice_index = int(len(target_easy_data)*0.9)

targ_easy_test = target_easy_data[test_slice_index:]
dis_easy_test = distractor_easy_data[test_slice_index:]

testset_easy = (
    tf.data.Dataset.from_tensor_slices((targ_easy_test, dis_easy_test))
    .map(_load_test_example)
    .shuffle(BUFFER_SIZE)
    .batch(1)
)

In [None]:
def check_understanding_random(data):
    """Measure how often the model prefers the target image over the distractor.

    For every batch a caption is drawn at random from `data_all` for the
    target image, encoded, and scored against both images. An example counts
    as right when the target's score is strictly greater than the
    distractor's; ties count as wrong.

    Args:
        data: iterable of (target tensor, distractor tensor, target id)
            batches. Only the first id of each batch is used for the caption
            lookup, so batches are expected to hold one example.

    Returns:
        (accuracy, number right, number wrong). For empty `data` the accuracy
        is NaN and both counts are 0.
    """
    total_right = 0
    total_wrong = 0

    for targ_tensor, dis_tensor, targ_name in data:
        hidden_l = gru_encoder.reset_state(batch_size=targ_tensor.shape[0])

        #Find a random caption associated with the target
        targ_name = targ_name.numpy()[0]
        captions = data_all[str(targ_name)]
        caption = f"<start> {random.choice(captions)} <end>"
        m = tf.one_hot(tokenizer.texts_to_sequences([caption]), vocab_size)

        features_t = encoder(targ_tensor)
        features_d = encoder(dis_tensor)

        v = gru_encoder(m, hidden_l)

        # Same similarity score as in training: norm of the normalized dot product.
        x = tf.norm(tf.keras.layers.dot([features_t, v],axes=2,normalize=True),axis=(1,2))
        y = tf.norm(tf.keras.layers.dot([features_d, v],axes=2,normalize=True),axis=(1,2))

        hits = tf.math.greater(x, y).numpy()
        n_right = int(np.count_nonzero(hits))
        total_right += n_right
        total_wrong += hits.size - n_right

    total = total_right + total_wrong

    # Nothing was evaluated: report an undefined accuracy instead of dividing by zero.
    if total == 0:
        return float('nan'), total_right, total_wrong

    acc = total_right / total

    return acc, total_right, total_wrong

In [None]:
def check_understanding_disc(data):
    """Measure how often the model prefers the target image given its caption.

    Each batch supplies its own caption token ids, which are one-hot encoded,
    run through `gru_encoder` and scored against both images. An example
    counts as right when the target's score is strictly greater than the
    distractor's; ties count as wrong.

    Args:
        data: iterable of (target tensor, distractor tensor, caption ids)
            batches.

    Returns:
        (accuracy, number right, number wrong). For empty `data` the accuracy
        is NaN and both counts are 0.
    """
    total_right = 0
    total_wrong = 0

    for targ_tensor, dis_tensor, cap in data:
        hidden_l = gru_encoder.reset_state(batch_size=targ_tensor.shape[0])

        m = tf.one_hot(cap, vocab_size)
        features_t = encoder(targ_tensor)
        features_d = encoder(dis_tensor)

        v = gru_encoder(m, hidden_l)

        # Same similarity score as in training: norm of the normalized dot product.
        x = tf.norm(tf.keras.layers.dot([features_t, v],axes=2,normalize=True),axis=(1,2))
        y = tf.norm(tf.keras.layers.dot([features_d, v],axes=2,normalize=True),axis=(1,2))

        hits = tf.math.greater(x, y).numpy()
        n_right = int(np.count_nonzero(hits))
        total_right += n_right
        total_wrong += hits.size - n_right

    total = total_right + total_wrong

    # Nothing was evaluated: report an undefined accuracy instead of dividing by zero.
    if total == 0:
        return float('nan'), total_right, total_wrong

    acc = total_right / total

    return acc, total_right, total_wrong

In [None]:
# Accuracy, number right and number wrong on the test pairs taken from
# best_captions.json, with a randomly drawn caption per target.
acc, t_r, t_w = check_understanding_random(testset)
print(acc, t_r, t_w) 

In [None]:
# Same evaluation on the test pairs taken from easy_captions.json.
acc, t_r, t_w = check_understanding_random(testset_easy)
print(acc, t_r, t_w) 

In [None]:
## BEST CAPT ##
# Rebuild `testset` so that every test pair also carries its tokenized best
# caption (the element layout that map_func_disc returns).

with open("../dataset/best_captions.json", "r") as best_file:
    data = json.load(best_file)

target_data = data['target_paths']
distractor_data = data['distractor_paths']
captions_data = data['best_captions']
edited_captions_data = [f'<start> {c} <end>' for c in captions_data]

# Last 10% of the entries form the test split.
test_slice_index = int(len(target_data)*0.9)
test_tail = slice(test_slice_index, None)

targ_test = target_data[test_tail]
dis_test = distractor_data[test_tail]
cap_test = edited_captions_data[test_tail]

# NOTE(review): despite its name, train_seqs_2 holds the *test* captions, and
# cap_vector is rebound here, replacing the padded training captions.
train_seqs_2 = tokenizer.texts_to_sequences(cap_test)
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs_2, padding='post')


def _load_disc_example(item1, item2, item3):
    """tf.data adapter around map_func_disc: two image arrays plus the caption ids."""
    return tf.numpy_function(
        map_func_disc, [item1, item2, item3], [tf.float32, tf.float32, tf.int32])


testset = (
    tf.data.Dataset.from_tensor_slices((targ_test, dis_test, cap_vector))
    .map(_load_disc_example)
    .shuffle(BUFFER_SIZE)
    .batch(1)
)