In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle

In [2]:
BATCH_SIZE = 256
EPOCHS = 12
LEARNING_RATE = 0.00003
MARGIN = 10
CAPTION_INPUT_SIZE = 300
FRAME_INPUT_SIZE = 500
CAPTION_LATENT_SIZE = 256
FRAME_LATENT_SIZE = 300

In [3]:
# Creating placeholders
caption_placeholder = tf.placeholder(tf.float32, shape = [None, None, CAPTION_INPUT_SIZE])
frame_1_placeholder = tf.placeholder(tf.float32, shape = [None, None, FRAME_INPUT_SIZE])
frame_2_placeholder = tf.placeholder(tf.float32, shape = [None, None, FRAME_INPUT_SIZE])

# Setting GPU config
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.8)
config = tf.ConfigProto(allow_soft_placement = True, gpu_options = gpu_options)
config.gpu_options.allow_growth = True

In [4]:
# Defining the Neural Network Graph for modified Siamese Network
def train_caption_embeddings(x_placeholder, latent_dim):
    cell = tf.nn.rnn_cell.GRUCell(latent_dim, kernel_initializer = tf.contrib.layers.variance_scaling_initializer(), name = 'caption_cells')
    cells = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = 0.5)
    x, s = tf.nn.dynamic_rnn(cells, x_placeholder, dtype = tf.float32, swap_memory = True)
    x = tf.contrib.layers.batch_norm(x, is_training = True, updates_collections = None)
    x = tf.nn.dropout(x, rate = 0.5)
    print(x.shape)
    x = tf.reshape(x, shape = [-1, 50 * latent_dim])
    print(x.shape)
    x = tf.nn.dropout(x, rate = 0.5)
    x = tf.layers.dense(x, latent_dim, kernel_initializer = tf.contrib.layers.variance_scaling_initializer())
    out = tf.nn.relu(x)
    print(out.shape)
    return out

def train_frame_embeddings(x_placeholder, latent_dim, reuse):
    with tf.compat.v1.variable_scope('var', reuse = reuse):
        cell = tf.nn.rnn_cell.GRUCell(latent_dim, kernel_initializer = tf.contrib.layers.variance_scaling_initializer(), name = 'frame_cells')
        cells = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob = 0.5)
        x, s = tf.nn.dynamic_rnn(cells, x_placeholder, dtype = tf.float32, swap_memory = True)
        x = tf.contrib.layers.batch_norm(x, is_training = True, updates_collections = None)
        x = tf.nn.dropout(x, rate = 0.5)
        print(x.shape)
        x = tf.reshape(x, shape = [-1, 50 * latent_dim])
        print(x.shape)
        x = tf.nn.dropout(x, rate = 0.5)
        x = tf.layers.dense(x, latent_dim, kernel_initializer = tf.contrib.layers.variance_scaling_initializer())
        out = tf.nn.relu(x)
        print(out.shape)
        return out

In [5]:
caption_out = train_caption_embeddings(caption_placeholder, CAPTION_LATENT_SIZE) #Anchor
frame_out_1_full = train_frame_embeddings(frame_1_placeholder, FRAME_LATENT_SIZE, reuse = None) #Positive
frame_out_2_full = train_frame_embeddings(frame_2_placeholder, FRAME_LATENT_SIZE, reuse = True) #Negative

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
(?, ?, 256)
(?, 12800)
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for upda

In [6]:
# Taking the first CAPTION_LATENT_SIZE from frame latent vector for positive and negatives
frame_out_1 = frame_out_1_full[:, :CAPTION_LATENT_SIZE]
frame_out_2 = frame_out_2_full[:, :CAPTION_LATENT_SIZE]

In [7]:
# Triplet loss
positive_distance = tf.reduce_sum(tf.square(caption_out - frame_out_1), 1)
negative_distance = tf.reduce_sum(tf.square(caption_out - frame_out_2), 1)

loss = tf.reduce_mean(tf.maximum(0., positive_distance - negative_distance + MARGIN))
optimizer = tf.train.AdamOptimizer(learning_rate = LEARNING_RATE)
train = optimizer.minimize(loss)

# init = tf.global_variables_initializer()
# sess = tf.Session()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
# Loading Triples
with open(r'/home/login/Paired/train_triples_list.pickle', 'rb') as f:
    train_triples_list = pickle.load(f)

In [9]:
with open(r'/home/login/Paired/val_triples_list.pickle', 'rb') as f:
    val_triples_list = pickle.load(f)

In [10]:
train_anchor = np.stack([each[0] for each in train_triples_list], axis = 0)
train_positive = np.stack([each[1] for each in train_triples_list], axis = 0)
train_negative = np.stack([each[2] for each in train_triples_list], axis = 0)

val_anchor = np.stack([each[0] for each in val_triples_list], axis = 0)
val_positive = np.stack([each[1] for each in val_triples_list], axis = 0)
val_negative = np.stack([each[2] for each in val_triples_list], axis = 0)

In [11]:
train_anchor.shape

(461750, 50, 300)

In [12]:
train_positive.shape

(461750, 50, 500)

In [13]:
train_negative.shape

(461750, 50, 500)

In [14]:
len(train_positive)

461750

In [15]:
val_anchor.shape

(192750, 50, 300)

In [16]:
val_positive.shape

(192750, 50, 500)

In [17]:
val_negative.shape

(192750, 50, 500)

In [18]:
init = tf.global_variables_initializer()
sess = tf.Session()

In [19]:
# Training the model
sess.run(init)
for i in range(EPOCHS):
    for idx in range(0, train_anchor.shape[0], BATCH_SIZE):
        anchor_batch, positive_batch, negative_batch = train_anchor[idx : idx + BATCH_SIZE], train_positive[idx : idx + BATCH_SIZE], train_negative[idx : idx + BATCH_SIZE]
        sess.run(train, feed_dict = {caption_placeholder : anchor_batch, \
                                     frame_1_placeholder : positive_batch, \
                                     frame_2_placeholder : negative_batch})
    
    if i % 1 == 0:
        print("HELLLOOOO")
        tot_loss, val_tot_loss = 0, 0
        for idx in range(0, train_anchor.shape[0], BATCH_SIZE):
            anchor_batch, positive_batch, negative_batch = train_anchor[idx : idx + BATCH_SIZE], train_positive[idx : idx + BATCH_SIZE], train_negative[idx : idx + BATCH_SIZE]
            loss_ = sess.run(loss, feed_dict = {caption_placeholder : anchor_batch, \
                                            frame_1_placeholder : positive_batch, \
                                            frame_2_placeholder : negative_batch})
            tot_loss += loss_
            
        for idx in range(0, val_anchor.shape[0], BATCH_SIZE):
            val_anchor_batch, val_positive_batch, val_negative_batch = val_anchor[idx : idx + BATCH_SIZE], val_positive[idx : idx + BATCH_SIZE], val_negative[idx : idx + BATCH_SIZE]
            val_loss_ = sess.run(loss, feed_dict = {caption_placeholder : val_anchor_batch, \
                                            frame_1_placeholder : val_positive_batch, \
                                            frame_2_placeholder : val_negative_batch})
            val_tot_loss += val_loss_
        print("After epoch {} train loss is {:.4f} valid loss is {:.4f}".format(i, tot_loss, val_tot_loss))

HELLLOOOO
After epoch 0 train loss is 62994.8881 valid loss is 35435.9102
HELLLOOOO
After epoch 1 train loss is 28087.8364 valid loss is 28221.2196
HELLLOOOO
After epoch 2 train loss is 11335.5035 valid loss is 23442.9009
HELLLOOOO
After epoch 3 train loss is 5266.5304 valid loss is 21918.4990
HELLLOOOO
After epoch 4 train loss is 2560.8567 valid loss is 20187.3888
HELLLOOOO
After epoch 5 train loss is 1259.7281 valid loss is 19149.5896
HELLLOOOO
After epoch 6 train loss is 940.5610 valid loss is 20363.5976
HELLLOOOO
After epoch 7 train loss is 570.1578 valid loss is 19685.7288
HELLLOOOO
After epoch 8 train loss is 391.7504 valid loss is 21820.4198
HELLLOOOO
After epoch 9 train loss is 410.1410 valid loss is 21913.0977
HELLLOOOO
After epoch 10 train loss is 282.3038 valid loss is 23204.2985
HELLLOOOO
After epoch 11 train loss is 155.1605 valid loss is 24899.2778


In [20]:
train_anchor.shape

(461750, 50, 300)

In [21]:
saver = tf.train.Saver()
saver.save(sess, 'model_triplets_variable')

'model_triplets_variable'

In [22]:
with open(r'/home/login/Paired/val_pair.pickle', 'rb') as f:
    test_pairs_list = pickle.load(f)

In [80]:
# This is the pad pairs function defined in make_triples
def pad_pairs(pair_list):
    # max_frame_step = max([each[1].shape[0] for each in pair_list])
    max_frame_step = 50
    # print(max_frame_step)
    for idx, each in enumerate(pair_list):
        pair_list[idx][1] = np.vstack([each[1], np.zeros((max_frame_step - each[1].shape[0], 500))])

    # print([each[1].shape for each in pair_list])
    # print([t.shape for t in temp])

    # max_caption_step = max([each[2].shape[0] for each in pair_list])
    max_caption_step = 50
    # print(max_caption_step)

    for idx, each in enumerate(pair_list):
        if each[2].shape[0] < 50:
            pair_list[idx][2] = np.vstack([each[2], np.zeros((max_caption_step - each[2].shape[0], 300))])
        else:
            pair_list[idx][2] = each[2][:50, :]

    # print([each[2].shape for each in pair_list])
    return pair_list

test_pairs_list = pad_pairs(test_pairs_list)

In [81]:
test_frames = np.stack([each[1] for each in test_pairs_list], axis = 0)
test_captions = np.stack([each[2] for each in test_pairs_list], axis = 0)

In [82]:
test_captions.shape

(771, 50, 300)

In [83]:
test_size = test_frames.shape[0]

In [84]:
frame_latent_vector = sess.run(frame_out_1, feed_dict = {frame_1_placeholder : test_frames})

In [85]:
caption_latent_vector = sess.run(caption_out, feed_dict = {caption_placeholder : test_captions})

In [86]:
caption_latent_vector.shape

(771, 256)

In [87]:
# Gives the rank of the corresponding clip for the given caption
def get_rank(inp, idx):
    out = [0] * len(inp)
    for i, x in enumerate(sorted(range(len(inp)), key=lambda y: inp[y])):
        out[x] = i
    return out[idx]


In [88]:
# Percentile metric
percentile_list = []
for i, caption_ in enumerate(caption_latent_vector):
    euclidean_norm = np.linalg.norm(frame_latent_vector - caption_, axis=1).tolist() # finding the closest clip to the caption
    percentile = ((test_size - get_rank(euclidean_norm, i))/test_size) * 100 # finding the percentile of the closest
    percentile_list.append(percentile)

In [89]:
# Taking the top 20
top20 = np.mean([1 if each > 80 else 0 for each in percentile_list])

In [90]:
print('The percentage of caption with atleast 20 percentile is {:.4f}'.format(top20))

The percentage of caption with atleast 20 percentile is 0.8416


In [91]:
# Taking the top 10
top10 = np.mean([1 if each > 90 else 0 for each in percentile_list])

In [92]:
print('The percentage of caption with atleast 10 percentile is is {:.4f}'.format(top10))

The percentage of caption with atleast 10 percentile is is 0.5123
