In [None]:
import os
import pickle
import json
import numpy as np
from keras.layers import TextVectorization
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense

# --- Filesystem locations ---------------------------------------------------
# Directory holding the trained encoder, the decoder weights and the pickled
# vectorizer.
models_path = "MSVD16_1500"
# Directory the generated-captions json file is written to.
result_path = "MSVD16_1500"
# Directory containing one saved feature array per test video.
test_feat_path = "MSVD_FEATURES/feats_MSVD16/Test_feat/"

# --- Model hyperparameters --------------------------------------------------
# Size of the one-hot decoder token vectors (also the decoder's Dense width).
num_dec_tokens = 1500
# Number of units in the decoder LSTM, hence the size of its h/c state vectors.
latent_dim = 512
# Number of frames each feature array is reshaped to before encoding.
num_of_frames = 16

def load_models():
    """Load the inference-time encoder, decoder and text vectorizer.

    The encoder is restored as a complete saved model. The decoder is rebuilt
    here as a single-step inference graph (token input plus explicit h/c state
    inputs) and only its weights are read from disk. The vectorizer is
    recreated from the config and weights stored in a pickle file.

    Both models print their summary as a side effect.

    Returns:
        A tuple ``(enc_model, dec_model, vectorizer)``.
    """
    # Encoder: stored as a full model, so it can be loaded directly.
    encoder = load_model(os.path.join(models_path, 'enc_model53.h5'))
    encoder.summary()

    # Decoder: layers are created in a fixed order (token input, dense head,
    # LSTM, state inputs) before being wired together.
    token_input = Input(shape=(None, num_dec_tokens))
    softmax_head = Dense(num_dec_tokens, activation='softmax')
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    h_in = Input(shape=(latent_dim,))
    c_in = Input(shape=(latent_dim,))
    lstm_seq, h_out, c_out = decoder_lstm(token_input, initial_state=[h_in, c_in])
    token_probs = softmax_head(lstm_seq)
    decoder = Model([token_input, h_in, c_in], [token_probs, h_out, c_out])
    decoder.load_weights(os.path.join(models_path, 'dec_model_weights53.h5'))
    decoder.summary()

    # Vectorizer: config and weights were pickled together.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point models_path at a trusted directory.
    pickle_name = f"vectorizer{num_dec_tokens}.pickle"
    with open(os.path.join(models_path, pickle_name), "rb") as handle:
        saved = pickle.load(handle)
    text_vectorizer = TextVectorization.from_config(saved["vec_config"])
    text_vectorizer.set_weights(saved["vec_weights"])

    return encoder, decoder, text_vectorizer

def greedy(feat):
    """Generate a caption for one video feature array with greedy decoding.

    The feature array is reshaped to ``(-1, num_of_frames, 4096)`` and passed
    through the encoder to obtain the initial decoder states. Starting from
    the ``'<bos>'`` token, the decoder is then run one step at a time, always
    taking the highest-scoring token, for at most 20 steps.

    Decoding stops early when the end-of-sentence token is predicted or when
    the predicted index has no entry in the vectorizer vocabulary. The
    end-of-sentence token is never included in the result, and no real word is
    dropped when the model fails to emit it within the step limit.

    Args:
        feat: Feature array for a single video; must be reshapable to
            ``(-1, num_of_frames, 4096)``.

    Returns:
        The generated caption as a space-separated string (possibly empty).

    Raises:
        ValueError: If ``'<bos>'`` is not in the vectorizer vocabulary.
    """
    # NOTE(review): assumes the end token is spelled '<eos>', mirroring the
    # '<bos>' start token -- confirm against the training notebook.
    eos_token = '<eos>'
    max_steps = 20

    # Fetch the vocabulary once; list position is the token index.
    vocabulary = vectorizer.get_vocabulary()

    states_h_c = enc_model.predict(feat.reshape(-1, num_of_frames, 4096))

    # One-hot encoding of the token fed to the decoder at the next step.
    next_word = np.zeros((1, 1, num_dec_tokens))
    next_word[0, 0, vocabulary.index('<bos>')] = 1

    caption = []  # words generated so far
    for _ in range(max_steps):
        out_words, h, c = dec_model.predict([next_word] + states_h_c)
        states_h_c = [h, c]
        word_ind = int(np.argmax(out_words.reshape(num_dec_tokens)))

        if word_ind >= len(vocabulary):
            # The decoder has num_dec_tokens outputs, which may exceed the
            # vocabulary size; such an index maps to no word.
            break
        if word_ind == 0:
            # Index 0 is skipped without feeding it back (presumably the
            # padding slot of the vectorizer -- confirm).
            continue

        word = vocabulary[word_ind]
        if word == eos_token:
            break

        caption.append(word)
        next_word = np.zeros((1, 1, num_dec_tokens))
        next_word[0, 0, word_ind] = 1

    return ' '.join(caption)

##### Loading models:
enc_model, dec_model, vectorizer = load_models()

# Load every test feature file together with its video id.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the order of the generated json reproducible between runs.
test_featID = []  # [feature array, video id] pairs
for filename in sorted(os.listdir(test_feat_path)):
    feat_test = np.load(os.path.join(test_feat_path, filename), allow_pickle=True)
    # The video id is the file name without its last 8 characters
    # (presumably a suffix such as ".avi.npy" -- confirm against the
    # feature-extraction step).
    test_featID.append([feat_test, filename[:-8]])

# Generating a caption for every test video using greedy search algorithm:
caps_list = [
    {'video_id': video_id, 'caption': greedy(feat)}
    for feat, video_id in test_featID
]

# Saving in json file:
caps_json = json.dumps(caps_list, indent=4)
with open(os.path.join(result_path, 'MSVD16_1500_test_generated_caps.json'), 'w') as caps_file:
    caps_file.write(caps_json)



Model: "model_63"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 enc_inputs (InputLayer)     [(None, 16, 4096)]        0         
                                                                 
 enc_lstm (LSTM)             [(None, 16, 512),         9439232   
                              (None, 512),                       
                              (None, 512)]                       
                                                                 
Total params: 9439232 (36.01 MB)
Trainable params: 9439232 (36.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None, 1500)]         0        



















































































































































































