In [1]:
from tensorflow.keras.layers import Input, Softmax, Dense, Dropout
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import json
import re

import nltk
from nltk import word_tokenize

In [2]:
# Training split locations: clip-id list, per-clip feature arrays, caption labels.
idfilename, datadirname, labelfilename = (
    'training_data/id.txt',
    'training_data/feat/',
    'training_label.json',
)

# Vocabulary tables stored as JSON: token -> index ...
with open("DIC_word_index.json") as f:
    DIC_word_index = json.loads(f.read())

# ... and index -> token (looked up with string keys elsewhere in the notebook).
with open("DIC_index_word.json") as f:
    DIC_index_word = json.loads(f.read())

In [3]:
# Load the training features: one .npy array per video name listed in
# `idfilename`. `video_id` maps each video name to its row in `encode_x`.
encode_x = []
video_id = {}
with open(idfilename) as id_file:  # context manager closes the handle
    for line in id_file:
        # strip() rather than [:-1]: the last line may have no trailing
        # newline (slicing would chop a real character) and CRLF files
        # would otherwise leave a '\r' in the name.
        video_name = line.strip()
        if not video_name:
            continue  # ignore blank lines in the id list
        video_id[video_name] = len(encode_x)
        encode_x.append(np.load(datadirname + video_name + ".npy"))

encode_x = np.array(encode_x)

TRAIN_SZ = len(encode_x)
# One independent list per sample ([[]] * n would alias a single list object
# across every slot).
decode_x = [[] for _ in range(TRAIN_SZ)]
decode_y = [[] for _ in range(TRAIN_SZ)]

In [4]:
def Sent2Seq(sent):
    """Convert a caption string into a list of vocabulary indices.

    The sentence is lower-cased, split with ``word_tokenize``, and every
    token is looked up in ``DIC_word_index``. A token that is missing from
    the dictionary raises ``KeyError``.
    """
    return [DIC_word_index[token] for token in word_tokenize(sent.lower())]

BOS = "<bos>" # index is 1
EOS = "<eos>" # index is 2
# Vocabulary indices of the two markers, as stated in the comments above.
# decode_sequence looks "<bos>" up through DIC_word_index, so the dictionary
# must agree with these values.
BOS_INDEX = 1
EOS_INDEX = 2

VOCAB_SZ = len(DIC_word_index)
MAX_SEQ_LEN = 0

# Load the caption labels (context manager so the file handle is closed).
with open(labelfilename, 'r') as label_file:
    rawlabels = json.load(label_file)

# Build the decoder input/target pair for every labelled video:
#   decode_x = <bos> + tokens   (decoder input)
#   decode_y = tokens + <eos>   (decoder target, shifted by one step)
for data in rawlabels:
    index = video_id[data['id']]
    sent = data['caption'][0] # select one sentence for now
    seq = Sent2Seq(sent)      # tokenize once and reuse for both sequences
    decode_x[index] = [BOS_INDEX] + seq
    decode_y[index] = seq + [EOS_INDEX]
    # Both sequences have the same length, so tracking decode_x is enough.
    MAX_SEQ_LEN = max(MAX_SEQ_LEN, len(decode_x[index]))


In [5]:
# First training pair: the target (second line) is the decoder input (first
# line) shifted by one step.
print(decode_x[0], decode_y[0], sep="\n")

[1, 4, 5, 6, 7, 4, 8, 9]
[4, 5, 6, 7, 4, 8, 9, 2]


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Right-pad every index sequence to MAX_SEQ_LEN, then expand each index into
# a one-hot vector of width VOCAB_SZ.
pad_options = dict(maxlen=MAX_SEQ_LEN, padding='post', truncating='pre')
decode_x, decode_y = (
    to_categorical(pad_sequences(sequences, **pad_options), num_classes=VOCAB_SZ)
    for sequences in (decode_x, decode_y)
)


In [7]:
# Shapes of the encoder input, decoder input and decoder target arrays.
print(encode_x.shape, decode_x.shape, decode_y.shape, sep="\n")

(1450, 80, 4096)
(1450, 45, 6087)
(1450, 45, 6087)


In [9]:
# Size of the LSTM hidden/cell state shared by the encoder and the decoder.
latent_dim = 256
#=================encoder====================#
# Per-sample input shape (80, 4096) matches encode_x.shape[1:] printed above
# (presumably 80 frames x 4096 features per clip -- confirm against the data).
encoder_inputs = Input(shape=(80, 4096))
encoder = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
# encoder_outputs is not used below; only the final hidden and cell states
# are passed on to the decoder.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]
#=================decoder====================#
# Variable-length sequence of one-hot token vectors of width VOCAB_SZ.
decoder_inputs = Input(shape=(None,VOCAB_SZ))
decoder_lstm = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states; its own returned states
# are discarded here and only used by the inference model built later.
decoder_outputs,_ , _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Softmax over the vocabulary, applied to the decoder output of every step.
decoder_dense = Dense(VOCAB_SZ, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
#=============================================
# Training model: (video features, decoder input tokens) -> per-step token
# probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [10]:
# Print the layer-by-layer summary of the training model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 80, 4096)     0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, None, 6087)   0                                            
__________________________________________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)        [(None, 80, 256), (N 4458496     input_3[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)        [(None, None, 256),  6497280     input_4[0][0]                    
                                                                 cu_dnnlstm_3[0][1]               
          

In [None]:
# Compile with Adam + categorical cross-entropy, then train on the
# (video features, decoder input) -> decoder target pairs, holding out the
# last 10% of the samples for validation.
optimizer = Adam(lr=0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy"],
)
model.fit(
    x=[encode_x, decode_x],
    y=decode_y,
    batch_size=1,
    epochs=50,
    validation_split=0.1,
)

Train on 1305 samples, validate on 145 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50

In [None]:
#==============inference setup===================#
# Encoder-only model: video features -> [state_h, state_c] of the encoder.
encoder_model = Model(encoder_inputs, encoder_states)

# Placeholders for the decoder state, fed explicitly at every decoding step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Re-apply the same decoder_lstm layer object used in the training model, now
# starting from the fed-in states. NOTE: this rebinds decoder_outputs,
# state_h and state_c, which were previously bound by the model-building cell.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
# Same decoder_dense layer object as in the training model.
decoder_outputs = decoder_dense(decoder_outputs)
# Single-step decoder: (token input, h, c) -> (token probabilities, new h, new c).
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a caption for one encoded input.

    ``input_seq`` is passed to ``encoder_model.predict`` to obtain the initial
    decoder states. Starting from the ``<bos>`` token, the most probable next
    token is picked at every step and fed back into ``decoder_model`` together
    with the updated states.

    Decoding stops when ``<eos>`` is produced (it is not included in the
    result) or once the result holds more than ``MAX_SEQ_LEN`` tokens.

    Returns the decoded tokens as a list of strings.
    """
    def one_hot(token_index):
        # Single-step decoder input of shape (1, 1, VOCAB_SZ).
        step = np.zeros((1, 1, VOCAB_SZ))
        step[0, 0, token_index] = 1
        return step

    # Initial decoder states come from the encoder.
    states_value = encoder_model.predict(input_seq)
    y_seq = one_hot(DIC_word_index["<bos>"])

    decoded_sentence = []
    while True:
        # output_tokens has shape (1, 1, VOCAB_SZ); take the last step's scores.
        output_tokens, h, c = decoder_model.predict([y_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = DIC_index_word[str(sampled_token_index)]

        # Exit on the stop token ...
        if sampled_token == '<eos>':
            break
        decoded_sentence.append(sampled_token)
        # ... or once the length limit has been exceeded.
        if len(decoded_sentence) > MAX_SEQ_LEN:
            break

        # Feed the sampled token and the new states into the next step.
        y_seq = one_hot(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

In [None]:
# NOTE: this cell rebinds idfilename, datadirname, labelfilename, encode_x and
# video_id, replacing the training-split values assigned earlier in the
# notebook. Re-running the training cells afterwards would pick these up.
idfilename = 'testing_data/id.txt'
datadirname = 'testing_data/feat/'
labelfilename = 'testing_label.json'

# Load the test features in the order the ids are listed; video_id[i] is the
# name of the clip stored at encode_x[i].
encode_x = []
video_id = []
with open(idfilename) as id_file:  # context manager closes the handle
    for lb in id_file:
        # strip() rather than [:-1]: safe when the last line has no trailing
        # newline and when the file uses CRLF line endings.
        lb = lb.strip()
        if not lb:
            continue  # ignore blank lines in the id list
        encode_x.append(np.load(datadirname + lb + ".npy"))
        video_id.append(lb)

# Decode one caption per test clip and keep it as a space-joined string.
out_labels = []
for features in encode_x:
    # decode_sequence is called with a batch of one sample.
    sent = decode_sequence(np.array([features]))
    sent = " ".join(sent)
    print(sent)
    out_labels.append(sent)

In [None]:
# Write one "<video id>,<caption>" line per decoded test clip.
with open('MODELTEST_testing.txt', 'w') as f:
    f.writelines(
        "".join((video_id[i], ',', out_labels[i], '\n'))
        for i in range(len(encode_x))
    )

In [None]:
#====================TESTING===================#

idfilename_t = 'testing_data/id.txt'
datadirname_t = 'testing_data/feat/'

# loading testing data
# encode_x_t holds one feature array per test clip, in id-file order;
# video_id_t maps each clip name to its position in encode_x_t.
encode_x_t = []
video_id_t = {}
with open(idfilename_t) as id_file:  # context manager closes the handle
    for line in id_file:
        # strip() rather than [:-1]: safe when the last line has no trailing
        # newline and when the file uses CRLF line endings.
        video_name = line.strip()
        if not video_name:
            continue  # ignore blank lines in the id list
        video_id_t[video_name] = len(encode_x_t)
        encode_x_t.append(np.load(datadirname_t + video_name + ".npy"))

In [None]:
#============predict the caption============#
OUTPUTS = []
for X in encode_x_t:
    X = np.array([X])
    Y = decode_sequence(X)
    OUTPUTS.append(Y)

In [None]:
#============to json============#
predict_label = []
with open('predict_label.txt', 'w') as f:
    for video_name, _id in video_id_t.items():
        tokens = OUTPUTS[int(_id)][:-2]
        predict = " ".join(tokens)
        predict +="."

        f.write(str(video_name) + "," + predict +"\n")
    

In [None]:
# Display the test clip name -> load-order index mapping built while loading
# the test features.
video_id_t