In [2]:
from tensorflow.keras.layers import Input, Softmax, Dense
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import json

In [3]:
idfilename = 'training_data/id.txt'
datadirname = 'training_data/feat/'
labelfilename = 'training_label.json'

In [None]:
# TODO: implement Sent2Seq
# hyperparameter: min count > 3 (discard terms with freq <= 3)
def Sent2Seq(sent):
    return [0] * len(sent)

In [None]:


BOS = "<bos>"
EOS = "<eos>"

# loading training data
encode_x = []
video_id = {}
for i,lb in enumerate(open(idfilename)):
    x = np.load(datadirname + lb)
    encode_x.append(x)
    video_id[lb] = i
    
TRAIN_SZ = len(encode_x)
decode_x = [[]]*TRAIN_SZ
decode_y = [[]]*TRAIN_SZ

# loading decoder data
rawlabels = json.load(open(labelfilename, 'r'))
for data in rawlabels:
    index = video_id[data['id']]
    sent =  data['caption'][0] # select one sentence for now
    # TODO: implement Sent2Seq
    decode_x[index] = Sent2Seq(BOS+sent)
    decode_y[index] = Sent2Seq(sent+EOS)

In [None]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# data1 = pad_sequences(data1, maxlen=MAX_IN_LEN, padding='pre', truncating='pre')

In [4]:
VOCAB_SZ = 10000 # maybe? need statistics

In [7]:
# Using sparse_categorical_crossentropy, we only need to pass integers as input to decoder.
MAX_SEQ_LEN = 50
EncoderDIM = 256
DecoderDIM = 256

# Layers
t_encoder_input = Input(shape=(80, 4096), name="EncoderInput")
t_decoder_input = Input(shape=(MAX_SEQ_LEN, VOCAB_SZ), name="DecoderInput")
L_encoder = CuDNNGRU(EncoderDIM, return_state=True, name='Encoder')
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')
# L_SM = Softmax(axis=-1, name="Softmax")

# tensors
t_encoder_outputs, state_h = L_encoder(t_encoder_input)
t_decoder_outputs = L_decoder(t_decoder_input, initial_state=state_h)
t_out_probs = TimeDistributed(L_Dense)(t_decoder_outputs)


model = Model(inputs=[t_encoder_input, t_decoder_input], outputs=t_out_probs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       (None, 80, 4096)     0                                            
__________________________________________________________________________________________________
DecoderInput (InputLayer)       (None, 50, 10000)    0                                            
__________________________________________________________________________________________________
Encoder (CuDNNGRU)              [(None, 256), (None, 3343872     EncoderInput[0][0]               
__________________________________________________________________________________________________
Decoder (CuDNNGRU)              (None, 50, 256)      7878144     DecoderInput[0][0]               
                                                                 Encoder[0][1]                    
__________

In [None]:
optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(x=[encode_x, decode_x], y=decode_y, batch_size=128, epochs=200) #callbacks=[]