In [None]:
import json
import re

import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Softmax, Dense, Dropout
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Input locations used by the data-loading cell.
idfilename = "training_data/id.txt"      # text file read line by line; each line is a video id
datadirname = "training_data/feat/"      # directory holding one "<video id>.npy" feature file per id
labelfilename = "training_label.json"    # JSON list of records with 'id' and 'caption' fields

In [None]:
# Load the vocabulary lookup tables.
def _read_json(path):
    """Parse the JSON document stored at `path` and return the decoded object."""
    with open(path) as f:
        return json.load(f)


# word -> integer index (indexed by token in Sent2Seq)
DIC_word_index = _read_json("DIC_word_index.json")
# index -> word; looked up with str(index) because JSON object keys are strings
DIC_index_word = _read_json("DIC_index_word.json")

In [None]:
# hyperparameter: min count > 3 (discard terms with freq <= 3)
def Sent2Seq(sent, word_index=None):
    """Convert a sentence into a list of vocabulary indices.

    The sentence is stripped of every character that is not a word character,
    whitespace, '<', '>' or '-', lower-cased and split on whitespace. Each
    resulting token is then looked up in the vocabulary.

    Tokens that are missing from the vocabulary are dropped rather than raising
    KeyError, which is how the "discard rare terms" policy noted above plays out
    for words that were left out of the dictionary.

    Args:
        sent: The sentence to convert. It may already carry the '<bos>' / '<eos>'
            markers; the angle brackets survive the clean-up step.
        word_index: Optional mapping from token to integer index. Defaults to the
            notebook-level DIC_word_index.

    Returns:
        A list with one vocabulary index per in-vocabulary token, in sentence order.
    """
    if word_index is None:
        word_index = DIC_word_index

    cleaned = re.sub(r'[^\w\s\<\>\-]', '', sent)

    seq = []
    for word in cleaned.lower().split():
        idx = word_index.get(word)
        # Compare against None explicitly: 0 is a legitimate index.
        if idx is not None:
            seq.append(idx)
    return seq

In [None]:

# Sentence delimiters. The built-in space lets them be concatenated straight onto a
# caption and still come out of str.split() as separate tokens.
BOS = "<bos> "
EOS = " <eos>"

# Load the encoder features, one .npy file per video id, and remember which row
# each id ended up in.
encode_x = []
video_id = {}
with open(idfilename) as id_file:
    for line in id_file:
        # Strip only the line terminator; slicing off the last character would
        # truncate the final id when the file does not end with a newline.
        lb = line.rstrip("\r\n")
        if not lb:
            continue
        video_id[lb] = len(encode_x)
        encode_x.append(np.load(datadirname + lb + ".npy"))

TRAIN_SZ = len(encode_x)
# One independent list per sample ([[]] * n would alias a single list n times).
decode_x = [[] for _ in range(TRAIN_SZ)]
decode_y = [[] for _ in range(TRAIN_SZ)]

MAX_SEQ_LEN = 0

# Load the captions and build the decoder input / target index sequences.
with open(labelfilename, 'r') as label_file:
    rawlabels = json.load(label_file)

for data in rawlabels:
    index = video_id[data['id']]
    sent = data['caption'][0]  # select one sentence for now
    # Decoder input is the caption prefixed with <bos>; the target is the same
    # caption followed by <eos>, i.e. the input shifted by one token.
    decode_x[index] = Sent2Seq(BOS + sent)
    decode_y[index] = Sent2Seq(sent + EOS)
    MAX_SEQ_LEN = max(MAX_SEQ_LEN, len(decode_x[index]))


In [None]:
# Width of the one-hot vectors and of the softmax output. It has to exceed the
# largest word index, so derive it from the indices themselves: this equals
# len(DIC_word_index) when the indices are 0..N-1 and stays valid if they start at 1.
VOCAB_SZ = max(DIC_word_index.values(), default=-1) + 1

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# decode_x and decode_y have the same length per sample (same caption, shifted by one token).
print(len(decode_x))
print(len(decode_x[0]))

# Pad at the END of each caption so that <bos> always sits at timestep 0, right after
# the encoder state. Greedy decoding has to start from <bos> at timestep 0 because
# the caption length is unknown in advance, so the training layout must match that.
# maxlen is the longest caption, hence nothing is actually truncated.
decode_x = pad_sequences(decode_x, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
decode_y = pad_sequences(decode_y, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

print("MAX_X_LEN:%d"%(MAX_SEQ_LEN))

# Targets stay integer ids with a trailing axis of size 1; the decoder input is
# expanded to one-hot vectors of width VOCAB_SZ.
decode_y = decode_y.reshape(decode_y.shape[0], decode_y.shape[1], 1)
decode_x = to_categorical(decode_x, num_classes=VOCAB_SZ)

print(decode_x.shape)
print(decode_y.shape)
print(TRAIN_SZ)

In [None]:
# Training model: a GRU encoder whose final state initialises a GRU decoder.
# With sparse_categorical_crossentropy the decoder *targets* are integer word ids;
# the decoder *input* is still fed as one-hot vectors of width VOCAB_SZ.

# The encoder state is used directly as the decoder's initial state, so the two
# sizes have to be equal.
EncoderDIM = 256
DecoderDIM = 256

# Layers
t_encoder_input = Input(shape=(80, 4096), name="EncoderInput")
t_decoder_input = Input(shape=(MAX_SEQ_LEN,VOCAB_SZ), name="DecoderInput")
L_encoder = CuDNNGRU(EncoderDIM, return_state=True, name='Encoder')
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, return_state=True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')

# tensors
t_encoder_outputs, state_h = L_encoder(t_encoder_input)
t_decoder_outputs, _ = L_decoder(t_decoder_input, initial_state=state_h)
# Name the wrapper explicitly. Weights are saved and re-loaded by layer name, and the
# auto-generated name changes ("time_distributed_1", ...) whenever another
# TimeDistributed layer has already been created in the session, e.g. after
# re-running this cell.
t_out_probs = TimeDistributed(L_Dense, name="time_distributed")(t_decoder_outputs)


model = Model(inputs=[t_encoder_input, t_decoder_input], outputs=t_out_probs)
model.summary()

In [None]:

# Prefix of the per-epoch checkpoint files written by ModelCheckpoint below.
model_name = 'modelv1'

optimizer = Adam(lr=1e-3)
# Targets are integer word ids with a trailing axis of size 1, hence the sparse loss.
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# fit() is called without validation data, so every callback watches the training
# loss; monitoring 'val_loss' would leave the checkpoint with nothing to compare.
earlystp = EarlyStopping(monitor="loss", patience=20, verbose=1, mode="auto")
checkpoint = ModelCheckpoint(model_name+'_{epoch:02d}.hdf5', monitor='loss',
                             verbose=0, save_best_only=True, save_weights_only=False,
                             mode='auto', period=1)
lrreduc = ReduceLROnPlateau(monitor='loss', factor=0.5,
                            patience=5, min_lr=0.00001, verbose=1, cooldown=5)

# encode_x is a Python list of per-video arrays; stack it into a single array so the
# outer list is read as exactly two model inputs (encoder features, decoder one-hots).
model.fit(x=[np.asarray(encode_x), decode_x], y=decode_y, batch_size=1, epochs=200,
          callbacks=[earlystp, checkpoint, lrreduc])

In [None]:
# Persist the trained weights; the inference models load them back by layer name.
model.save_weights(filepath='modelv1.h5')

In [None]:
# Inference-time models. They re-create the trained layers under the same names and
# pull their weights out of 'modelv1.h5' by name.

# These must equal the sizes the saved weights were trained with (256); any other
# value makes the by-name weight load fail on mismatched shapes.
EncoderDIM = 256
DecoderDIM = 256

# Encoder model: video features -> final GRU state.
# Layers
t_encoder_input = Input(shape=(80, 4096), name="EncoderInput")
L_encoder = CuDNNGRU(EncoderDIM, return_state=True, name='Encoder')
# tensors
t_encoder_outputs, state_h = L_encoder(t_encoder_input)
emodel = Model(inputs=t_encoder_input, outputs=state_h)
emodel.summary()
emodel.load_weights('modelv1.h5', by_name=True)


# Decoder model: (one-hot caption prefix, initial state) -> (word probabilities, final state).
# Layers
t_decoder_input = Input(shape=(MAX_SEQ_LEN,VOCAB_SZ), name="DecoderInput")
t_decoder_state_input = Input(shape=(DecoderDIM,), name="DecoderStateInput")
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, return_state=True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')

# tensors
t_decoder_outputs, t_decode_state = L_decoder(t_decoder_input, initial_state=t_decoder_state_input)
# The Dense weights are stored under the name of the TimeDistributed wrapper. Left to
# auto-naming, this second wrapper in the session would be called "time_distributed_1",
# the by-name load would skip it silently and the output layer would stay randomly
# initialised. Pin the name the training model's wrapper is saved under.
t_out_probs = TimeDistributed(L_Dense, name="time_distributed")(t_decoder_outputs)


dmodel = Model(inputs=[t_decoder_input, t_decoder_state_input], outputs=[t_out_probs, t_decode_state])
dmodel.summary()
dmodel.load_weights('modelv1.h5', by_name=True)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

BOS = '<bos>'
EOS = '<eos>'

def decode_sequence(input_seq):
    """Greedily decode a caption for one encoded video.

    The encoder is run once to get the decoder's initial state. The caption is
    then grown one word at a time: the current prefix (post-padded to
    MAX_SEQ_LEN) is fed to the decoder from that same initial state, and the
    most probable word at the position of the last real token is appended.

    This assumes the decoder was trained on captions that start at timestep 0,
    i.e. on post-padded sequences.

    Args:
        input_seq: Batch of one video's features, in the form emodel accepts.

    Returns:
        The decoded words as a list of strings. The list ends with the
        end-of-sentence token if one was produced before MAX_SEQ_LEN words.
    """
    # Tolerate the space-padded BOS/EOS variants the data-loading cell defines.
    bos_token = BOS.strip()
    eos_token = EOS.strip()

    # Encode the input once. The whole prefix is re-fed on every step, so the
    # decoder must always start from this state rather than from a state that has
    # already consumed the prefix (and its padding).
    states_value = emodel.predict(input_seq)

    # Prefix buffer: <bos> at position 0, the rest is padding until filled in.
    target_seq = pad_sequences([[ DIC_word_index[bos_token] ]], maxlen=MAX_SEQ_LEN,
                               padding='post', truncating='post')

    decoded_sentence = []
    for t in range(MAX_SEQ_LEN):
        target_seq_cat = to_categorical(target_seq, num_classes=VOCAB_SZ)
        output_tokens, _ = dmodel.predict([target_seq_cat, states_value])

        # The output at position t is the prediction made after reading tokens
        # 0..t. Later positions only reflect padding and must not be used.
        sampled_token_index = int(np.argmax(output_tokens[0, t, :]))
        sampled_word = DIC_index_word[str(sampled_token_index)]
        decoded_sentence.append(sampled_word)

        # Exit condition: stop on the end token; the loop bound caps the length.
        if sampled_word == eos_token:
            break

        # Extend the prefix with the sampled word for the next step.
        if t + 1 < MAX_SEQ_LEN:
            target_seq[0][t + 1] = sampled_token_index

    return decoded_sentence

In [None]:
# Smoke test: give the first training video a leading batch axis of size 1 and
# decode a caption for it.
test = np.reshape(encode_x[0], (1, *encode_x[0].shape[:2]))
decode_sequence(test)

In [None]:
# Round-trip check of the two dictionaries: map a token to its index, then map
# that index back (DIC_index_word is keyed by the index as a string).
# NOTE(review): 'amanplaysaguitar' looks like a caption whose spaces were lost;
# this raises KeyError unless that exact token is in the vocabulary - confirm.
i = DIC_word_index['amanplaysaguitar']
DIC_index_word[str(i)]