In [1]:
from tensorflow.keras.layers import Input, Softmax, Dense, Dropout
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import json
import nltk
from nltk import word_tokenize

In [2]:
# Text file listing the training video ids, one id per line.
idfilename = 'training_data/id.txt'
# Directory with one "<video id>.npy" feature file per video. The trailing slash is
# required because file paths are built by plain string concatenation.
datadirname = 'training_data/feat/'
# JSON file with the captions belonging to each video id.
labelfilename = 'training_label.json'

In [3]:
# Load the vocabulary lookup tables: token -> index and index -> token.
# (DIC_index_word is keyed by the *string* form of the index, as JSON object keys are strings.)

def _read_json(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)


DIC_word_index = _read_json("DIC_word_index.json")
DIC_index_word = _read_json("DIC_index_word.json")

In [4]:
# hyperparameter: min count > 3 (discard terms with freq <= 3)
def Sent2Seq(sent):
    """Convert a caption string into a list of vocabulary indices.

    The sentence is lower-cased, tokenized with ``nltk.word_tokenize`` and each
    token is looked up in the module-level ``DIC_word_index`` dictionary.

    Tokens that are missing from the dictionary are mapped to the index of the
    ``"<unk>"`` token when the dictionary contains one.

    Args:
        sent: The caption text.

    Returns:
        A list with one vocabulary index per token, in sentence order.

    Raises:
        KeyError: If a token is not in ``DIC_word_index`` and the dictionary
            has no ``"<unk>"`` entry to fall back on.
    """
    # Resolved once per call; None means the vocabulary has no unknown-word token.
    unk_index = DIC_word_index.get("<unk>")

    ret = []
    for word in word_tokenize(sent.lower()):
        index = DIC_word_index.get(word, unk_index)
        if index is None:
            # No fallback available: fail exactly like a plain dictionary lookup would.
            raise KeyError(word)
        ret.append(index)
    return ret

In [5]:
from tqdm import tqdm

# Special vocabulary tokens.
BOS = "<bos>"
EOS = "<eos>"
PAD = "<pad>"
UNK = "<unk>"

BOS_index = DIC_word_index[BOS]
EOS_index = DIC_word_index[EOS]

# ---- Load the per-video feature arrays -------------------------------------
# video_feat: video id -> feature array loaded from "<datadirname><id>.npy"
# video_id:   video id -> line number of that id in the id file
video_feat = {}
video_id = {}
with open(idfilename) as id_file:
    for i, lb in enumerate(id_file):
        # Strip only the line terminator. Slicing off the last character would
        # corrupt the final id when the file does not end with a newline.
        lb = lb.rstrip("\n")
        if not lb:
            # Ignore blank lines (e.g. a trailing empty line).
            continue
        video_feat[lb] = np.load(datadirname + lb + ".npy")
        video_id[lb] = i

# ---- Build the training lists ----------------------------------------------
# Number of captions taken per video (the first `sampling` captions are used).
sampling = 1
# Decoder sequence length used for padding/truncation and by the model.
MAX_SEQ_LEN = 10

decode_x = []  # decoder inputs:  <bos> + caption indices
decode_y = []  # decoder targets: caption indices + <eos>
encode_x = []  # encoder inputs:  feature array of the captioned video

with open(labelfilename, 'r') as label_file:
    rawlabels = json.load(label_file)

for data in tqdm(rawlabels):
    vid = data['id']
    for k in range(sampling):
        sent = data['caption'][k]
        sent_seq = Sent2Seq(sent)
        decode_x.append([BOS_index] + sent_seq)
        decode_y.append(sent_seq + [EOS_index])
        # Raises KeyError if the label file references a video missing from the id file.
        encode_x.append(video_feat[vid])

100%|██████████| 1450/1450 [00:00<00:00, 6649.71it/s]


In [6]:
# Drop the loading-stage lookup tables; only the encode_x / decode_x / decode_y lists are used from here on.
del video_id, video_feat, rawlabels

In [7]:
# Vocabulary size: number of entries in the word -> index dictionary.
# It is used as the one-hot width of the decoder input and as the size of the output softmax.
VOCAB_SZ = len(DIC_word_index) # TODO: revisit once vocabulary statistics are available

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# decode_x and decode_y hold the same number of sequences.
print(len(decode_x))
print(len(decode_x[0]))

# Pad (or cut) every sequence at its end so that all of them have MAX_SEQ_LEN entries.
pad_kwargs = dict(maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
decode_x = pad_sequences(decode_x, **pad_kwargs)
decode_y = pad_sequences(decode_y, **pad_kwargs)

print("MAX_X_LEN:%d"%(MAX_SEQ_LEN))

# Targets stay as word indices with a trailing axis of size 1;
# decoder inputs are expanded to one-hot vectors of width VOCAB_SZ.
decode_y = np.expand_dims(decode_y, axis=-1)
decode_x = to_categorical(decode_x, num_classes=VOCAB_SZ)

print(decode_x.shape)
print(decode_y.shape)

1450
8
MAX_X_LEN:10
(1450, 10, 6087)
(1450, 10, 1)


In [9]:
from tensorflow.keras.layers import Input, Softmax, Dense, Dropout
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed, Concatenate, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Hidden sizes of the encoder and decoder GRUs.
EncoderDIM = 128
DecoderDIM = 128

# ---- Layers ----
# Encoder: consumes a (80, 4096) feature sequence per video and returns only its
# final output vector of size EncoderDIM (no return_sequences).
t_encoder_input = Input(shape=(80, 4096), name="EncoderInput")
L_encoder = CuDNNGRU(EncoderDIM, name='Encoder')
# Copies the encoder vector to every one of the MAX_SEQ_LEN decoder time steps.
L_Repeat = RepeatVector(MAX_SEQ_LEN)

# Decoder: receives the one-hot previous word concatenated with the encoder vector
# at each time step and emits a distribution over the vocabulary per step.
t_decoder_input = Input(shape=(MAX_SEQ_LEN,VOCAB_SZ), name="DecoderInput")
L_Concat = Concatenate(axis=-1, name="Interaction")
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, name='Decoder')
# NOTE: L_Dense is reused as-is by the inference decoder cell further down.
L_Dense = Dense(VOCAB_SZ, activation='softmax', name="Dense")

# ---- Tensors ----
# (None, EncoderDIM) -> (None, MAX_SEQ_LEN, EncoderDIM)
t_encoder_outputs = L_encoder(t_encoder_input)
t_encoder_outputs = L_Repeat(t_encoder_outputs)

# (None, MAX_SEQ_LEN, VOCAB_SZ + EncoderDIM)
t_decoder_mod_input = L_Concat([t_decoder_input, t_encoder_outputs])

t_decoder_outputs = L_decoder(t_decoder_mod_input)
# The same Dense layer is applied independently at every time step.
t_out_probs = TimeDistributed(L_Dense)(t_decoder_outputs)


# Training model: [video features, teacher-forced decoder input] -> per-step word probabilities.
model = Model(inputs=[t_encoder_input, t_decoder_input], outputs=t_out_probs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       (None, 80, 4096)     0                                            
__________________________________________________________________________________________________
Encoder (CuDNNGRU)              (None, 128)          1622784     EncoderInput[0][0]               
__________________________________________________________________________________________________
DecoderInput (InputLayer)       (None, 10, 6087)     0                                            
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 10, 128)      0           Encoder[0][0]                    
__________________________________________________________________________________________________
Interactio

In [None]:
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

# File the trained weights are written to once fitting has finished.
mdname = 'modelv3.h5'

optimizer = Adam(lr=1e-3)
# The targets are word indices (not one-hot vectors), hence the sparse loss.
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Halve the learning rate when the training loss has not improved for 5 epochs,
# wait 5 epochs after each reduction, and never go below 1e-5.
lrreduc = ReduceLROnPlateau(monitor='loss', factor=0.5,
                            patience=5, min_lr=0.00001, verbose=1, cooldown=5)

# encode_x is built as a Python list of per-video arrays. Stack it into a single
# (N, 80, 4096) array explicitly instead of relying on Keras to coerce a list
# nested inside the inputs list; this is a no-op if it is already an ndarray.
encoder_inputs = np.asarray(encode_x)

model.fit(x=[encoder_inputs, decode_x], y=decode_y, batch_size=4, epochs=200, callbacks=[lrreduc])
model.save_weights(mdname)

In [None]:
# Restore the weights written by model.save_weights() into the training model,
# so inference can be run without repeating the training cell.
mdname = 'modelv3.h5'
model.load_weights(mdname)

In [None]:
from tensorflow.keras.layers import Input, Softmax, Dense, Dropout
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed, Concatenate, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [12]:
# Settings shared by the inference models; the sizes match the training model.
mdname = 'modelv3.h5'
EncoderDIM = 128
DecoderDIM = 128

# Stand-alone encoder used at inference time: (80, 4096) video features in,
# a single EncoderDIM-sized vector out.
t_encoder_input = Input(name="EncoderInput", shape=(80, 4096))
# The layer name has to stay 'Encoder' so the by-name weight loading below can match it.
L_encoder = CuDNNGRU(units=EncoderDIM, name='Encoder')
t_encoder_outputs = L_encoder(t_encoder_input)

emodel = Model(outputs=t_encoder_outputs, inputs=t_encoder_input)
emodel.summary()

# Pull the weights of the layers whose names match out of the trained weight file.
emodel.load_weights(mdname, by_name=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncoderInput (InputLayer)    (None, 80, 4096)          0         
_________________________________________________________________
Encoder (CuDNNGRU)           (None, 128)               1622784   
Total params: 1,622,784
Trainable params: 1,622,784
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Decoder model for step-by-step inference: one word in, one word distribution out.
# Layers
# One-hot encoding of the previously generated word (a single time step).
t_decoder_input = Input(shape=(1,VOCAB_SZ), name="DecoderInput")
# Encoder output vector for the video, presented as a single time step.
t_decoder_enc_input = Input(shape=(1,EncoderDIM), name="DecoderEncodedInput")
# GRU state carried over from the previous decoding step.
t_decoder_state_input = Input(shape=(DecoderDIM,), name="DecoderStateInput")

L_Concat = Concatenate(axis=-1, name="Interaction")
# Named 'Decoder' like the training decoder so the by-name weight loading below can match it.
# return_state=True exposes the new GRU state so it can be fed back in on the next step.
L_decoder = CuDNNGRU(DecoderDIM, return_state=True, name='Decoder')
# NOTE: L_Dense is deliberately NOT re-created here; the layer object built in the
# training-model cell is reused. This cell therefore requires that cell (and the
# weight training/loading for `model`) to have been run in the same kernel session.
# L_Dense = Dense(VOCAB_SZ, activation='softmax', name="Dense")

# tensors
t_decoder_mod_input = L_Concat([t_decoder_input, t_decoder_enc_input])

t_decoder_outputs, t_state = L_decoder(t_decoder_mod_input, initial_state=t_decoder_state_input)
t_out_probs = L_Dense(t_decoder_outputs)


# Outputs: [word probabilities for this step, updated GRU state].
dmodel = Model(inputs=[t_decoder_input, t_decoder_enc_input, t_decoder_state_input], outputs=[t_out_probs, t_state])
dmodel.summary()
# Loads weights for layers whose names match entries in the weight file.
# NOTE(review): presumably only 'Decoder' is matched here, since the training model
# wraps the Dense layer in TimeDistributed - confirm against the saved file.
dmodel.load_weights(mdname, by_name=True)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
DecoderInput (InputLayer)       (None, 1, 6087)      0                                            
__________________________________________________________________________________________________
DecoderEncodedInput (InputLayer (None, 1, 128)       0                                            
__________________________________________________________________________________________________
Interaction (Concatenate)       (None, 1, 6215)      0           DecoderInput[0][0]               
                                                                 DecoderEncodedInput[0][0]        
__________________________________________________________________________________________________
DecoderStateInput (InputLayer)  (None, 128)          0                                            
__________

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Sentence boundary tokens (same values as the ones defined in the data-loading cell);
# repeated here so the decoding cell can be run after a weights-only reload.
BOS = '<bos>'
EOS = '<eos>'

def decode_sequence(input_seq, verbose=True):
    """Greedily decode a caption for a single video.

    The video features are encoded once with ``emodel``; ``dmodel`` is then
    called one step at a time, starting from the ``BOS`` token and a zero GRU
    state, feeding back the most probable word and the returned state.
    Decoding stops when ``EOS`` is produced or ``MAX_SEQ_LEN`` words have been
    generated.

    Args:
        input_seq: Encoder input for exactly one video, including the batch
            axis (the decoder state is initialised for a batch of size 1).
        verbose: When true (the default), print each word as it is generated.

    Returns:
        The generated words as a list of strings, including the final ``EOS``
        token when decoding stopped because of it.
    """
    # Encode the video once; (1, EncoderDIM) -> (1, 1, EncoderDIM) so it can be
    # fed to the decoder as a single time step.
    encoded_value = emodel.predict(input_seq)
    encoded_step = np.expand_dims(encoded_value, axis=1)

    feed_seq = DIC_word_index[BOS]
    feed_state = np.zeros((1, DecoderDIM))

    decoded_sentence = []
    while True:
        # One-hot of the previous word: (1, VOCAB_SZ) -> (1, 1, VOCAB_SZ).
        feed_seq_cat = to_categorical([feed_seq], num_classes=VOCAB_SZ)
        token_step = np.expand_dims(feed_seq_cat, axis=1)

        output_token, h = dmodel.predict([token_step, encoded_step, feed_state])

        # Greedy choice: take the most probable word.
        sampled_token_index = int(np.argmax(output_token[0, :]))
        # DIC_index_word is keyed by the string form of the index.
        sampled_word = DIC_index_word[str(sampled_token_index)]
        decoded_sentence.append(sampled_word)
        if verbose:
            print(sampled_word)

        # Exit condition: either find the stop token or hit the max length.
        if sampled_word == EOS or len(decoded_sentence) >= MAX_SEQ_LEN:
            break

        # Feed the chosen word and the new state into the next step.
        feed_seq = sampled_token_index
        feed_state = h

    return decoded_sentence

In [15]:
# Smoke test: caption one training sample with the inference models.
indexx = 32
# Add a leading batch axis of size 1 in front of the sample's feature array.
test = np.expand_dims(encode_x[indexx], axis=0)
decode_sequence(test)
print(len(test))

a
man
is
playing
the
guitar
.
<eos>
1
