In [1]:
from tensorflow.keras.layers import Input, Softmax, Dense
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import json
import nltk
from nltk import word_tokenize

In [2]:
idfilename = 'training_data/id.txt'
datadirname = 'training_data/feat/'
labelfilename = 'training_label.json'

In [3]:
#load dictionary

with open("DIC_word_index.json") as f:
    DIC_word_index = json.load(f)
    
with open("DIC_index_word.json") as f:
    DIC_index_word = json.load(f)

#DIC_index_word = {}

In [4]:
# TODO: implement Sent2Seq
# hyperparameter: min count > 3 (discard terms with freq <= 3)
def Sent2Seq(sent):
    #print(sent)
    tokens = word_tokenize(sent.lower())
    ret = []
    #print(tokens)
    for word in tokens:
        #print(word)
        ret.append(DIC_word_index[word])
    #input("")
    return ret

In [5]:

BOS = "<bos>"
EOS = "<eos>"
PAD = "<pad>"
UNK = "<unk>"

BOS_index = DIC_word_index[BOS]
EOS_index = DIC_word_index[EOS]

# loading training data
encode_x = []
video_id = {}

for i,lb in enumerate(open(idfilename)):
    #lb contains '\n', therefore lb[:-1]
    lb = lb[:-1]
    video_id[lb] = i

TRAIN_SZ = len(video_id)
decode_x = [[]]*TRAIN_SZ
decode_y = [[]]*TRAIN_SZ

MAX_SEQ_LEN = 0;

pad = [0] * 80



# loading decoder data
rawlabels = json.load(open(labelfilename, 'r'))
for data in rawlabels:
    
    index = video_id[data['id']]
    #print(index)
    sent =  data['caption'][0] # select one sentence for now
    # TODO: implement Sent2Seq

    decode_x[index] = pad + [BOS_index] + Sent2Seq(sent)
    decode_y[index] = pad + Sent2Seq(sent) + [EOS_index]
    if(len(decode_x[index]) > MAX_SEQ_LEN):
        MAX_SEQ_LEN = len(decode_x[index])


In [6]:
VOCAB_SZ = len(DIC_word_index) # maybe? need statistics

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# data1 = pad_sequences(data1, maxlen=MAX_IN_LEN, padding='pre', truncating='pre')

#decode_x will be the same len as decode_y
print(len(decode_x))
print(len(decode_x[0]))
decode_x = pad_sequences(decode_x, maxlen=MAX_SEQ_LEN+80, padding='post', truncating='pre')
decode_y = pad_sequences(decode_y, maxlen=MAX_SEQ_LEN+80, padding='post', truncating='pre')

#print(decode_x)
#print(decode_y)
print("MAX_X_LEN:%d"%(MAX_SEQ_LEN))
# decode_x = decode_x.reshape(decode_x.shape[0],decode_x.shape[1], 1)
decode_y = decode_y.reshape(decode_y.shape[0],decode_y.shape[1], 1)
decode_x = to_categorical(decode_x, num_classes=VOCAB_SZ)
# decode_y = to_categorical(decode_y, num_classes=VOCAB_SZ)
print(decode_x.shape)
print(decode_y.shape)
print(TRAIN_SZ)
#input("")

1450
88
MAX_X_LEN:125
(1450, 205, 6087)
(1450, 205, 1)
1450


In [8]:
# Using sparse_categorical_crossentropy, we only need to pass integers as input to decoder.
EncoderDIM = 128
DecoderDIM = 128

#decode_x = decode_x[:MAX_SEQ_LEN,:]
#decode_y = decode_y[:MAX_SEQ_LEN,:]

# Layers
t_encoder_input = Input(shape=(80 + MAX_SEQ_LEN, 4096), name="EncoderInput")
t_decoder_input = Input(shape=(80 + MAX_SEQ_LEN, VOCAB_SZ), name="DecoderInput")
L_encoder = CuDNNGRU(EncoderDIM, return_sequences=True, name='Encoder')
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')
# L_SM = Softmax(axis=-1, name="Softmax")
L_Concat = Concatenate(axis = -1, name='Concatenate')

# tensors
t_encoder_outputs = L_encoder(t_encoder_input)
t_concat_decoder_input = L_Concat([t_encoder_outputs, t_decoder_input] )
t_decoder_outputs = L_decoder(t_concat_decoder_input)
t_out_probs = TimeDistributed(L_Dense)(t_decoder_outputs)



model = Model(inputs=[t_encoder_input, t_decoder_input], outputs=t_out_probs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       (None, 205, 4096)    0                                            
__________________________________________________________________________________________________
Encoder (CuDNNGRU)              (None, 205, 128)     1622784     EncoderInput[0][0]               
__________________________________________________________________________________________________
DecoderInput (InputLayer)       (None, 205, 6087)    0                                            
__________________________________________________________________________________________________
Concatenate (Concatenate)       (None, 205, 6215)    0           Encoder[0][0]                    
                                                                 DecoderInput[0][0]               
__________

In [9]:
temp = [0] * 4096
for i,lb in enumerate(open(idfilename)):
    #lb contains '\n', therefore lb[:-1]
    lb = lb[:-1]
    x = np.load(datadirname + lb + ".npy")
    x = x.tolist()
    for j in range(MAX_SEQ_LEN):
        x.append(temp)
    #print(x)
    x = np.array(x)
    #print(x.shape)
    #input("")
    encode_x.append(x)
    

In [10]:

optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(x=[encode_x, decode_x], y=decode_y, batch_size=1, epochs=200) #callbacks=[]

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
  77/1450 [>.............................] - ETA: 18s - loss: 0.0590 - acc: 0.9867

KeyboardInterrupt: 

In [11]:
MODEL_abc = 'modelv2.h5'
model.save_weights(MODEL_abc)

In [12]:
# Layers
t_encoder_input = Input(shape=(80 + MAX_SEQ_LEN, 4096), name="EncoderInput")
t_decoder_hidden_input = Input(shape=(80 + MAX_SEQ_LEN, EncoderDIM), name="DecoderInput")
t_decoder_input = Input(shape=(80 + MAX_SEQ_LEN, VOCAB_SZ), name="DecoderInput2")
t_decoder_state_input = Input(shape=(DecoderDIM,), name="DecoderInput3")

L_encoder = CuDNNGRU(EncoderDIM, return_sequences=True, name='Encoder')
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, return_state= True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')
# L_SM = Softmax(axis=-1, name="Softmax")
L_Concat = Concatenate(axis = -1, name='Concatenate')

# tensors
t_encoder_outputs = L_encoder(t_encoder_input)

t_concat_decoder_input = L_Concat([t_decoder_hidden_input, t_decoder_input] )
t_decoder_outputs, h_state = L_decoder(t_concat_decoder_input, initial_state = t_decoder_state_input)
t_out_probs = TimeDistributed(L_Dense)(t_decoder_outputs)

emodel = Model(inputs=t_encoder_input, outputs=t_encoder_outputs)
dmodel = Model(inputs = [t_decoder_hidden_input, t_decoder_input, t_decoder_state_input], outputs = [t_out_probs,h_state])

#emodel.compile()
#dmodel.compile()

emodel.summary()
dmodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncoderInput (InputLayer)    (None, 205, 4096)         0         
_________________________________________________________________
Encoder (CuDNNGRU)           (None, 205, 128)          1622784   
Total params: 1,622,784
Trainable params: 1,622,784
Non-trainable params: 0
_________________________________________________________________
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
DecoderInput (InputLayer)       (None, 205, 128)     0                                            
__________________________________________________________________________________________________
DecoderInput2 (InputLayer)      (None, 205, 6087)    0                                            
_________________________________

In [13]:
emodel.load_weights(MODEL_abc, by_name = True)
dmodel.load_weights(MODEL_abc, by_name = True)

#optimizer = Adam(lr=1e-3)

#emodel.compile()
#odel.compile()

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

BOS = '<bos>'
EOS = '<eos>'

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = emodel.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = pad_sequences([[ DIC_word_index[BOS] ]], maxlen=81, padding='pre')
    
    target_seq = pad_sequences(target_seq, maxlen=MAX_SEQ_LEN + 80, padding='post')
    

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    t = 0
    new_state = np.zeros((1,DecoderDIM));
    #print(new_state.shape)
    while not stop_condition:
        target_seq_cat = to_categorical(target_seq, num_classes=VOCAB_SZ)
        #print(new_state.shape)
        output_tokens, h = dmodel.predict([ states_value, target_seq_cat, new_state])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, 80+t, :])
#         sampled_token_index = np.argmax(output_tokens[0, 0, :])
        sampled_word = DIC_index_word[str(sampled_token_index)]
        decoded_sentence.append(sampled_word)
        #print(decoded_sentence)

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == EOS or len(decoded_sentence) >= MAX_SEQ_LEN):
            stop_condition = True
        else:
            # Update the target sequence (of length 1).
            
            target_seq[0][80 + t] = sampled_token_index
            t += 1
            # Update states
            new_state = h;

    return decoded_sentence

In [15]:
index = 3
test = encode_x[index].reshape(1, encode_x[index].shape[0],encode_x[index].shape[1])
ret = decode_sequence(test)
print(ret)
print(decode_x[index])
print(len(test))

['drivong', 'triforce', 'drier', 'traveled', 'sweep', 'plaughing', 'seasonings', 'attempting', 'lory', 'gose', 'spilt', 'things', 'been', 'cheesecloth', 'register', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'dogsleds', 'register', 'cheesecloth', 'cheesecloth', 'height', 'sings', 'displaying', 'register', 'cheesecloth', 'cheesecloth', 'feeds', 'cheesecloth', 'worked', 'register', 'register', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'register', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'register', 'cheesecloth', 'register', 'cheesecloth', 'register', 'height', 'register', 'height', 'register', 'sings', 'register', 'sings', 'register', 'sings', 'register', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'cheesecloth', 'register', 'cheesecloth', 'register', 'cheesecloth', 'register', 'height