In [1]:
from tensorflow.keras.layers import Input, Softmax, Dense, Dropout
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import json
import nltk
from nltk import word_tokenize

In [2]:
idfilename = 'training_data/id.txt'
datadirname = 'training_data/feat/'
labelfilename = 'training_label.json'

In [3]:
#load dictionary

with open("DIC_word_index.json") as f:
    DIC_word_index = json.load(f)
    
with open("DIC_index_word.json") as f:
    DIC_index_word = json.load(f)

#DIC_index_word = {}

In [4]:
# TODO: implement Sent2Seq
# hyperparameter: min count > 3 (discard terms with freq <= 3)
def Sent2Seq(sent):
    print(sent)
    tokens = word_tokenize(sent.lower())
    ret = []
    print(tokens)
    for word in tokens:
        #print(word)
        ret.append(DIC_word_index[word])
    #input("")
    return ret

In [5]:

#add a space for split function easy to handle
BOS = "<bos>"
EOS = "<eos>"
PAD = "<pad>"
UNK = "<unk>"

BOS_index = DIC_word_index[BOS]
EOS_index = DIC_word_index[EOS]

# loading training data
encode_x = []
video_id = {}
for i,lb in enumerate(open(idfilename)):
    #lb contains '\n', therefore lb[:-1]
    lb = lb[:-1]
    x = np.load(datadirname + lb + ".npy")
    encode_x.append(x)
    video_id[lb] = i
    
TRAIN_SZ = len(encode_x)
decode_x = [[]]*TRAIN_SZ
decode_y = [[]]*TRAIN_SZ

MAX_SEQ_LEN = 0;

# loading decoder data
rawlabels = json.load(open(labelfilename, 'r'))
for data in rawlabels:
    
    index = video_id[data['id']]
    print(index)
    sent =  data['caption'][0] # select one sentence for now
    # TODO: implement Sent2Seq
    decode_x[index] = [BOS_index] + Sent2Seq(sent)
    decode_y[index] = Sent2Seq(sent) + [EOS_index]
    if(len(decode_x[index]) > MAX_SEQ_LEN):
        MAX_SEQ_LEN = len(decode_x[index])


0
A woman goes under a horse.
['a', 'woman', 'goes', 'under', 'a', 'horse', '.']
A woman goes under a horse.
['a', 'woman', 'goes', 'under', 'a', 'horse', '.']
1
A man slicing butter into a bowl.
['a', 'man', 'slicing', 'butter', 'into', 'a', 'bowl', '.']
A man slicing butter into a bowl.
['a', 'man', 'slicing', 'butter', 'into', 'a', 'bowl', '.']
2
A raccoon-like animal is hanging upside down from the back of a chair and eating something white.
['a', 'raccoon-like', 'animal', 'is', 'hanging', 'upside', 'down', 'from', 'the', 'back', 'of', 'a', 'chair', 'and', 'eating', 'something', 'white', '.']
A raccoon-like animal is hanging upside down from the back of a chair and eating something white.
['a', 'raccoon-like', 'animal', 'is', 'hanging', 'upside', 'down', 'from', 'the', 'back', 'of', 'a', 'chair', 'and', 'eating', 'something', 'white', '.']
3
A man is putting pepper into a bowl.
['a', 'man', 'is', 'putting', 'pepper', 'into', 'a', 'bowl', '.']
A man is putting pepper into a bowl.
['

Someone is slicing a green tomato with a knife.
['someone', 'is', 'slicing', 'a', 'green', 'tomato', 'with', 'a', 'knife', '.']
393
a man riding a bike on a half pipe
['a', 'man', 'riding', 'a', 'bike', 'on', 'a', 'half', 'pipe']
a man riding a bike on a half pipe
['a', 'man', 'riding', 'a', 'bike', 'on', 'a', 'half', 'pipe']
394
A cat is eating from a cup while a puppy repeatedly bites on the cat's ear.
['a', 'cat', 'is', 'eating', 'from', 'a', 'cup', 'while', 'a', 'puppy', 'repeatedly', 'bites', 'on', 'the', 'cat', "'s", 'ear', '.']
A cat is eating from a cup while a puppy repeatedly bites on the cat's ear.
['a', 'cat', 'is', 'eating', 'from', 'a', 'cup', 'while', 'a', 'puppy', 'repeatedly', 'bites', 'on', 'the', 'cat', "'s", 'ear', '.']
395
A train is going up and down in circles on a carnival ride.
['a', 'train', 'is', 'going', 'up', 'and', 'down', 'in', 'circles', 'on', 'a', 'carnival', 'ride', '.']
A train is going up and down in circles on a carnival ride.
['a', 'train', 'is', '

A boy is playing a grand piano.
['a', 'boy', 'is', 'playing', 'a', 'grand', 'piano', '.']
793
A person pours a liquid into a spoon.
['a', 'person', 'pours', 'a', 'liquid', 'into', 'a', 'spoon', '.']
A person pours a liquid into a spoon.
['a', 'person', 'pours', 'a', 'liquid', 'into', 'a', 'spoon', '.']
794
A cat is playing with a large watermelon.
['a', 'cat', 'is', 'playing', 'with', 'a', 'large', 'watermelon', '.']
A cat is playing with a large watermelon.
['a', 'cat', 'is', 'playing', 'with', 'a', 'large', 'watermelon', '.']
795
A man puts cheese on a pizza.
['a', 'man', 'puts', 'cheese', 'on', 'a', 'pizza', '.']
A man puts cheese on a pizza.
['a', 'man', 'puts', 'cheese', 'on', 'a', 'pizza', '.']
796
A woman unwraps some foodstuff on a piece of wood.
['a', 'woman', 'unwraps', 'some', 'foodstuff', 'on', 'a', 'piece', 'of', 'wood', '.']
A woman unwraps some foodstuff on a piece of wood.
['a', 'woman', 'unwraps', 'some', 'foodstuff', 'on', 'a', 'piece', 'of', 'wood', '.']
797
A woman 

['a', 'man', 'riding', 'up', 'an', 'escalator', 'drops', 'a', 'black', 'bag', 'down', 'the', 'escalator', '.']
A man riding up an escalator drops a black bag down the escalator.
['a', 'man', 'riding', 'up', 'an', 'escalator', 'drops', 'a', 'black', 'bag', 'down', 'the', 'escalator', '.']
1193
A young girl is pushing a toy stroller in a store.
['a', 'young', 'girl', 'is', 'pushing', 'a', 'toy', 'stroller', 'in', 'a', 'store', '.']
A young girl is pushing a toy stroller in a store.
['a', 'young', 'girl', 'is', 'pushing', 'a', 'toy', 'stroller', 'in', 'a', 'store', '.']
1194
Zebra are running in an enclosed area.
['zebra', 'are', 'running', 'in', 'an', 'enclosed', 'area', '.']
Zebra are running in an enclosed area.
['zebra', 'are', 'running', 'in', 'an', 'enclosed', 'area', '.']
1195
Someone, standing at a sink, is peeling raw shrimp with their fingers and putting the peels in a small plastic container.
['someone', ',', 'standing', 'at', 'a', 'sink', ',', 'is', 'peeling', 'raw', 'shrimp',

In [6]:
VOCAB_SZ = len(DIC_word_index) # maybe? need statistics

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# data1 = pad_sequences(data1, maxlen=MAX_IN_LEN, padding='pre', truncating='pre')

#decode_x will be the same len as decode_y
print(len(decode_x))
print(len(decode_x[0]))
decode_x = pad_sequences(decode_x, maxlen=MAX_SEQ_LEN, padding='pre', truncating='pre')
decode_y = pad_sequences(decode_y, maxlen=MAX_SEQ_LEN, padding='pre', truncating='pre')

#print(decode_x)
#print(decode_y)
print("MAX_X_LEN:%d"%(MAX_SEQ_LEN))
# decode_x = decode_x.reshape(decode_x.shape[0],decode_x.shape[1], 1)
decode_y = decode_y.reshape(decode_y.shape[0],decode_y.shape[1], 1)
decode_x = to_categorical(decode_x, num_classes=VOCAB_SZ)
# decode_y = to_categorical(decode_y, num_classes=VOCAB_SZ)
print(decode_x.shape)
print(decode_y.shape)
print(TRAIN_SZ)
#input("")

1450
8
MAX_X_LEN:45
(1450, 45, 6087)
(1450, 45, 1)
1450


In [8]:
# Using sparse_categorical_crossentropy, we only need to pass integers as input to decoder.
EncoderDIM = 256
DecoderDIM = 256

#decode_x = decode_x[:MAX_SEQ_LEN,:]
#decode_y = decode_y[:MAX_SEQ_LEN,:]

# Layers
t_encoder_input = Input(shape=(80, 4096), name="EncoderInput")
t_decoder_input = Input(shape=(MAX_SEQ_LEN,VOCAB_SZ), name="DecoderInput")
L_encoder = CuDNNGRU(EncoderDIM, return_state=True, name='Encoder')
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, return_state=True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')

# tensors
t_encoder_outputs, state_h = L_encoder(t_encoder_input)
t_decoder_outputs, _ = L_decoder(t_decoder_input, initial_state=state_h)
t_out_probs = TimeDistributed(L_Dense)(t_decoder_outputs)


model = Model(inputs=[t_encoder_input, t_decoder_input], outputs=t_out_probs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       (None, 80, 4096)     0                                            
__________________________________________________________________________________________________
DecoderInput (InputLayer)       (None, 45, 6087)     0                                            
__________________________________________________________________________________________________
Encoder (CuDNNGRU)              [(None, 256), (None, 3343872     EncoderInput[0][0]               
__________________________________________________________________________________________________
Decoder (CuDNNGRU)              [(None, 45, 256), (N 4872960     DecoderInput[0][0]               
                                                                 Encoder[0][1]                    
__________

In [None]:

optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
"""
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
 earlystp = EarlyStopping(monitor="loss", patience=20, verbose=1, mode="auto")
checkpoint = ModelCheckpoint(model_name+'_{epoch:02d}.hdf5', monitor='val_loss', \
                             verbose=0, save_best_only=True, save_weights_only=False, \
                             mode='auto', period=1)
lrreduc = ReduceLROnPlateau(monitor='loss', factor=0.5,\
                             patience=5, min_lr=0.00001, verbose=1, cooldown=5)
"""
model.fit(x=[encode_x, decode_x], y=decode_y, batch_size=1, epochs=200) #callbacks=[]

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 

In [None]:
model.save_weights('modelv1.h5')

In [9]:
# EncoderDIM = 128
# DecoderDIM = 128

# Encoder model
# Layers
t_encoder_input = Input(shape=(80, 4096), name="EncoderInput")
L_encoder = CuDNNGRU(EncoderDIM, return_state=True, name='Encoder')
# tensors
t_encoder_outputs, state_h = L_encoder(t_encoder_input)
emodel = Model(inputs=t_encoder_input, outputs=state_h)
emodel.summary()
emodel.load_weights('modelv1.h5', by_name=True)


# Decoder model
# Layers
t_decoder_input = Input(shape=(MAX_SEQ_LEN,VOCAB_SZ), name="DecoderInput")
t_decoder_state_input = Input(shape=(DecoderDIM,), name="DecoderStateInput")
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=TWrue, return_state=True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')

# tensors
t_decoder_outputs, t_decode_state = L_decoder(t_decoder_input, initial_state=t_decoder_state_input)
t_out_probs = TimeDistributed(L_Dense)(t_decoder_outputs)


dmodel = Model(inputs=[t_decoder_input, t_decoder_state_input], outputs=[t_out_probs, t_decode_state])
dmodel.summary()
dmodel.load_weights('modelv1.h5', by_name=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
EncoderInput (InputLayer)    (None, 80, 4096)          0         
_________________________________________________________________
Encoder (CuDNNGRU)           [(None, 256), (None, 256) 3343872   
Total params: 3,343,872
Trainable params: 3,343,872
Non-trainable params: 0
_________________________________________________________________
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
DecoderInput (InputLayer)       (None, 45, 6087)     0                                            
__________________________________________________________________________________________________
DecoderStateInput (InputLayer)  (None, 256)          0                                            
_________________________________

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

BOS = '<bos>'
EOS = '<eos>'

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = emodel.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = pad_sequences([[ DIC_word_index[BOS] ]], maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
    

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    t = 0
    while not stop_condition:
        target_seq_cat = to_categorical(target_seq, num_classes=VOCAB_SZ)

        output_tokens, h = dmodel.predict([target_seq_cat, states_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         sampled_token_index = np.argmax(output_tokens[0, 0, :])
        sampled_word = DIC_index_word[str(sampled_token_index)]
        decoded_sentence.append(sampled_word)
        print(decoded_sentence)

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == EOS or len(decoded_sentence) >= MAX_SEQ_LEN):
            stop_condition = True
        else:
            # Update the target sequence (of length 1).
            t += 1
            target_seq[0][t] = sampled_token_index

            # Update states
            states_value = h

    return decoded_sentence

In [12]:
test = encode_x[0].reshape(1, encode_x[0].shape[0],encode_x[0].shape[1])
decode_sequence(test)
print(len(test))

['avoiding']
['avoiding', 'avoiding']
['avoiding', 'avoiding', 'kill']
['avoiding', 'avoiding', 'kill', 'pealing']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched', 'lorie']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched', 'lorie', 'lorie']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched', 'lorie', 'lorie', 'avoiding']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched', 'lorie', 'lorie', 'avoiding', 'avoiding']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched', 'lorie', 'lorie', 'avoiding', 'avoiding', 'avoiding']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched', 'lorie', 'lorie', 'avoiding', 'avoiding', 'avoiding', 'pealing']
['avoiding', 'avoiding', 'kill', 'pealing', 'mincing', 'outstretched', 'lorie', 'lorie', 'avoiding', 'avoiding', 'a

In [None]:
i = DIC_word_index['amanplaysaguitar']
DIC_index_word[str(i)]