In [1]:
from tensorflow.keras.layers import Input, Softmax, Dense
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import json
import re

In [2]:
idfilename = 'training_data/id.txt'
datadirname = 'training_data/feat/'
labelfilename = 'training_label.json'

In [3]:
#load dictionary

with open("DIC_word_index.json") as f:
    DIC_word_index = json.load(f)
    
with open("DIC_index_word.json") as f:
    DIC_index_word = json.load(f)

#DIC_index_word = {}

In [4]:
# TODO: implement Sent2Seq
# hyperparameter: min count > 3 (discard terms with freq <= 3)
def Sent2Seq(sent):
    print(sent)
    sent = re.sub(r'[^\w\s\<\>\-]','',sent)
    tokens = sent.lower().split()
    ret = []
    print(tokens)
    for word in tokens:
        #print(word)
        ret.append(DIC_word_index[word])
    #input("")
    return ret

In [5]:

#add a space for split function easy to handle
BOS = "<bos> "
EOS = " <eos>"

# loading training data
encode_x = []
video_id = {}
for i,lb in enumerate(open(idfilename)):
    #lb contains '\n', therefore lb[:-1]
    lb = lb[:-1]
    x = np.load(datadirname + lb + ".npy")
    encode_x.append(x)
    video_id[lb] = i
    
TRAIN_SZ = len(encode_x)
decode_x = [[]]*TRAIN_SZ
decode_y = [[]]*TRAIN_SZ

MAX_SEQ_LEN = 0;

# loading decoder data
rawlabels = json.load(open(labelfilename, 'r'))
for data in rawlabels:
    
    index = video_id[data['id']]
    print(index)
    sent =  data['caption'][0] # select one sentence for now
    # TODO: implement Sent2Seq
    decode_x[index] = Sent2Seq(BOS+sent)
    decode_y[index] = Sent2Seq(sent+EOS)
    if(len(decode_x[index]) > MAX_SEQ_LEN):
        MAX_SEQ_LEN = len(decode_x[index])


0
<bos> A woman goes under a horse.
['<bos>', 'a', 'woman', 'goes', 'under', 'a', 'horse']
A woman goes under a horse. <eos>
['a', 'woman', 'goes', 'under', 'a', 'horse', '<eos>']
1
<bos> A man slicing butter into a bowl.
['<bos>', 'a', 'man', 'slicing', 'butter', 'into', 'a', 'bowl']
A man slicing butter into a bowl. <eos>
['a', 'man', 'slicing', 'butter', 'into', 'a', 'bowl', '<eos>']
2
<bos> A raccoon-like animal is hanging upside down from the back of a chair and eating something white.
['<bos>', 'a', 'raccoon-like', 'animal', 'is', 'hanging', 'upside', 'down', 'from', 'the', 'back', 'of', 'a', 'chair', 'and', 'eating', 'something', 'white']
A raccoon-like animal is hanging upside down from the back of a chair and eating something white. <eos>
['a', 'raccoon-like', 'animal', 'is', 'hanging', 'upside', 'down', 'from', 'the', 'back', 'of', 'a', 'chair', 'and', 'eating', 'something', 'white', '<eos>']
3
<bos> A man is putting pepper into a bowl.
['<bos>', 'a', 'man', 'is', 'putting', 

A man is playing the trumpet. <eos>
['a', 'man', 'is', 'playing', 'the', 'trumpet', '<eos>']
593
<bos> The man put a necklace around a woman's neck during a ceremony.
['<bos>', 'the', 'man', 'put', 'a', 'necklace', 'around', 'a', 'womans', 'neck', 'during', 'a', 'ceremony']
The man put a necklace around a woman's neck during a ceremony. <eos>
['the', 'man', 'put', 'a', 'necklace', 'around', 'a', 'womans', 'neck', 'during', 'a', 'ceremony', '<eos>']
594
<bos> A man is slicing carrots.
['<bos>', 'a', 'man', 'is', 'slicing', 'carrots']
A man is slicing carrots. <eos>
['a', 'man', 'is', 'slicing', 'carrots', '<eos>']
595
<bos> A giant prepares his tea with a girl in it.
['<bos>', 'a', 'giant', 'prepares', 'his', 'tea', 'with', 'a', 'girl', 'in', 'it']
A giant prepares his tea with a girl in it. <eos>
['a', 'giant', 'prepares', 'his', 'tea', 'with', 'a', 'girl', 'in', 'it', '<eos>']
596
<bos> Three young men run, jump, and kick off of a Coke machine.
['<bos>', 'three', 'young', 'men', 'run'

A baby rhino is walking behind an adult rhino. <eos>
['a', 'baby', 'rhino', 'is', 'walking', 'behind', 'an', 'adult', 'rhino', '<eos>']
1293
<bos> A yellow train is speeding down a track.
['<bos>', 'a', 'yellow', 'train', 'is', 'speeding', 'down', 'a', 'track']
A yellow train is speeding down a track. <eos>
['a', 'yellow', 'train', 'is', 'speeding', 'down', 'a', 'track', '<eos>']
1294
<bos> A man breaks a slab of concrete in half that is lying on top of a prone man.
['<bos>', 'a', 'man', 'breaks', 'a', 'slab', 'of', 'concrete', 'in', 'half', 'that', 'is', 'lying', 'on', 'top', 'of', 'a', 'prone', 'man']
A man breaks a slab of concrete in half that is lying on top of a prone man. <eos>
['a', 'man', 'breaks', 'a', 'slab', 'of', 'concrete', 'in', 'half', 'that', 'is', 'lying', 'on', 'top', 'of', 'a', 'prone', 'man', '<eos>']
1295
<bos> A woman is lying on top of a man at the bow of a canoe while two other people are paddling with their hands.
['<bos>', 'a', 'woman', 'is', 'lying', 'on', '

In [6]:
VOCAB_SZ = len(DIC_word_index) # maybe? need statistics

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# data1 = pad_sequences(data1, maxlen=MAX_IN_LEN, padding='pre', truncating='pre')

#decode_x will be the same len as decode_y
print(len(decode_x))
print(len(decode_x[0]))
decode_x = pad_sequences(decode_x, maxlen=MAX_SEQ_LEN, padding='pre', truncating='pre')
decode_y = pad_sequences(decode_y, maxlen=MAX_SEQ_LEN, padding='pre', truncating='pre')

#print(decode_x)
#print(decode_y)
print("MAX_X_LEN:%d"%(MAX_SEQ_LEN))
# decode_x = decode_x.reshape(decode_x.shape[0],decode_x.shape[1], 1)
decode_y = decode_y.reshape(decode_y.shape[0],decode_y.shape[1], 1)
decode_x = to_categorical(decode_x, num_classes=VOCAB_SZ)
# decode_y = to_categorical(decode_y, num_classes=VOCAB_SZ)
print(decode_x.shape)
print(decode_y.shape)
print(TRAIN_SZ)
#input("")

1450
7
MAX_X_LEN:41
(1450, 41, 6093)
(1450, 41, 1)
1450


In [8]:
# Using sparse_categorical_crossentropy, we only need to pass integers as input to decoder.
EncoderDIM = 128
DecoderDIM = 128

#decode_x = decode_x[:MAX_SEQ_LEN,:]
#decode_y = decode_y[:MAX_SEQ_LEN,:]

# Layers
t_encoder_input = Input(shape=(80, 4096), name="EncoderInput")
t_decoder_input = Input(shape=(MAX_SEQ_LEN,VOCAB_SZ), name="DecoderInput")
L_encoder = CuDNNGRU(EncoderDIM, return_state=True, name='Encoder')
L_decoder = CuDNNGRU(DecoderDIM, return_sequences=True, name='Decoder')
L_Dense = Dense(VOCAB_SZ, name="Dense", activation='softmax')
# L_SM = Softmax(axis=-1, name="Softmax")

# tensors
t_encoder_outputs, state_h = L_encoder(t_encoder_input)
t_decoder_outputs = L_decoder(t_decoder_input, initial_state=state_h)
t_out_probs = TimeDistributed(L_Dense)(t_decoder_outputs)


model = Model(inputs=[t_encoder_input, t_decoder_input], outputs=t_out_probs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
EncoderInput (InputLayer)       (None, 80, 4096)     0                                            
__________________________________________________________________________________________________
DecoderInput (InputLayer)       (None, 41, 6093)     0                                            
__________________________________________________________________________________________________
Encoder (CuDNNGRU)              [(None, 128), (None, 1622784     EncoderInput[0][0]               
__________________________________________________________________________________________________
Decoder (CuDNNGRU)              (None, 41, 128)      2389632     DecoderInput[0][0]               
                                                                 Encoder[0][1]                    
__________

In [None]:

optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(x=[encode_x, decode_x], y=decode_y, batch_size=1, epochs=200) #callbacks=[]

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200