In [16]:
from tensorflow.keras.layers import Input, Softmax,Embedding, Dense, Dropout
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import Bidirectional, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import json
import re

In [12]:
# Path constants for the training split.
idfilename = 'training_data/id.txt'
datadirname = 'training_data/feat/'
labelfilename = 'training_label.json'

# Vocabulary: token string -> token id.
with open("vocab.json") as vocab_file:
    DIC_word_index = json.load(vocab_file)

# Reverse vocabulary: token id -> token string.
DIC_index_word = {index: word for word, index in DIC_word_index.items()}

# Pre-trained word vectors; the row/column counts become vocab_size/embedding_dim.
embedding_matrix = np.load("wv_matrix100d.npy")

In [17]:
# Hyper-parameters
# Every token sequence is padded or truncated to this many ids.
sent_len = 5
# Both sizes are read straight off the pre-trained embedding matrix.
vocab_size = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[1]


In [35]:
def pad_post_zero(a, length, pad_value=None):
    """Return copies of the sequences in ``a`` padded or truncated to ``length``.

    Sequences shorter than ``length`` are extended on the right with
    ``pad_value``; longer ones are cut down to their first ``length`` items.
    The input sequences are never modified, so calling this twice on the same
    data gives the same result.

    Args:
        a: iterable of token-id sequences (lists or tuples).
        length: number of items every returned sequence must have.
        pad_value: id appended to short sequences. When omitted, the id of the
            ``"<pad>"`` entry of ``DIC_word_index`` is used.

    Returns:
        A list of new lists, each with exactly ``length`` items.

    Raises:
        KeyError: if padding is needed, ``pad_value`` is omitted and the
            vocabulary has no ``"<pad>"`` entry.
    """
    ret = []
    for seq in a:
        # Slicing copies, so the caller's list is left untouched.
        row = list(seq[:length])
        missing = length - len(row)
        if missing > 0:
            if pad_value is None:
                # Resolved lazily: only needed when something is actually padded.
                pad_value = DIC_word_index["<pad>"]
            row.extend([pad_value] * missing)
        ret.append(row)
    return ret


# Encoder inputs: one id sequence per question line, prefixed with <bos>.
encode_x = []
with open('sel_conversation/question.txt', 'r') as f:
    for line in f:
        # Out-of-vocabulary words are mapped to the <unk> id.
        ids = [
            DIC_word_index[token] if token in DIC_word_index else DIC_word_index["<unk>"]
            for token in line.split()
        ]
        encode_x.append([DIC_word_index['<bos>']] + ids)

# Force every sequence to sent_len ids, then stack them into one array.
encode_x = np.asarray(pad_post_zero(encode_x, sent_len))



In [36]:
# Sanity check: the encoder and decoder inputs are fed to the model together,
# so they must contain the same number of samples.
# NOTE(review): decode_x is assigned in the cell that follows this one in the
# file, so on a fresh kernel this cell raises NameError unless that cell has
# been run first.
print(len(encode_x))
print(len(decode_x))

880624
880624


In [34]:
# Teacher-forcing pairs built from each answer line:
#   decode_x = <bos> + ids   (decoder input)
#   decode_y = ids + <eos>   (decoder target, shifted by one step)
decode_x = []
decode_y = []
with open('sel_conversation/answer.txt', 'r') as f:
    for line in f:
        # Out-of-vocabulary words are mapped to the <unk> id.
        ids = [
            DIC_word_index[token] if token in DIC_word_index else DIC_word_index['<unk>']
            for token in line.split()
        ]
        decode_x.append([DIC_word_index['<bos>']] + ids)
        decode_y.append(ids + [DIC_word_index['<eos>']])

# Force every sequence to sent_len ids and stack them into arrays.
# NOTE(review): truncation also removes the trailing <eos> from any target
# longer than sent_len - confirm that this is intended.
decode_x = np.asarray(pad_post_zero(decode_x, sent_len))
decode_y = np.asarray(pad_post_zero(decode_y, sent_len))

# Targets get a trailing axis of size 1 (one class id per time step).
decode_y = decode_y.reshape(decode_y.shape[0], decode_y.shape[1], 1)

In [19]:
# Size of the LSTM state vectors shared by the encoder and the decoder.
latent_dim = 256
#=================encoder====================#
# Encoder input: a fixed-length sequence of sent_len values that are looked up
# in the embedding layer below.
encoder_inputs = Input(shape=(sent_len,))
# One embedding layer, initialised from embedding_matrix and frozen
# (trainable=False); it is applied to both the encoder and the decoder inputs.
Embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)
encoder = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
encode_emb = Embedding_layer(encoder_inputs)
# encoder_outputs is not used by the cells visible here; only the final
# h/c states are passed on to the decoder.
encoder_outputs, state_h, state_c = encoder(encode_emb)
encoder_states = [state_h, state_c]

#=================decoder====================#
decoder_inputs = Input(shape=(sent_len,))
# return_state=True so the same layer can hand back its states when it is
# reused by the inference-time decoder model.
decoder_lstm = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
decode_emb = Embedding_layer(decoder_inputs)

# The decoder starts from the encoder's final states; its own states are
# discarded during training.
decoder_outputs,_ , _ = decoder_lstm(decode_emb, initial_state=encoder_states)

# Softmax over the whole vocabulary, applied to the decoder LSTM outputs.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
#=============================================
# Training model: [encoder ids, decoder ids] -> vocabulary distributions.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [20]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 5, 100)       9784100     input_2[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)        [(None, 5, 256), (No 366592      embedding_1[0][0]                
__________

In [38]:
# Adam optimiser with a learning rate of 0.001.
optimizer = Adam(lr=0.001)

# Sparse cross-entropy: the targets in decode_y are class ids, not one-hot rows.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy"],
)

# Train with teacher forcing; 10% of the samples are held out for validation.
model.fit(
    x=[encode_x, decode_x],
    y=decode_y,
    batch_size=16,
    epochs=50,
    validation_split=0.1,
)

Train on 792561 samples, validate on 88063 samples
Epoch 1/50
 12576/792561 [..............................] - ETA: 47:04 - loss: 5.4412 - acc: 0.2307

KeyboardInterrupt: 

In [None]:
#==============inference setup===================#
# Encoder model: token ids -> final LSTM [h, c] states that seed the decoder.
encoder_model = Model(encoder_inputs, encoder_states)

# The decoder's previous states are fed back in explicitly at every step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Step-wise decoder input: token ids with a free time dimension, so a single
# token can be fed per call instead of a full sent_len sequence.
decoder_step_inputs = Input(shape=(None,))
# The ids must pass through the shared embedding exactly as in the training
# graph; the LSTM works on embedded vectors, not on raw ids.
decoder_step_emb = Embedding_layer(decoder_step_inputs)

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_step_emb, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder model: [token ids, h, c] -> [vocabulary distribution, new h, new c].
decoder_model = Model(
    [decoder_step_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
def decode_sequence(input_seq, max_len=None):
    """Greedily decode a reply for one encoded input sequence.

    The encoder model turns ``input_seq`` into the decoder's initial states.
    The decoder model is then run one token at a time, starting from
    ``<bos>`` and feeding back the most probable token, until ``<eos>`` is
    produced or ``max_len`` tokens have been generated.

    Args:
        input_seq: batch of one input, passed unchanged to
            ``encoder_model.predict``.
        max_len: maximum number of tokens to generate. Defaults to
            ``sent_len``, the sequence length used for training.

    Returns:
        List of decoded token strings, without ``<bos>`` or ``<eos>``.
    """
    if max_len is None:
        max_len = sent_len

    # Encode the input as the decoder's initial [h, c] state.
    states_value = list(encoder_model.predict(input_seq))

    # The decoder input is a token id (it is embedded inside the model), so
    # feed a (1, 1) id array rather than a one-hot vector.
    target_seq = np.array([[DIC_word_index["<bos>"]]])

    decoded_sentence = []
    while len(decoded_sentence) < max_len:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # DIC_index_word is keyed by the ids themselves, not by their str().
        sampled_token = DIC_index_word[sampled_token_index]
        if sampled_token == '<eos>':
            break
        decoded_sentence.append(sampled_token)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return decoded_sentence

In [None]:
# Path constants, rebound to the testing split.
idfilename = 'testing_data/id.txt'
datadirname = 'testing_data/feat/'
labelfilename = 'testing_label.json'

# Load one .npy array per id listed in the id file.
# NOTE(review): these arrays are passed to decode_sequence, whose encoder was
# built for sent_len token ids - confirm the .npy files hold compatible input.
encode_x = []
video_id = []
with open(idfilename) as id_file:  # closed deterministically after reading
    for line in id_file:
        # Strip only the newline: slicing off the last character would
        # corrupt the final id when the file does not end with a newline.
        lb = line.rstrip('\n')
        if not lb:
            continue  # skip blank lines instead of loading ".npy"
        encode_x.append(np.load(datadirname + lb + ".npy"))
        video_id.append(lb)

# Decode every input into a space-joined sentence.
out_labels = []
for features in encode_x:
    sent = " ".join(decode_sequence(np.array([features])))
    print(sent)
    out_labels.append(sent)

In [None]:
# Write one "<id>,<decoded sentence>" line per loaded test input.
with open('MODELTEST_testing.txt', 'w') as f:
    f.writelines(
        video_id[i] + ',' + out_labels[i] + '\n'
        for i in range(len(encode_x))
    )

In [None]:
#====================TESTING===================#

idfilename_t = 'testing_data/id.txt'
datadirname_t = 'testing_data/feat/'

# Load the testing data: one .npy array per name listed in the id file.
# video_id_t maps each name to the position of its array in encode_x_t.
encode_x_t = []
video_id_t = {}
with open(idfilename_t) as id_file:  # closed deterministically after reading
    for line in id_file:
        # Strip only the newline: slicing off the last character would
        # corrupt the final name when the file does not end with a newline.
        video_name = line.rstrip('\n')
        if not video_name:
            continue  # skip blank lines instead of loading ".npy"
        x = np.load(datadirname_t + video_name + ".npy")
        # Index by list position so skipped lines cannot shift the mapping.
        video_id_t[video_name] = len(encode_x_t)
        encode_x_t.append(x)

In [None]:
#============predict the caption============#
# Decode every test input; each one is wrapped into a batch of size one.
OUTPUTS = [decode_sequence(np.array([features])) for features in encode_x_t]

In [None]:
#============write predictions to predict_label.txt============#
predict_label = []
with open('predict_label.txt', 'w') as f:
    for video_name, _id in video_id_t.items():
        # NOTE(review): the last two decoded tokens are dropped before the
        # sentence is closed with "." - confirm this is still intended.
        predict = " ".join(OUTPUTS[int(_id)][:-2]) + "."
        f.write("{},{}\n".format(video_name, predict))
    

In [None]:
# Display the name -> index mapping built while loading the test features.
video_id_t