In [17]:
import pandas as pd
import pickle
import pydot
from keras.layers import RepeatVector,Dense,Embedding,LSTM,TimeDistributed,concatenate,Activation,Dropout,Input,SpatialDropout1D,Bidirectional,BatchNormalization
from keras.models import Sequential,Model
from keras.utils import plot_model
from keras.optimizers import RMSprop
from ImageCaption import build_vocab,max_sequence
from keras.preprocessing import sequence
from keras import backend
import numpy as np

In [3]:
#load the pre-computed image features from disk
#NOTE: pickle can execute arbitrary code while loading - only open files you trust
with open("features1.pickle","rb") as feature_file:
    encoded_img=pickle.load(feature_file)

In [4]:
#read the tab-separated training dataset file and keep its rows as a numpy array
dataset=pd.read_csv("Flickr8k/captions/train_dataset.txt",sep="\t")
pd_frame=dataset.values
print(pd_frame.shape)

(30000, 2)


In [5]:
#gather the caption text (second column) of every row into one list
sentences=[row[1] for row in pd_frame]
len(sentences)

30000

In [6]:
#build the vocabulary and the cleaned captions from the raw sentences
#(the second argument is presumably a frequency threshold - confirm in ImageCaption.build_vocab)
vocabulary,clean_desc=build_vocab(sentences,1)
#one id more than the vocabulary holds
vocab_size=1+len(vocabulary)
vocab_size

7669

In [7]:
#each caption contributes (number of tokens - 1) to the total
total_count=sum(len(caption.split())-1 for caption in clean_desc)
total_count

383266

In [8]:
#vectorization: map every vocabulary word to an integer id and back
word2id={word:idx for idx,word in enumerate(vocabulary)}
id2word={idx:word for idx,word in enumerate(vocabulary)}

#id 0 is reserved for the "PAD" token, so the word that enumerate() placed at
#id 0 is moved to the first unused id. That id is derived from the vocabulary
#size instead of being hard-coded, so the mapping stays collision-free when
#the vocabulary changes.
relocated_id=len(vocabulary)
relocated_word=id2word.get(0)

word2id[relocated_word]=relocated_id
word2id["PAD"]=0

id2word[relocated_id]=relocated_word
id2word[0]="PAD"

len(word2id)

7669

In [14]:
#fix values for model training
embedding_size=300
max_length=max_sequence(clean_desc)
batch_size=180
#steps per epoch has to be a whole number of batches; round up so the
#samples left over after the last full batch are still covered
steps=(total_count+batch_size-1)//batch_size
steps

2130

In [10]:
def data_generator(encoded_img,frame,max_len,batch_size,word2id,captions=None,num_classes=None):
    """Endlessly yield training batches for next-word prediction.

    A caption with n token ids produces n-1 samples: for i = 1 .. n-1 the
    input is the first i ids and the target is id i.  Samples are buffered
    across captions and across passes over ``frame``; a batch is emitted as
    soon as ``batch_size`` samples are buffered, so leftovers at the end of
    a pass are carried into the next one.

    Parameters
    ----------
    encoded_img : indexable
        Looked up with ``frame[idx][0]``; presumably maps an image id to its
        feature vector (confirm against the feature-extraction step).
    frame : sequence of rows
        Only column 0 of each row is read here.
    max_len : int
        Width every partial sequence is zero-padded to (padding is appended).
    batch_size : int
        Number of buffered samples that triggers a yield.
    word2id : dict
        Token -> integer id.  A token missing from it raises ``KeyError``.
    captions : sequence of str, optional
        ``captions[idx]`` is the caption text for ``frame[idx]``.  Defaults
        to the notebook-level ``clean_desc``.
    num_classes : int, optional
        Width of the one-hot target rows.  Defaults to the notebook-level
        ``vocab_size``.

    Yields
    ------
    list
        ``[[images, partial_seq], next_words]`` where ``images`` is the
        array of image features, ``partial_seq`` the padded id sequences and
        ``next_words`` the one-hot encoded targets.

    Raises
    ------
    ValueError
        If a complete pass over ``frame`` produces no sample (empty frame or
        no caption with at least two tokens); without this check the
        generator would loop forever without ever yielding.
    """
    #fall back to the notebook globals so existing five-argument calls keep working
    if captions is None:
        captions=clean_desc
    if num_classes is None:
        num_classes=vocab_size

    images=[]
    partial_seq=[]
    next_words=[]
    while True:
        samples_this_pass=0
        for idx in range(len(frame)):
            encoded_image=encoded_img[frame[idx][0]]
            token_ids=[word2id[token] for token in captions[idx].split()]
            for pos in range(1,len(token_ids)):
                samples_this_pass+=1
                partial_seq.append(token_ids[:pos])
                next_words.append(token_ids[pos])
                images.append(encoded_image)

                if len(next_words)>=batch_size:
                    #zero-pad every partial sequence on the right up to max_len
                    padded_seq=sequence.pad_sequences(partial_seq,max_len,padding="post")

                    #one hot encoding for target words
                    targets=np.zeros([len(next_words),num_classes])
                    targets[np.arange(len(next_words)),next_words]=1

                    yield [[np.asarray(images),padded_seq],targets]

                    #start buffering the next batch
                    images=[]
                    partial_seq=[]
                    next_words=[]

        if samples_this_pass==0:
            raise ValueError("data_generator: no training samples could be built "
                             "(empty frame or no caption with at least two tokens)")
                

In [19]:
def create_model(embedding_size,max_len,vocab_size,image_dim=2048,text_units=256,decoder_units=512,learning_rate=0.0001):
    """Build, summarise and compile the two-input captioning model.

    The image branch projects the feature vector to ``embedding_size`` and
    repeats it ``max_len`` times; the text branch embeds the partial caption
    and runs it through an LSTM that returns the full sequence.  Both
    sequences are concatenated and decoded by two stacked LSTMs followed by
    a softmax over the vocabulary.

    Parameters
    ----------
    embedding_size : int
        Size of the word embedding and of the projected image vector.
    max_len : int
        Length of the (padded) partial-caption input.
    vocab_size : int
        Number of word ids; sets the embedding input size and the width of
        the softmax output.
    image_dim : int, optional
        Length of the image feature vector (default 2048).
    text_units : int, optional
        Units of the text-encoder LSTM (default 256).
    decoder_units : int, optional
        Units of each of the two decoder LSTMs (default 512).
    learning_rate : float, optional
        RMSprop learning rate (default 0.0001).

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy and accuracy metric.
        ``model.summary()`` is printed as a side effect.

    The defaults reproduce the originally hard-coded architecture.
    """
    #image encoder
    image_input=Input(shape=(image_dim,))
    image_vec=Dense(embedding_size, activation='relu')(image_input)
    #repeat the image vector once per time step so it can be concatenated with the text sequence
    image_seq=RepeatVector(max_len)(image_vec)

    #text encoder
    caption_input=Input(shape=(max_len,))
    word_emb=Embedding(vocab_size, embedding_size)(caption_input)
    text_seq=LSTM(text_units,return_sequences=True)(word_emb)

    #decoder
    merged=concatenate([image_seq,text_seq])
    x=LSTM(decoder_units,return_sequences=True)(merged)
    x=LSTM(decoder_units,return_sequences=False)(x)
    x=Dense(vocab_size)(x)
    out=Activation('softmax')(x)

    model=Model(inputs=[image_input,caption_input], outputs=out)
    model.summary()

    model.compile(loss="categorical_crossentropy",optimizer=RMSprop(lr=learning_rate),metrics=["accuracy"])
    return model




In [20]:
#build the captioning model with the training constants fixed above
model=create_model(
    embedding_size=embedding_size,
    max_len=max_length,
    vocab_size=vocab_size,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 39)           0                                            
__________________________________________________________________________________________________
dense_10 (Dense)                (None, 300)          614700      input_7[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 39, 300)      2300700     input_8[0][0]                    
__________________________________________________________________________________________________
repeat_vec

In [0]:
#restore previously saved weights into the freshly built model
weights_file="180batch_60ep_newmodel_300.h5"
model.load_weights(weights_file)