### Imports

In [1]:
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D
from keras.layers import LSTM, Embedding, TimeDistributed, RepeatVector, Activation, Flatten
from keras.optimizers import RMSprop
import numpy as np
import pickle
import sys
import os
# pickle_utils lives in the sibling '../Anthony/' directory. Step into it only for
# the duration of the import and always return to the directory we started in --
# even if the import fails -- so that the relative paths used later in the
# notebook keep resolving and this cell can safely be re-run.
_start_dir = os.getcwd()
os.chdir('../Anthony/')
try:
    from pickle_utils import pickle_load, pickle_dump
finally:
    os.chdir(_start_dir)

Using TensorFlow backend.


### Load Embeddings

In [2]:
# Load the pre-computed GloVe objects from disk.
# `embeddings` is the embedding matrix (its first dimension is used as the
# vocabulary size and it initialises the caption Embedding layer).
# `glove_index_dict` is presumably the word -> row-index lookup -- TODO confirm.
# NOTE(review): pickle_load presumably unpickles the file; only load trusted files.
GLOVE_OBJS_PATH = "../Anthony/glove_objs.pkl"
glove_index_dict, embeddings = pickle_load(GLOVE_OBJS_PATH)

In [4]:
# input parameters
img_shape = (300, 300, 3)             # input shape handed to InceptionV3 and the image Input layer
vocab_size = embeddings.shape[0]      # one embedding row per vocabulary entry
# Read the word-vector length from the loaded matrix instead of hard-coding it,
# so it can never disagree with the weights given to the Embedding layer.
embedding_size = embeddings.shape[1]
maxlen = 20                           # maximum length of the caption in hidden state

hidden_units = embedding_size         # LSTM state size, kept equal to the word-vector length
en_shape = maxlen
de_shape = maxlen                     # number of time steps fed to the decoder

# hyper params
clip_norm = 1.0                       # gradient-norm clipping threshold passed to the optimizer

In [3]:
# Build the captioning model: a frozen InceptionV3 image encoder fused with the
# average label embedding produces the initial state of an LSTM caption decoder.

# create the base pre-trained model Inception V3 without final output layer;
# pooling='avg' makes it emit a single feature vector per image
cnnModel = InceptionV3(weights='imagenet',
                       include_top=False,        # this removes the final classification layer
                       input_shape=img_shape,
                       pooling='avg')

# freeze all convolutional InceptionV3 layers so only the newly added layers train
for layer in cnnModel.layers:
    layer.trainable = False

# make input layers for model definition
inputImg = keras.Input(shape=img_shape)             # input layer for CNN
embLabels = keras.Input(shape=(embedding_size,))    # input layer with the avg label word embedding

# image embedding
embImage = cnnModel(inputImg)

# image and word embeddings are concatenated (2048 + embedding_size dimensions)
# and projected down to hidden_units so the result can be used as an LSTM state
concat = keras.layers.Concatenate(axis=1)([embImage, embLabels])
totalEmbeddingLayer = Dense(hidden_units, activation='relu')(concat)

# softmax layer over the fused embedding; its output acts as per-dimension weights
softmax1 = Dense(hidden_units, activation='softmax', name='softmax_encoder')(totalEmbeddingLayer)

# multiply the softmax weights element-wise with the fused (image + label) embedding
# to get the weighted vector that is used as the initial state of the LSTM
lstm_hidden_input = keras.layers.Multiply()([totalEmbeddingLayer, softmax1])

encoder = keras.Model(inputs=[inputImg, embLabels], outputs=lstm_hidden_input)

initial_state_LSTM = encoder([inputImg, embLabels])

# Input to the decoder would be the caption sequence starting from <START> character and ending in <END> character
decoder_inputs = keras.Input(shape=(de_shape,))

# make a trainable embedding layer that is initialized with the GloVe embeddings
# but still allows training. input_length matches the decoder input length
# (the original referenced an undefined name `maxlend` here).
input_caption_emb = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                              input_length=de_shape,
                              weights=[embeddings], name='caption_embeddings',
                              trainable=True)

decoder_LSTM = LSTM(hidden_units, return_sequences=True, return_state=True)

# the encoder output seeds both the hidden state and the cell state of the LSTM
decoder_outputs, _, _ = decoder_LSTM(input_caption_emb(decoder_inputs),
                                     initial_state=[initial_state_LSTM, initial_state_LSTM])

# Apply a dense layer that has vocab_size(40000 ish) outputs which learns probability of each word when softmax is applied.
# TimeDistributed is a wrapper for applying the same function over all the time step outputs.
# Refer https://keras.io/layers/wrappers/
time_distributed = TimeDistributed(Dense(vocab_size, name='timedistributed_1'))
activation = Activation('softmax')
decoder_outputs = activation(time_distributed(decoder_outputs))

# Model groups layers into an object with training and inference features.
# https://www.tensorflow.org/api_docs/python/tf/keras/models/Model
model = Model(inputs=[inputImg, embLabels, decoder_inputs], outputs=decoder_outputs)
# NOTE(review): lr=0.1 is far above the RMSprop default (0.001) -- confirm this is intentional.
rmsprop = RMSprop(lr=0.1, clipnorm=clip_norm)
model.compile(loss='categorical_crossentropy', optimizer=rmsprop)


Instructions for updating:
Colocations handled automatically by placer.
