### Imports

In [1]:
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D
from keras.layers import LSTM, Embedding, TimeDistributed, RepeatVector, Activation, Flatten
from keras.optimizers import RMSprop
import numpy as np
import pickle
import sys
import os
os.chdir('../Anthony/')
from pickle_utils import pickle_load, pickle_dump
os.chdir('../lipika/')

Using TensorFlow backend.


### Load Embeddings

In [2]:
# Load the pickled GloVe objects from the sibling project directory.
#   - `embeddings`: matrix whose first axis is used as the vocabulary size and
#     which is passed as the initial weights of the caption Embedding layer.
#   - `glove_index_dict`: presumably the token -> row-index lookup for that
#     matrix -- TODO confirm against the script that wrote the pickle.
# NOTE(review): pickle_load presumably unpickles the file; unpickling can run
# arbitrary code, so only load this from a trusted source.
glove_index_dict, embeddings = pickle_load("../Anthony/glove_objs.pkl")

In [4]:
# Input parameters (all of these are passed to initial_model below).
img_shape = (300,300,3)             # shape of the image input given to keras.Input and InceptionV3
vocab_size = embeddings.shape[0]    # one vocabulary entry per row of the embedding matrix
embedding_size = 300                # width of each word vector; presumably equals embeddings.shape[1] -- TODO confirm
maxlen = 20                         # number of caption tokens fed to the decoder per sample

hidden_units = embedding_size       # LSTM state width; the encoder output seeds the LSTM state, so the two widths must agree

# Optimizer hyper-parameters (forwarded to RMSprop).
clip_norm = 1.0                     # gradient-norm clipping threshold (RMSprop clipnorm)
learning_rate = 0.1                 # NOTE(review): well above the Keras RMSprop default of 0.001 -- confirm this is intended

In [6]:
def initial_model(img_shape,
                  vocab_size,
                  embedding_size,
                  maxlen,
                  hidden_units,
                  clip_norm,
                  learning_rate):
    """Build the image encoder and the full encoder-decoder captioning model.

    Encoder
        A frozen, ImageNet-pretrained InceptionV3 (classification top removed,
        global average pooling) embeds the image. That vector is concatenated
        with the label embedding, projected by a ReLU Dense layer, and then
        multiplied element-wise by the output of a softmax Dense layer (a
        naive form of attention).

    Decoder
        Caption token ids pass through a trainable Embedding layer initialised
        from the module-level ``embeddings`` matrix, then through an LSTM whose
        hidden state and cell state are both initialised with the encoder
        output, then through a time-distributed Dense layer and a softmax over
        the vocabulary.

    Parameters
    ----------
    img_shape : tuple
        Shape of the image input (given to ``keras.Input`` and InceptionV3).
    vocab_size : int
        Number of vocabulary entries; size of the Embedding input and of the
        per-timestep output distribution.
    embedding_size : int
        Width of the word vectors. Also used as the width of the label
        embedding input (previously hard-coded to 300).
    maxlen : int
        Number of caption tokens per sample.
    hidden_units : int
        LSTM state width. The encoder's Dense layers use the same width
        (previously hard-coded to 300) because the encoder output is fed to
        the LSTM as its initial state.
    clip_norm : float
        Gradient-norm clipping value passed to RMSprop.
    learning_rate : float
        Learning rate passed to RMSprop.

    Returns
    -------
    (encoder, model)
        ``encoder`` maps ``[image, label_embedding]`` to the attention-weighted
        image embedding. ``model`` maps ``[image, label_embedding, caption]``
        to per-timestep softmax outputs and is compiled with categorical
        cross-entropy and RMSprop.

    Notes
    -----
    Reads the module-level ``embeddings`` matrix for the initial Embedding
    weights; it must be defined before this function is called.
    """

    # =============
    #   ENCODER
    # =============
    # Inputs:
    #     1. Image
    #     2. GloVe-ed label embedding (embedding_size,)
    input_img = keras.Input(shape=img_shape)
    label_emb = keras.Input(shape=(embedding_size,))

    # 1. Pretrained CNN with the classification layer peeled off.
    #    pooling='avg' collapses the final feature map to a single vector.
    cnnModel = InceptionV3(weights='imagenet',
                           include_top=False,
                           input_shape=img_shape,
                           pooling='avg')

    # Freeze every InceptionV3 layer so only the new layers are trained.
    for layer in cnnModel.layers:
        layer.trainable = False

    image_emb = cnnModel(input_img)

    # 2. Concatenate the image embedding with the label embedding.
    concat = keras.layers.Concatenate(axis=1)([image_emb, label_emb])

    # 3. Project to the LSTM state width; this is the full image embedding.
    full_img_embedding = Dense(hidden_units, activation='relu')(concat)

    # Naive attention: a softmax layer produces per-dimension weights that
    # rescale the image embedding element-wise.
    softmax_encoder = Dense(hidden_units,
                            activation='softmax',
                            name='softmax_encoder')(full_img_embedding)
    attention_encoder = keras.layers.Multiply()([full_img_embedding,
                                                 softmax_encoder])

    # ==== ENCODER MODEL ====
    encoder = Model(inputs=[input_img, label_emb], outputs=attention_encoder)

    # =============
    #   DECODER
    # =============
    # Inputs:
    #     1. Caption (tokenized), limited to maxlen tokens
    #     2. Encoder output, used as the LSTM initial state
    input_caption = keras.Input(shape=(maxlen,))
    initial_state_LSTM = encoder([input_img, label_emb])

    # 1. Embedding layer seeded with the GloVe matrix and left trainable.
    #    (The Keras-1 `W_regularizer=None` keyword was dropped: no regularizer
    #    is already the default.)
    decoder_embedding = Embedding(input_dim=vocab_size,
                                  output_dim=embedding_size,
                                  input_length=maxlen,
                                  weights=[embeddings],
                                  name='caption_embeddings',
                                  trainable=True)

    # 2. LSTM returning the full output sequence (one vector per timestep).
    decoder_LSTM = LSTM(hidden_units, return_sequences=True, return_state=True)

    decoder_embedding_outputs = decoder_embedding(input_caption)
    # The same encoder vector initialises both the hidden and the cell state;
    # the returned final states are not needed for training.
    decoder_LSTM_outputs, _, _ = decoder_LSTM(
        decoder_embedding_outputs,
        initial_state=[initial_state_LSTM,   # hidden state
                       initial_state_LSTM])  # cell state

    # 3. Apply the same Dense layer to every timestep, then
    # 4. softmax to get a distribution over the vocabulary per timestep.
    time_distributed = TimeDistributed(Dense(vocab_size, name='timedistributed_1'))
    activation = Activation('softmax')

    time_distributed_output = time_distributed(decoder_LSTM_outputs)
    decoder_outputs = activation(time_distributed_output)

    # ==============
    #   FULL MODEL
    # ==============
    model = Model(inputs=[input_img, label_emb, input_caption],
                  outputs=decoder_outputs)
    rmsprop = RMSprop(lr=learning_rate, clipnorm=clip_norm)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop)

    return encoder, model