### Imports

In [1]:
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D
from keras.layers import LSTM, Embedding, TimeDistributed, RepeatVector, Activation, Flatten
from keras.optimizers import RMSprop

from keras import backend as K

import numpy as np
import pickle
import sys
import os
# Make Anthony's helper module importable without touching the working
# directory. The previous chdir('../Anthony/') ... chdir('../lipika/') round-trip
# left the kernel stranded in ../Anthony/ whenever the import raised, which broke
# every relative path used afterwards.
# Assumes the notebook is started from its own folder (a sibling of ../Anthony/).
_anthony_dir = os.path.abspath('../Anthony/')
if _anthony_dir not in sys.path:
    # Front of the path, so the lookup priority matches importing from the cwd.
    sys.path.insert(0, _anthony_dir)
from pickle_utils import pickle_load, pickle_dump

Using TensorFlow backend.


## Encoder 
### CNN (Inception V3 w/o output layer)

- We removed the final classification layer of Inception V3 (a softmax over the ImageNet classes) by passing `include_top=False`
- It might be worth adding this layer back to make use of its pre-trained weights

In [2]:
# Load InceptionV3 pre-trained on ImageNet as the image feature extractor.
# include_top=False leaves out the final classification layer, and
# pooling='avg' makes the network emit one pooled vector per image
# (the encoder summary further down reports it as (None, 2048)).
cnnModel = InceptionV3(
    include_top=False,
    weights='imagenet',
    input_shape=(300, 300, 3),
    pooling='avg',
)

Instructions for updating:
Colocations handled automatically by placer.


In [3]:
# Print the layer-by-layer architecture of the headless InceptionV3 network.
cnnModel.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 149, 149, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 149, 149, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
conv2d_2 (

In [4]:
# Keep the pre-trained ImageNet weights fixed: flag every InceptionV3 layer as
# non-trainable so that only the layers added on top of it are updated.
for inception_layer in cnnModel.layers:
    inception_layer.trainable = False

### Image Caption Embeddings with GloVe

- In theory we could have done this using Keras but it would have increased training time
- Use the output of the getAvgEmbeddings as input to the upcoming concatenate layer


In [5]:
# NOTE: disabled, kept for reference only. The notebook loads the pre-built
# GloVe objects with pickle_load("../Anthony/glove_objs.pkl") instead.
# def loadGloVeMapping(gloveFile):
#     '''
#     Loads a GloVe text file into a dictionary
    
#     Parameters
#     ==========
#     - gloveFile: filename as a string e.g. 'model.txt'
  
#     Returns
#     =======
#     - Tuple (model, number of words): model is a dictionary with the word as a
#       string as the key and the 300 dimensional embedding as the value
    
#     '''
#     NOTE(review): f is never closed; use `with open(...)` if this is re-enabled.
#     f = open(gloveFile,'r')
#     model = {}
#     for line in f:
#         splitLine = line.split()
#         word = splitLine[0]
#         embedding = np.array([float(val) for val in splitLine[1:]])
#         model[word] = embedding
#     print("Done.",len(model)," words loaded!")
#     return model, len(model)

In [6]:
# ## Disabled: loading the huge GloVe txt file, this takes about 2 min
# NOTE(review): the file name below starts with a space - confirm that this is
# intended before re-enabling the cell.
# gloveFileName = ' glove.txt'
# embeddingsMap, vocab_size = loadGloVeMapping(gloveFileName)

# # vocab_size = 1917494

In [7]:
# # =========
# # Make embeddings matrix - TEMP!!!!! (disabled)
# # =========
# NOTE(review): np.full(..., fill_value=1) without an explicit dtype creates an
# integer array, so the float GloVe vectors would be truncated when assigned
# row by row. Pass dtype=float if this cell is ever re-enabled.
# embeddings = np.full((vocab_size, 300), fill_value=1)
# for key, value in enumerate(embeddingsMap.values()):
#     embeddings[key,:]=(value)

In [8]:
# NOTE: disabled, kept for reference only.
# def getAvgEmbeddings(label, emap):
#     '''
#     Takes an image label and returns the average embedding over all words in the label
    
#     Parameters
#     ==========
#     - label: image label as a list of strings, e.g. ['success', 'kid']
#     - emap: embeddings map from GloVe (word -> embedding vector)
    
#     Returns
#     =======
#     numpy array with the element-wise mean of the embeddings of all words in the image label
#     '''
#     NOTE(review): presumably emap is a plain dict, in which case a word missing
#     from it raises KeyError - confirm how out-of-vocabulary words should be handled.
#     embeddings = [emap[word] for word in label]
#     return np.mean(np.array(embeddings), axis=0)

In [9]:
# ### Sanity check (disabled): average the embeddings of a sample label and
# ### inspect the shape of the result
# memeLabeleg = ['success', 'kid']

# check = getAvgEmbeddings(memeLabeleg, embeddingsMap)
# check.shape

In [10]:
# Load the cached GloVe objects instead of re-parsing the raw text file.
# The pickle unpacks into two objects: glove_index_dict (presumably a
# word -> row-index lookup; verify against the code that wrote the pickle)
# and the embeddings matrix.
# NOTE(review): pickle_load is a project helper; if it wraps pickle.load, only
# use it on trusted files.
glove_index_dict, embeddings = pickle_load("../Anthony/glove_objs.pkl")

In [21]:
# Keep only the first 40,000 rows of the GloVe matrix (the decoder's Embedding
# layer is built with vocab_size = 40000), then display the resulting shape.
embeddings = embeddings[:40000]
embeddings.shape

(40000, 300)

In [22]:
# Number of entries in the GloVe index dictionary. It is far larger than the
# 40,000 embedding rows kept above.
# NOTE(review): confirm that caption tokens are mapped into the truncated
# vocabulary before they reach the Embedding layer.
len(glove_index_dict)

1917494


### Use Dense layer with (2048 + 300) as input

In [13]:
# Encoder inputs.
# Raw image that is fed to the frozen InceptionV3 feature extractor.
inputImg = keras.Input(shape=(300, 300, 3))
# Averaged word embedding of the image label.
embLabels = keras.Input(shape=(300,))

# Image embedding produced by the CNN.
embImage = cnnModel(inputImg)

# Join the image and label embeddings along the feature axis, giving a
# 2048 + 300 = 2348 dimensional vector.
merge_layer = keras.layers.Concatenate(axis=1)
concat = merge_layer([embImage, embLabels])

# Project the joint vector down to 300 units with a ReLU Dense layer.
projection_layer = Dense(300, activation='relu')
totalEmbeddingLayer = projection_layer(concat)

In [14]:
# Disabled: earlier encoder definition that stopped at the Dense projection.
# The encoder actually used is built further down from lstm_hidden_input.
# encoder = keras.Model(inputs=[inputImg, embLabels], outputs=totalEmbeddingLayer)
# encoder.summary()

In [15]:
# Dense layer with a softmax activation on top of the joint embedding; it
# produces 300 probabilities, one per embedding dimension.
softmax_layer = Dense(300, activation='softmax', name='softmax_encoder')
softmax1 = softmax_layer(totalEmbeddingLayer)

# Element-wise product of the joint embedding and the softmax probabilities.
# This probability-weighted vector is the input handed to the LSTM.
weighting_layer = keras.layers.Multiply()
lstm_hidden_input = weighting_layer([totalEmbeddingLayer, softmax1])

In [16]:
# Wrap the encoder graph (image + label embedding -> weighted joint vector) in
# its own Model so it can be inspected and called as a single unit.
encoder = keras.Model(
    inputs=[inputImg, embLabels],
    outputs=lstm_hidden_input,
)

# Calling the encoder on its own inputs yields the tensor that is later passed
# to the decoder LSTM as its initial state.
initial_state_LSTM = encoder([inputImg, embLabels])

In [17]:
# Print the encoder architecture (InceptionV3 appears as a single nested layer).
encoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
inception_v3 (Model)            (None, 2048)         21802784    input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 2348)         0           inception_v3[1][0]               
                                                                 input_3[0][0]                    
__________

----

## Decoder

### LSTM

In [18]:
# Decoder hyper-parameters.
vocab_size = 40000      # number of words covered by the embedding / output layer
embedding_size = 300    # dimensionality of each word vector
hidden_units = 300      # LSTM size; same as the word-vector length (embedding size)

maxlenh = 20            # maximum length of the caption in hidden state
maxlend = 20            # maximum length of the caption in the decoder
maxlen = maxlend + maxlenh

# NOTE(review): en_shape takes the decoder length and de_shape the hidden-state
# length. Both are 20 today, so this has no effect - confirm the pairing is intended.
en_shape = maxlend
de_shape = maxlenh

In [23]:
# Decoder input: the caption as a sequence of de_shape word indices, starting
# with the <START> token and ending with the <END> token.
decoder_inputs = keras.Input(shape=(de_shape,))

# Embedding layer initialised from the GloVe matrix and left trainable so the
# word vectors can be fine-tuned during training.
# - input_length uses de_shape so it always agrees with decoder_inputs (it was
#   maxlend before, which only matched because both lengths are 20).
# - The Keras 1 keyword W_regularizer=None is dropped: None is already the
#   default, and the Keras 2 name of the argument is embeddings_regularizer.
input_caption_emb = Embedding(input_dim=vocab_size,
                              output_dim=embedding_size,
                              input_length=de_shape,
                              weights=[embeddings],
                              trainable=True,
                              name='caption_embeddings')

# return_sequences=True yields one output per time step; return_state=True also
# returns the final hidden and cell states, which are discarded below.
decoder_LSTM = LSTM(hidden_units, return_sequences=True, return_state=True)

# The encoder output seeds both the hidden state and the cell state of the LSTM.
decoder_outputs, _, _ = decoder_LSTM(
    input_caption_emb(decoder_inputs),
    initial_state=[initial_state_LSTM, initial_state_LSTM])

  


In [24]:
# Map every LSTM time step to a probability distribution over the vocabulary.
# TimeDistributed applies the same Dense(vocab_size) layer to each time step
# output (see https://keras.io/layers/wrappers/); the softmax Activation then
# turns the vocab_size (40000) scores of each step into word probabilities.
# NOTE(review): decoder_outputs is overwritten here, so re-running this cell on
# its own stacks a second Dense + softmax on the previous result. Re-run the
# LSTM cell first.
time_distributed = TimeDistributed(Dense(vocab_size, name='timedistributed_1'))
activation = Activation('softmax')
step_scores = time_distributed(decoder_outputs)
decoder_outputs = activation(step_scores)

In [25]:
# Display the vocabulary size used by the Embedding and output layers.
vocab_size

40000

---
## Final model build

In [26]:
# Assemble the end-to-end model: image, averaged label embedding and caption
# tokens go in; per-time-step word probabilities come out.
# Model groups layers into an object with training and inference features:
# https://www.tensorflow.org/api_docs/python/tf/keras/models/Model
model = Model(
    inputs=[inputImg, embLabels, decoder_inputs],
    outputs=decoder_outputs,
)

# NOTE(review): lr=0.1 is far above the Keras default for RMSprop (0.001) and
# gradient clipping is commented out - confirm that this is intended.
rmsprop = RMSprop(lr=0.1)  # ,clipnorm=clip_norm)
model.compile(optimizer=rmsprop, loss='categorical_crossentropy')

In [27]:
# Print the architecture of the full encoder-decoder model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
caption_embeddings (Embedding)  (None, 20, 300)      12000000    input_5[0][0]                    
__________________________________________________________________________________________________
model_1 (M