### Imports

In [23]:
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D
from keras.layers import LSTM, Embedding, TimeDistributed, RepeatVector, Activation, Flatten

from keras import backend as K

import numpy as np
import pickle


## Encoder 
### CNN (Inception V3 w/o output layer)

- We removed the final output layer that uses sigmoid activation in Inception V3 for classification
- Might be worth adding this back to use the trained weights 

In [3]:
# create the base pre-trained model Inception V3 without final output layer
baseModel = InceptionV3(weights='imagenet', include_top=False, input_shape=(300,300,3), pooling = 'avg')

In [4]:
# add a global spatial average pooling layer
x = baseModel.output
# let's add a fully-connected layer
imgEmbeddingLayer = Dense(300, activation='relu')(x)

In [5]:
# freeze all convolutional InceptionV3 layers
for layer in baseModel.layers:
    layer.trainable = False

In [6]:
# compile intermediate model to show CNN
cnnModel = Model(inputs = baseModel.input, outputs = imgEmbeddingLayer)
cnnModel.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 149, 149, 32) 96          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 149, 149, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
conv2d_2 (

### Image Caption Embeddings with GloVe

- In theory we could have done this using Keras but it would have increased training time
- Use the output of the getAvgEmbeddings as input to the upcoming concatenate layer


In [7]:
def loadGloVeMapping(gloveFile):
    '''
    Loads GloVe model into a dictionary
    
    Parameters
    ==========
    - Filename as a string e.g. 'model.txt'
  
    Returns
    =======
    - Model as a dictionary with word as a string as the key and the 300 dimensional embedding as the value
    
    '''
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [8]:
## Loading the huge txt file, this takes about 2 min
gloveFileName = ' glove.txt'
embeddingsMap = loadGloVeMapping(gloveFileName)

Done. 1917494  words loaded!


In [9]:
def getAvgEmbeddings(label, emap):
    '''
    Takes an image label and returns the average embeddings for all words in the label
    
    Parameters
    ==========
    - label: image label as a list of strings, e.g. ['success', 'kid']
    - emap: embeddings map from GloVe
    
    Returns
    =======
    numpy array with average embeddings for all words in the image label
    '''
    embeddings = [emap[word] for word in label]
    return np.mean(np.array(embeddings), axis=0)

In [10]:
### CHECK THAT EMBEDDING SHIT WORKS
memeLabeleg = ['success', 'kid']

check = getAvgEmbeddings(memeLabeleg, embeddingsMap)
check.shape

(300,)

### Get combined embedding layer in Keras


- Given how we're doing this, we need to write our own data generator. Sample code coming up.

In [11]:
# make input layers for model definition
inputImg = keras.Input(shape=(300, 300, 3))    # input layer for CNN
embLabels = keras.Input(shape=(300,))        # input layer with the avg label word embedding

# image embedding
imgFeatures = cnnModel(inputImg)

# image and word embeddings concatenation to build a 600 dimensional layer
concat = keras.layers.Concatenate(axis=1)([imgFeatures, embLabels])

In [21]:
concat.get_shape


<bound method Tensor.get_shape of <tf.Tensor 'concatenate_3/concat:0' shape=(?, 600) dtype=float32>>

### Attention based LSTM  to finish up encoder

> Bench this for now 

In [None]:
# # attention block from 
# # https://github.com/philipperemy/keras-attention-mechanism/blob/master/attention_lstm.py#L15
# def attention_3d_block(inputs):
#     # inputs.shape = (batch_size, time_steps, input_dim)
#     input_dim = int(inputs.shape[2])
#     a = Permute((2, 1))(inputs)
#     a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
#     a = Dense(TIME_STEPS, activation='softmax')(a)
#     if SINGLE_ATTENTION_VECTOR:
#         a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
#         a = RepeatVector(input_dim)(a)
#     a_probs = Permute((2, 1), name='attention_vec')(a)
#     output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
#     return output_attention_mul

----

## Decoder
### LSTM

In [None]:
maxLengthCaption = 100
decoder = RepeatVector(maxLengthCaption)(concat)
decoder = Embedding()

In [None]:
# https://github.com/anuragmishracse/caption_generator/blob/master/caption_generator/caption_generator.py

# def create_model(self, ret_model = False):
#     #base_model = VGG16(weights='imagenet', include_top=False, input_shape = (224, 224, 3))
#     #base_model.trainable=False
#     image_model = Sequential()
#     #image_model.add(base_model)
#     #image_model.add(Flatten())
#     image_model.add(Dense(EMBEDDING_DIM, input_dim = 4096, activation='relu'))

#     image_model.add(RepeatVector(self.max_cap_len))

#     lang_model = Sequential()
#     lang_model.add(Embedding(self.vocab_size, 256, input_length=self.max_cap_len))
#     lang_model.add(LSTM(256,return_sequences=True))
#     lang_model.add(TimeDistributed(Dense(EMBEDDING_DIM)))

#     model = Sequential()
#     model.add(Merge([image_model, lang_model], mode='concat'))
#     model.add(LSTM(1000,return_sequences=False))
#     model.add(Dense(self.vocab_size))
#     model.add(Activation('softmax'))

#     print "Model created!"

#     if(ret_model==True):
#         return model

#     model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
#     return model

---
## Final model build

In [28]:
fullModel = keras.Model(inputs=[inputImg, embLabels], outputs=concat)
fullModel.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, 300)          22417484    input_9[0][0]                    
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
concatenate_9 (Concatenate)     (None, 600)          0           model_2[3][0]                    
                                                                 input_10[0][0]                   
Total para

In [30]:
# compile the model (should be done *after* setting layers to non-trainable)
fullModel.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [31]:
# use generator to fit model


In [26]:
# # train the model on the new data for a few epochs
# model.fit_generator(...)

# # at this point, the top layers are well trained and we can start fine-tuning
# # convolutional layers from inception V3. We will freeze the bottom N layers
# # and train the remaining top layers.

# # let's visualize layer names and layer indices to see how many layers
# # we should freeze:
# for i, layer in enumerate(base_model.layers):
#    print(i, layer.name)

# # we chose to train the top 2 inception blocks, i.e. we will freeze
# # the first 249 layers and unfreeze the rest:
# for layer in model.layers[:249]:
#    layer.trainable = False
# for layer in model.layers[249:]:
#    layer.trainable = True

# # we need to recompile the model for these modifications to take effect
# # we use SGD with a low learning rate
# from keras.optimizers import SGD
# model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy')

# # we train our model again (this time fine-tuning the top 2 inception blocks
# # alongside the top Dense layers
# model.fit_generator(...)