<hr style='height:3pt'>

# Meme Caption Generator

<hr style='height:3pt'>

In [1]:
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D
from keras.layers import LSTM, Embedding, TimeDistributed, RepeatVector, Activation, Flatten
from keras.optimizers import RMSprop
from keras.utils import to_categorical
import numpy as np
import pickle
import sys
import os
import cv2
from pickle_utils import pickle_load, pickle_dump
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


# Building Model Architecture

In [2]:
# Load the pre-processed artefacts: embedding matrix, index<->word lookups and
# the captions dataframe.
# NOTE(review): this is a pickle file - only load it from a trusted source.
embeddings, idx2word, word2idx, captions = pickle_load("small_processed_data.pkl")

# Image labels are used as dictionary keys later on, so trim surrounding spaces.
captions['image'] = captions['image'].map(lambda label: label.strip(' '))

In [3]:
# input parameters
img_shape = (300,300,3)             # shape of the image fed to the CNN (channels last)
vocab_size = embeddings.shape[0]    # one row of the embedding matrix per vocabulary entry
embedding_size = 300                # length of the word vectors
maxlen = 30                         # number of time steps in a (padded) caption
batch_size = 32
hidden_units = embedding_size       # LSTM state size, kept equal to the word-vector length

# hyper params (both are passed to the RMSprop optimizer in build_model)
clip_norm = 1.0                     # gradient-norm clipping value
learning_rate = 0.1

In [4]:
def build_model(img_shape, vocab_size, embedding_size, maxlen, hidden_units, clip_norm, learning_rate):
    '''
    Builds and compiles the (image, label) -> caption model.

    Inputs:
        img_shape - shape of the input image, e.g. (300, 300, 3)
        vocab_size - number of entries in the vocabulary
        embedding_size - length of the word vectors; also the length of the
                         averaged label embedding fed to the encoder
        maxlen - number of time steps in the (padded) caption
        hidden_units - size of the LSTM state; the encoder output has the same
                       size because it is used as the initial LSTM state
        clip_norm - gradient norm clipping value passed to RMSprop
        learning_rate - learning rate passed to RMSprop

    Outputs:
        model - compiled keras Model taking [image, label embedding, caption]
                and returning per-time-step probabilities over the vocabulary,
                i.e. an output of size (maxlen, vocab_size) per sample

    Note: the caption embedding layer is initialised from the notebook-level
    `embeddings` matrix.
    '''
    # =============
    #   ENCODER
    # =============
    # Inputs:
    #     1. Image (img_shape)
    #     2. GloVe-ed label embedding (embedding_size,)
    # Model:
    #     3. Pretrained CNN with classification layer peeled off
    #           - Output size: (2048,)
    #     4. Concatenate with the label embedding
    #           - Output size: (2048 + embedding_size,)
    #     5. MLP: Dense layer with hidden_units nodes
    #           - Output size: (hidden_units,) <-- This is the image embedding

    # 1. Image Input
    input_img = keras.Input(shape=img_shape, name='image_input')

    # 2. Label Embedding
    label_emb = keras.Input(shape=(embedding_size,), name='image_label_input')

    # 3. Define Pretrained CNN - Inception V3
    cnnModel = InceptionV3(weights='imagenet',
                           include_top=False,        # this removes the final dense layer
                           input_shape=img_shape,
                           pooling = 'avg')

    # freeze all convolutional InceptionV3 layers
    for layer in cnnModel.layers:
        layer.trainable = False

    # Get image embedding <- this is a model output
    image_emb = cnnModel(input_img)

    # 4. Concatenate image embedding with label embedding
    concat = keras.layers.Concatenate(axis=1)([image_emb, label_emb])

    # 5. MLP sized to the LSTM state so its output can initialise the decoder
    full_img_embedding = Dense(hidden_units, activation='relu')(concat)

    # =============
    # ATTENTION IMPLEMENTATION
    # =============
    #     6. MLP (so weights can be learned) w softmax activation to get importance probabilities
    #     7. Multiply output of the full_img_embedding layer (step 5) with the "probabilities"

    # 6. MLP w softmax, same width as the embedding it re-weights
    softmax_encoder = Dense(hidden_units, activation = 'softmax', name = 'softmax_encoder')(full_img_embedding)

    # 7. Multiply layer i.e. probability weighted vector for naive impl of attention
    attention_encoder = keras.layers.Multiply()([full_img_embedding, softmax_encoder])

    # ==== ENCODER MODEL ====
    encoder = keras.Model(inputs=[input_img, label_emb], outputs=attention_encoder)

    # =============
    #   DECODER
    # =============
    # Inputs:
    #     8. Caption (tokenized) (maxlen,)
    #     9. LSTM initial state from encoder
    # Model:
    #     10. Embedding layer that uses the GloVe embedding matrix, and is set to be trainable
    #           - Output size: (maxlen, embedding_size)
    #     11. LSTM
    #           - Output size: (maxlen, hidden_units)
    #     12. Time Distributed layer to apply Dense layer to all the time step outputs
    #           - Output size: (maxlen, vocab_size)
    #     13. Activation of softmax to get values between 0 and 1
    #           - Output size: (maxlen, vocab_size)

    # 8. Caption
    input_caption = keras.Input(shape = (maxlen,), name='image_caption_input')

    # 9. Encoder output, used below as both the hidden and the cell state
    initial_state_LSTM = encoder([input_img, label_emb])

    # 10. Embedding layer (index 0 is treated as padding because of mask_zero)
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                                  input_length=maxlen, embeddings_regularizer = None,
                                  weights = [embeddings], name = 'caption_embeddings',
                                  trainable = True, mask_zero=True)
    # 11. LSTM
    decoder_LSTM = LSTM(hidden_units, return_sequences=True, return_state=True)

    ## ===== Get embedding and LSTM outputs =====
    decoder_embedding_outputs = decoder_embedding(input_caption)
    decoder_LSTM_outputs, _ , _ = decoder_LSTM(decoder_embedding_outputs,
                                          initial_state = [initial_state_LSTM,  # hidden state
                                                           initial_state_LSTM]) # cell state

    # 12. Time Distributed Layer
    time_distributed = TimeDistributed(Dense(vocab_size, name = 'timedistributed_1'))

    # 13. Softmax
    activation = Activation('softmax')

    ## ===== Get time distributed and softmax output =====
    time_distributed_output = time_distributed(decoder_LSTM_outputs)
    decoder_outputs = activation(time_distributed_output)

    # ==============
    #   FULL MODEL
    # ==============
    model= Model(inputs=[input_img, label_emb, input_caption], outputs=decoder_outputs)
    rmsprop = RMSprop(lr=learning_rate, clipnorm=clip_norm)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop)

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    return model ## can add to this function

In [5]:
# Build and compile the captioning model from the parameters defined above.
meme_model = build_model(
    img_shape=img_shape,
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    maxlen=maxlen,
    hidden_units=hidden_units,
    clip_norm=clip_norm,
    learning_rate=learning_rate,
)

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_caption_input (InputLayer (None, 30)           0                                            
__________________________________________________________________________________________________
image_input (InputLayer)        (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
image_label_input (InputLayer)  (None, 300)          0                                            
__________________________________________________________________________________________________
caption_embeddings (Embedding)  (None, 30, 300)      2226000     image_caption_input[0][0]        
_____________________________________

In [6]:
# Create dictionary of image data to store in cache (keys are the image labels)
base_fp = os.getcwd() + '/../memes/'
image_dict = {}

# Take the label and the file path from the SAME row (first occurrence of each
# label). Zipping two independent .unique() arrays would silently mis-pair
# labels and paths whenever the label <-> path mapping is not one-to-one.
unique_images = captions.drop_duplicates(subset='image')
for name, fp in zip(unique_images.image, unique_images.file_path):
    im = cv2.imread(base_fp + fp)
    # cv2.imread returns None instead of raising when the file cannot be read
    assert im is not None, 'could not read image: ' + base_fp + fp
    image_dict[name] = im

# Data Generation

In [7]:
def data_gen(df, embeddings, word2idx, image_dict, batch_size=32, im_dim=(300, 300, 3)):
    '''
    Data generator: loops over the data forever, reshuffling on every pass.

    Inputs:
        df - Pandas dataframe with caption information; the columns `image`
             (label string) and `full_padded_caption` (list of word indices,
             all of the same length) are read
        embeddings - matrix of embeddings to map from word indices
        word2idx - mapping from words to indices (for image labels)
        image_dict - dictionary containing the image pixel data, keys are labels
        batch_size - maximum number of rows per batch (the last batch of a pass
                     may be smaller)
        im_dim - (height, width, channels) every image is resized to

    Outputs: (batch of batch_size)
        images - batch of pre-processed images
        label_embs - batch of averaged image label embeddings
        caption_inds - batch of caption indices
        targets - batch of sequences of one-hot encoded vocab vectors

    Raises:
        ValueError - if df has no rows
    '''
    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError('data_gen received an empty dataframe')

    im_dim = tuple(im_dim)  # so the shape comparison below also works for lists
    # Sizes are derived from the data instead of notebook-level globals
    vocab_size = embeddings.shape[0]
    max_len = len(df.full_padded_caption.iloc[0])
    time_steps = np.arange(max_len)

    while 1:  # needed for keras generator
        # Shuffle data
        df_new = df.sample(frac=1).reset_index(drop=True)

        # Split into consecutive batches of at most batch_size rows
        for start in range(0, n_rows, batch_size):
            batch = df_new.iloc[start:start + batch_size]
            n_batch = batch.shape[0]

            # Prepare matrices to hold data
            images = np.zeros((n_batch, im_dim[0], im_dim[1], im_dim[2]), dtype=np.float32)
            label_embs = np.zeros((n_batch, embeddings.shape[1]))
            caption_inds = np.zeros((n_batch, max_len))
            targets = np.zeros((n_batch, max_len, vocab_size), dtype=np.float32)

            for j, (_, row) in enumerate(batch.iterrows()):
                ### Prepare Image Data ###
                im_data = image_dict[row.image]  # get image data for batch

                if im_data.shape != im_dim:
                    # cv2.resize expects dsize as (width, height)
                    im_data = cv2.resize(im_data, (im_dim[1], im_dim[0]))

                images[j] = im_data / 255  # normalize; stored as single-precision

                ### Prepare Image Labels ###
                im_label_ind = [word2idx[word] for word in row.image.split(' ')]
                im_label_emb = [embeddings[ind] for ind in im_label_ind]
                label_embs[j] = np.mean(im_label_emb, axis=0)  # average embedding

                ### Prepare Caption Indices ###
                caption_ind = row.full_padded_caption
                caption_inds[j] = caption_ind

                ### Prepare Target ###
                # The target is the caption advanced by one step; a trailing 0
                # keeps its length at max_len. Building a new list leaves the
                # dataframe's caption untouched.
                target_ind = list(caption_ind[1:]) + [0]
                targets[j, time_steps, target_ind] = 1  # one-hot encode

            yield [images, label_embs, caption_inds], targets

In [8]:
# One epoch should cover every row once, including the final partial batch:
# round the number of batches UP (true division first, then ceil).
steps_per_epoch = int(np.ceil(captions.shape[0] / float(batch_size)))

history = meme_model.fit_generator(
    data_gen(captions, embeddings, word2idx, image_dict, batch_size=batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=1,
    verbose=1)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1


In [16]:
def inference(model, image, im_label, idx2word, greedy=True, k=3):
    '''
    Generates a caption for an image, predicting one token per time step.

    inputs:
        model - captioning model taking [image, label embedding, caption]
        image - base image to predict for (array as returned by cv2.imread,
                or an already scaled float image)
        im_label - image label associated with the base image (string)
        idx2word - map of indices to words
        greedy - if True pick the most probable token at every step,
                 otherwise sample among the k most probable tokens
        k - number of candidate tokens to sample from when greedy is False

    outputs:
        result - list with the predicted token (word) for each of the
                 maxlen time steps

    Relies on the notebook-level `embeddings`, `word2idx`, `maxlen` and
    `img_shape`.
    '''
    # preprocessing: apply the same resize / scaling as data_gen does for training
    image = np.asarray(image)
    if image.shape != tuple(img_shape):
        # cv2.resize expects dsize as (width, height)
        image = cv2.resize(image, (img_shape[1], img_shape[0]))
    if np.issubdtype(image.dtype, np.integer):
        image = image / 255  # raw pixel values -> [0, 1]; float input is left as is
    image = np.expand_dims(image.astype(np.float32), axis=0)

    im_label = np.expand_dims(imlabel_to_emb(im_label, embeddings, word2idx), axis=0)
    caption = np.zeros((1, maxlen))  # same length as the loop below
    result = []

    for i in range(maxlen):
        # make a prediction
        preds = model.predict([image, im_label, caption])
        step_probs = preds[0, i]

        if greedy:  # greedy search: most probable token
            ind = np.argmax(step_probs)

        else:  # top-k sampling, weighted by the predicted probabilities
            top_k_idx = np.argsort(step_probs)[-k:]
            weights = step_probs[top_k_idx].astype(np.float64)
            ind = np.random.choice(top_k_idx, p=weights / np.sum(weights))

        # NOTE(review): the token predicted at step i is fed back at input
        # position i. data_gen trains with the target advanced one step ahead
        # of the input, so this may be off by one relative to training -
        # confirm against how full_padded_caption is built (start token?).
        caption[0, i] = ind
        result.append(idx2word[ind])
    return result

    
def imlabel_to_emb(label, embeddings, word2idx):
    '''
    Maps an image label to the mean of its word embeddings.

    inputs:
        label - image label; words are separated by single spaces
        embeddings - matrix of embeddings, indexed by word index
        word2idx - map of words to indices

    outputs:
        element-wise average of the embeddings of the label's words
    '''
    vectors = []
    for token in label.split(' '):
        # look up the word's index, then its embedding row
        vectors.append(embeddings[word2idx[token]])

    return np.mean(vectors, axis=0)

In [17]:
# Generate a caption for the first cached image. With greedy=False each token
# is sampled among the k most probable ones, so the output changes between runs.
pred_image = list(image_dict.values())[0] # yo dawg meme
im_label = 'yo man'  # every word of the label must be present in word2idx
inference(meme_model, pred_image, im_label, idx2word, greedy=False)

['you',
 'mean',
 '<break>',
 'you',
 'mean',
 '<eos>',
 'the',
 'car',
 '<break>',
 'i',
 'will',
 'find',
 'you',
 'and',
 'only',
 'you',
 '<eos>',
 'and',
 'i',
 'will',
 'kill',
 'you',
 '<eos>',
 'you',
 'and',
 'only',
 'you',
 'are',
 '<break>',
 'i']