<hr style='height:3pt'>

# Meme Caption Generator

<hr style='height:3pt'>

In [1]:
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D
from keras.layers import LSTM, Embedding, TimeDistributed, RepeatVector, Activation, Flatten
from keras.optimizers import RMSprop
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import numpy as np
import pickle
import sys
import os
import cv2
from pickle_utils import pickle_load, pickle_dump
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Check GPU
# Lists the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): `_get_available_gpus` is a private (underscore-prefixed) backend helper,
# so it may be missing or renamed in other Keras versions.
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

# Building Model Architecture

In [3]:
# load embeddings
# The stored object unpacks into four parts: the embedding matrix, the index->word and
# word->index lookups, and the captions table used for training.
# NOTE(review): presumably this unpickles the file; only load pickles from trusted sources.
embeddings, idx2word, word2idx, captions = pickle_load("full_clean_processed_data.pkl")
# Strip surrounding spaces so each image label splits cleanly on ' ' into vocabulary words.
captions.image = captions.image.apply(lambda x: x.strip(' '))
#captions = captions.sample(frac=.5, random_state=19).reset_index(drop=True)  # Downsample for training

In [4]:
# input parameters
img_shape = (300,300,3)             # shape of the image input fed to InceptionV3
vocab_size = embeddings.shape[0]    # one row of the embedding matrix per vocabulary entry
embedding_size = 300                # must equal embeddings.shape[1] (used as the Embedding output_dim)
maxlen = 20                         # number of caption tokens fed to the decoder per example
batch_size = 32
hidden_units = embedding_size       # LSTM state size, tied to the embedding size

# hyper params
clip_norm = 1.0                     # gradient-norm clipping threshold passed to RMSprop
learning_rate = 0.01

In [5]:
def build_model(img_shape, vocab_size, embedding_size, maxlen, hidden_units, clip_norm, learning_rate,
                embedding_weights=None):
    '''
    Build and compile the image + label -> caption model.

    =============
      ENCODER
    =============
    Inputs:
        1. Image of shape img_shape
        2. GloVe-ed label embedding of shape (embedding_size,)

    Model:
        3. Pretrained, frozen InceptionV3 with the classification layer peeled off
           and global average pooling
        4. Concatenate the CNN output with the label embedding
        5. MLP: Dense layer with hidden_units nodes
              - Output size: (hidden_units,) <-- This is the image embedding

    =============
      DECODER
    =============
    Inputs:
        8. Caption (tokenized) of shape (maxlen,)
        9. The encoder output, used as both the initial hidden and cell state of the LSTM

    Model:
        10. Embedding layer initialised from the embedding matrix, trainable, with
            index 0 masked
              - Output size: (maxlen, embedding_size)
        11. LSTM returning the full sequence
              - Output size: (maxlen, hidden_units)
        12. Time Distributed Dense layer applied to every time step
              - Output size: (maxlen, vocab_size)
        13. Softmax activation to get a distribution over the vocabulary per step
              - Output size: (maxlen, vocab_size)

    Parameters:
        img_shape - shape of the image input
        vocab_size - number of rows in the embedding matrix / size of the softmax
        embedding_size - width of the word (and label) embeddings
        maxlen - number of caption tokens per example
        hidden_units - LSTM state size and width of the encoder output
        clip_norm - gradient-norm clipping value for RMSprop
        learning_rate - RMSprop learning rate
        embedding_weights - optional (vocab_size, embedding_size) matrix used to
            initialise the caption embedding layer; defaults to the notebook-level
            `embeddings` matrix, which is what the original code always used

    Returns:
        The compiled Keras model taking [image, label embedding, caption indices].
    '''
    if embedding_weights is None:
        # Backward-compatible default: the embedding matrix loaded earlier in the notebook.
        embedding_weights = embeddings

    # 1. Image Input
    input_img = keras.Input(shape=img_shape, name='image_input')

    # 2. Label Embedding (same width as the word embeddings it is averaged from)
    label_emb = keras.Input(shape=(embedding_size,), name='image_label_input')

    # 3. Define Pretrained CNN - Inception V3
    cnnModel = InceptionV3(weights='imagenet',
                           include_top=False,        # this removes the final dense layer
                           input_shape=img_shape,
                           pooling='avg')

    # freeze all convolutional InceptionV3 layers
    for layer in cnnModel.layers:
        layer.trainable = False

    # Get image embedding <- this is a model output
    image_emb = cnnModel(input_img)

    # 4. Concatenate image embedding with label embedding
    concat = keras.layers.Concatenate(axis=1)([image_emb, label_emb])

    # 5. MLP. Its output seeds the LSTM state below, so its width must be hidden_units
    #    (previously hard-coded to 300, which only worked when hidden_units == 300).
    full_img_embedding = Dense(hidden_units, activation='relu')(concat)

    # ==== ENCODER MODEL ====
    encoder = keras.Model(inputs=[input_img, label_emb], outputs=full_img_embedding)

    # 8. Caption
    input_caption = keras.Input(shape=(maxlen,), name='image_caption_input')

    # 9. Input for the LSTM hidden state and cell state
    initial_state_LSTM = encoder([input_img, label_emb])

    # 10. Embedding layer
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                                  input_length=maxlen, embeddings_regularizer=None,
                                  weights=[embedding_weights], name='caption_embeddings',
                                  trainable=True, mask_zero=True)
    # 11. LSTM
    decoder_LSTM = LSTM(hidden_units, return_sequences=True, return_state=True)

    ## ===== Get embedding and LSTM outputs =====
    decoder_embedding_outputs = decoder_embedding(input_caption)
    decoder_LSTM_outputs, _, _ = decoder_LSTM(decoder_embedding_outputs,
                                              initial_state=[initial_state_LSTM,   # hidden state
                                                             initial_state_LSTM])  # cell state

    # 12. Time Distributed Layer
    time_distributed = TimeDistributed(Dense(vocab_size, name='timedistributed_1'))

    # 13. Softmax
    activation = Activation('softmax')

    ## ===== Get time distributed and softmax output =====
    time_distributed_output = time_distributed(decoder_LSTM_outputs)
    decoder_outputs = activation(time_distributed_output)

    # ==============
    #   FULL MODEL
    # ==============
    model = Model(inputs=[input_img, label_emb, input_caption], outputs=decoder_outputs)
    # NOTE(review): decay=.7 shrinks the learning rate very quickly under Keras'
    # per-update time-based decay; confirm this value is intentional.
    rmsprop = RMSprop(lr=learning_rate, clipnorm=clip_norm, decay=.7)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop)

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    return model

In [6]:
# Build and compile the captioning model from the parameters configured above.
meme_model = build_model(img_shape, vocab_size, embedding_size, maxlen, hidden_units, clip_norm, learning_rate)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_caption_input (InputLayer (None, 20)           0                                            
__________________________________________________________________________________________________
image_input (InputLayer)        (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
image_label_input (InputLayer)  (None, 300)          0                                            
__________________________________________________________________________________________________
caption_embeddings (Embedding)  (None, 20, 300)      11012700    image_caption_input[0][0]        
__________________________________________________________________________________________________
model_1 (M

In [7]:
# Create dictionary of image data to store in cache
# Keys are image labels, values are the pixel arrays returned by cv2.imread.
base_fp = os.getcwd() + '/../memes/'
image_dict = {}
# Take each label together with a file path from the same row. Zipping two independent
# `.unique()` results silently mis-pairs labels and paths whenever the two columns are
# not exactly one-to-one; for one-to-one data this yields the same pairs in the same order.
image_files = captions[['image', 'file_path']].drop_duplicates(subset='image')
for name, fp in zip(image_files.image, image_files.file_path):
    im = cv2.imread(base_fp + fp)
    if im is None:  # cv2.imread returns None instead of raising when the read fails
        raise FileNotFoundError('Could not read image for label %r at %s' % (name, base_fp + fp))
    image_dict[name] = im

# Data Generation

In [8]:
def data_gen(df, embeddings, word2idx, image_dict, batch_size=32, im_dim=(300, 300, 3),
             maxlen=None, vocab_size=None):
    '''
    Endless batch generator for Keras training.

    Inputs:
        df - Pandas dataframe with caption information; reads the `image` (label string)
             and `full_padded_caption` (sequence of word indices) columns
        embeddings - matrix of embeddings, one row per word index
        word2idx - mapping from words to indices (for image labels)
        image_dict - dictionary containing the image pixel data, keys are labels
        batch_size - maximum number of rows per batch (the last batch may be smaller)
        im_dim - (height, width, channels) every image is resized to
        maxlen - caption length; defaults to the length of the first caption in df
        vocab_size - size of the one-hot target axis; defaults to embeddings.shape[0]

    Yields: ([images, label_embs, caption_inds], targets), one batch at a time
        images - batch of resized images scaled to [0, 1]
        label_embs - batch of averaged image label embeddings
        caption_inds - batch of caption indices
        targets - one-hot next-token targets: the caption shifted one step to the
                  left, with index 0 appended to keep the length at maxlen

    Raises:
        ValueError - on the first `next()` if df has no rows
    '''
    n_rows = df.shape[0]
    if n_rows == 0:
        # An empty frame would otherwise produce empty batches (or spin) forever.
        raise ValueError('data_gen needs at least one caption row')
    if vocab_size is None:
        vocab_size = embeddings.shape[0]
    if maxlen is None:
        maxlen = len(df.full_padded_caption.iloc[0])
    im_dim = tuple(im_dim)  # so the shape comparison below also works if a list is passed
    positions = np.arange(maxlen)

    while 1:  # needed for keras generator
        # Shuffle data once per pass over the frame
        df_new = df.sample(frac=1).reset_index(drop=True)

        # Walk the shuffled frame in consecutive slices of batch_size rows
        for start in range(0, n_rows, batch_size):
            batch = df_new.iloc[start:start + batch_size]
            n_batch = batch.shape[0]

            # Prepare matrices to hold data. Images and targets are float32: the values
            # are unchanged once Keras casts its inputs, and the large one-hot target
            # tensor takes half the memory.
            images = np.zeros((n_batch,) + im_dim, dtype=np.float32)
            label_embs = np.zeros((n_batch, embeddings.shape[1]))
            caption_inds = np.zeros((n_batch, maxlen))
            targets = np.zeros((n_batch, maxlen, vocab_size), dtype=np.float32)

            for j, (im_label, caption_ind) in enumerate(zip(batch.image, batch.full_padded_caption)):
                ### Prepare Image Data ###
                im_data = image_dict[im_label]
                if im_data.shape != im_dim:
                    # cv2.resize takes (width, height), i.e. (im_dim[1], im_dim[0])
                    im_data = cv2.resize(im_data, (im_dim[1], im_dim[0]))
                images[j] = im_data / 255  # normalize; stored as single precision

                ### Prepare Image Labels ###
                im_label_emb = [embeddings[word2idx[word]] for word in im_label.split(' ')]
                label_embs[j] = np.mean(im_label_emb, axis=0)  # average embedding

                ### Prepare Caption Indices ###
                caption_inds[j] = caption_ind

                ### Prepare Target ###
                # Next-token target: drop the first index and pad the end with 0.
                # Built as a new list so the caption stored in df is never mutated.
                target_ind = np.asarray(list(caption_ind[1:]) + [0], dtype=int)
                targets[j, positions, target_ind] = 1  # one-hot in place, no temporary matrix

            yield [images, label_embs, caption_inds], targets

In [9]:
# Set aside test base image and associated captions
test_image = captions.sample(1, random_state=19).image.values[0]
# Split with a boolean mask. np.where returns positions, while DataFrame.drop treats
# its argument as index labels, so the two only agree on a default 0..n-1 index.
is_test = (captions.image == test_image).values
test_ind = np.where(is_test)[0]  # positional indices of the held-out rows
train_data = captions[~is_test]
test_data = captions[is_test]

In [10]:
# Early stopping
# Watches the training loss; with patience=0 training stops after the first epoch
# whose loss fails to improve.
es = EarlyStopping(monitor='loss', mode='min', verbose=2, patience=0)

In [None]:
# One epoch should cover every training row once: ceil(rows / batch_size) batches.
# (`np.ceil(rows // batch_size)` floored first, so the final partial batch was skipped
# and a float was passed as the step count.)
steps_per_epoch = int(np.ceil(train_data.shape[0] / batch_size))
history = meme_model.fit_generator(data_gen(train_data, embeddings, word2idx, image_dict, batch_size=batch_size),
                        steps_per_epoch=steps_per_epoch,
                        epochs=10,
                        verbose=1,
                        callbacks=[es])

In [12]:
# Save model if good
#meme_model.save('meme_model_v2.h5')

<hr style='height:3pt'>

# Inference

<hr style='height:3pt'>

In [13]:
def inference(model, image, im_label, idx2word, greedy=True, k=3):
    '''
    Generates a caption for one image by decoding one word per step

    inputs:
        model - trained model taking [image, label embedding, caption indices]
        image - base image to predict for; integer-typed pixel data (e.g. straight
                from cv2) is scaled to [0, 1] like the training generator does,
                float input is assumed to be scaled already
        im_label - image label associated with the base image (string)
        idx2word - map of indices to words
        greedy - if True take the most likely word at every step; otherwise sample
                 one of the k most likely words, weighted by probability
        k - number of candidate words when greedy is False

    outputs:
        result - list of predicted caption words, one per decoding step

    Relies on the notebook-level `embeddings`, `word2idx` and `maxlen`.
    '''
    # preprocessing
    image = np.asarray(image)
    if np.issubdtype(image.dtype, np.integer):
        # Training images are divided by 255; raw 0-255 pixels must be scaled the same way.
        image = image / 255
    image = np.expand_dims(image, axis=0)
    im_label = np.expand_dims(imlabel_to_emb(im_label, embeddings, word2idx), axis=0)
    caption = np.zeros((1, maxlen))  # sized by maxlen so it always matches the loop below
    result = []

    # NOTE(review): training targets are the caption shifted by one position, yet the
    # prediction read at step i is written back at position i and no start token is
    # seeded. Confirm this matches how full_padded_caption was encoded.
    for i in range(maxlen):
        # make a prediction and keep the distribution for the current step
        step_probs = model.predict([image, im_label, caption])[0, i]

        if greedy:  # greedy search
            ind = np.argmax(step_probs)

        else:  # top-k sampling (not a true beam search)
            top_k_idx = np.argsort(step_probs)[-k:]
            # float64 keeps the renormalised weights summing to 1 for np.random.choice
            weights = step_probs[top_k_idx].astype(np.float64)
            ind = np.random.choice(top_k_idx, p=weights / np.sum(weights))

        caption[0, i] = ind
        result.append(idx2word[ind])
    return result

    
def imlabel_to_emb(label, embeddings, word2idx):
    '''
    Converts an image label to its average embedding

    inputs:
        label - image label, words separated by whitespace (string)
        embeddings - matrix of embeddings, one row per word index
        word2idx - mapping from words to row indices of `embeddings`

    outputs:
        avg_label_emb - element-wise mean of the embeddings of the label's words

    raises:
        KeyError - if the label has no words or contains words missing from word2idx
    '''
    # split() ignores leading, trailing and repeated whitespace; split(' ') turned those
    # into empty-string "words" that failed the vocabulary lookup.
    words = label.split()
    if not words:
        raise KeyError('image label contains no words: %r' % (label,))

    unknown = [word for word in words if word not in word2idx]
    if unknown:
        raise KeyError('image label words missing from vocabulary: %s' % ', '.join(unknown))

    word_embs = [embeddings[word2idx[word]] for word in words]
    avg_label_emb = np.mean(word_embs, axis=0)
    return avg_label_emb

In [None]:
import matplotlib.pyplot as plt
#meme_model.load('meme_model_v1') # load model

filepath = os.getcwd() + '/../base_images/pavlos.png'
pred_image = cv2.imread(filepath) # pavlos
assert pred_image is not None  # make sure image gets read in
# pred_image = image_dict[test_image]
pred_image = cv2.resize(pred_image, (300, 300))
# cv2 loads pixels in BGR order while matplotlib expects RGB. Convert for display only,
# so the model still receives the channel order used for the training images.
plt.imshow(cv2.cvtColor(pred_image, cv2.COLOR_BGR2RGB))
plt.show()
#im_label = test_image
im_label = input()  # image label typed by the user; every word must be in the vocabulary
inference(meme_model, pred_image, im_label, idx2word, greedy=True)

<hr style='height:3pt'>

# Evaluation

<hr style='height:3pt'>

There exists a large compendium of techniques to evaluate the similarity between a machine generated caption and a human generated caption. Typically the similarity is computed using a **candidate sentence** generated by an ML algorithm and a **reference sentence** (or multiple) generated by a human. A few examples include:
- **BLEU (2002)**
    - At its core, BLEU is the precision of the candidate sentence, i.e., the proportion of words in the candidate sentence that also appear in the reference sentence. It extends to doing multiple n-gram comparisons and taking a weighted average. A more thorough description and example implementation in Python can be found [here](https://machinelearningmastery.com/calculate-bleu-score-for-text-python/). Extensions to this method penalize candidate sentences that are shorter than the reference sentence.  
    
    
- **ROUGE (2004)**
    - The recall of the candidate sentence. The proportion of words in the reference sentence that also appear in the candidate sentence. It's essentially the complement to BLEU, and they are often combined in a reported F1 score. Read more [here](https://stackoverflow.com/questions/38045290/text-summarization-evaluation-bleu-vs-rouge)
    
    
- **METEOR (2005)**
    - An extension to the precision/recall combo that algorithmically finds a mapping between the candidate text and the reference text, then uses that to compute the score. Wikipedia says "Results have been presented which give correlation of up to 0.964 with human judgement at the corpus level, compared to BLEU's achievement of 0.817 on the same data set." This method also factors in synonyms. [source](https://en.wikipedia.org/wiki/METEOR)
    
    
- **CIDEr (2015)**
    - This method was developed specifically for image captioning, and extends the previous methods by applying a TF-IDF weighting before comparing the co-occurrence of n-grams between the candidate and the reference sentence (typically a set of reference sentences). It is not always effective, because the weighting can give disproportionate weight to unimportant words that occur infrequently. [source](https://arxiv.org/abs/1411.5726)
    

- **WMD (2015)**
    - Uses word embeddings and something similar to Wasserstein distance to compute the discrepancy between a candidate sentence and a reference sentence. This captures the semantic similarity between two sentences that may not share common words or even synonyms. [Here](https://vene.ro/blog/word-movers-distance-in-python.html) is a Python blog post about it.
    
    
- **SPICE (2016)**
    - SPICE breaks down sentences into semantically meaningful components such as objects, attributes, and relation types. This graph structure is then used to create tuples of semantically related words, and an F1 score is computed over the tuples of the candidate and the reference sentence(s). [This](https://aclweb.org/anthology/E17-1019) paper does a good job of summarizing this and all the above metrics.
    
    
The paper linked [here](https://aclweb.org/anthology/E17-1019) does a phenomenal job of providing visual and tabular comparisons of each of the aforementioned metrics. The paper also examines their correlation with each other, concluding that the n-gram metrics (BLEU, ROUGE, METEOR, CIDEr) can complement the embedding (WMD) and graph-based (SPICE) ones. Here is a table and figure from the paper:

![](nlp_metrics.png)


We decided to use BLEU, mainly because it is easily transferable to vocabularies with lots of slang words, and has a well-documented and supported Python implementation as part of the NLTK library. 

# BLEU Example

In [None]:
# 4-gram cumulative BLEU
# Scores one tokenized candidate sentence against a list of tokenized reference sentences.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoother = SmoothingFunction()  
reference = [['this', 'is', 'small', 'test']]  # list of tokenized reference sentences (one here)
candidate = ['this', 'is', 'a', 'test']  # tokenized candidate sentence
score = sentence_bleu(reference, candidate, 
                      smoothing_function=smoother.method4, 
                      weights=(0.25, 0.25, 0.25, 0.25))  # equal weights for 1- to 4-grams
print(score)