<hr style='height:3pt'>

# Meme Caption Generator

<hr style='height:3pt'>

### Imports

In [1]:
import keras
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D
from keras.layers import LSTM, Embedding, TimeDistributed, RepeatVector, Activation, Flatten
from keras.optimizers import RMSprop
import numpy as np
import pickle
import sys
import os
import cv2
from pickle_utils import pickle_load, pickle_dump

Using TensorFlow backend.


### Load Embeddings

In [2]:
# load embeddings
# glove_index_dict, embeddings = pickle_load("../Anthony/glove_objs.pkl")
embeddings, idx2word, word2idx, glove_idx2idx, captions = pickle_load("processed_data.pkl")

In [3]:
# input parameters
img_shape = (300,300,3)
vocab_size = embeddings.shape[0]
embedding_size = 300
maxlen = 30                         # maximum length of the caption in hidden state

hidden_units = embedding_size       # length of word vectors i.e. embedding size

# hyper params
clip_norm = 1.0
learning_rate = 0.1

In [4]:
def build_model(img_shape,
                  vocab_size, 
                  embedding_size, 
                  maxlen, 
                  hidden_units, 
                  clip_norm,
                  learning_rate):
    
    # =============
    #   ENCODER
    # =============
    # Inputs: 
    #     1. Image (300, 300, 3)
    #     2. GloVe-ed label embeddings (300,)

    # 1. Image Input
    input_img = keras.Input(shape=img_shape)
    # 2. Label Embedding
    label_emb = keras.Input(shape=(300,))

    # Model:
    #     1. Pretrained CNN with classification layer peeled off
    #           - Output size: (2048,)
    #     2. Concatenate with label embeding of size
    #           -  Output size: (2348,)
    #     3. MLP: Dense layer with 300 nodes
    #           - Output size: (300,) <-- This is the image embedding
    
    
    # 1. Define Pretrained CNN - Inception V3
    cnnModel = InceptionV3(weights='imagenet', 
                           include_top=False,        # this removes the final dense layer
                           input_shape=img_shape, 
                           pooling = 'avg')

    # freeze all convolutional InceptionV3 layers
    for layer in cnnModel.layers:
        layer.trainable = False

    # Get image embedding <- this is a model output
    image_emb = cnnModel(input_img)

    # 2. Concatenate image embedding with label embedding
    concat = keras.layers.Concatenate(axis=1)([image_emb, label_emb])
    
    # 3. MLP with 300 nodes
    full_img_embedding = Dense(300, activation='relu')(concat)

    # Attention Implementation
    #     1. MLP with 300 nodes (so weights can be learned) w *softmax* activation to get importance probability
    #     2. Multiply output of the full_img_embedding layer (Model part 3.) with "probabilities"
    
    # 1. MLP with 300 nodes w softmax
    softmax_encoder = Dense(300, activation = 'softmax', name = 'softmax_encoder')(full_img_embedding)
    
    # 2. Multiply layer i.e. probability weighted vector for naive impl of attention
    attention_encoder = keras.layers.Multiply()([full_img_embedding, softmax_encoder])

    # ==== ENCODER MODEL ====
    encoder = keras.Model(inputs=[input_img, label_emb], outputs=attention_encoder)

    # =============
    #   DECODER
    # =============
    # Inputs: 
    #     1. Caption (tokenized) (20,) <- limiting caption length to 20
    #     2. LSTM hidden state from encoder
    
    # 1. Caption
    input_caption = keras.Input(shape = (maxlen,))
    
    # 2. Input for the LSTM hidden state and/or cell state
    initial_state_LSTM = encoder([input_img, label_emb])
    
    # Model:
    #     1. Embedding layer that uses the GloVe embedding matrix, and is set to be trainable
    #           - Output size: (20, 300)
    #     2. LSTM
    #           -  Output size: (20, 300)
    #     3. Time Distributed layer to apply Dense layer to all the time step outputs
    #           - Output size: (20, 40000)
    #     4. Activation of softmax to get values between 0 and 1
    #           - Output size: (20, 40000)
        
    # 1. Embedding layer
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                                  input_length=maxlen, embeddings_regularizer = None,
                                  weights = [embeddings], name = 'caption_embeddings', 
                                  trainable = True)
    # 2. LSTM
    decoder_LSTM = LSTM(hidden_units, return_sequences=True, return_state=True)

    ## ===== Get embedding and LSTM outputs =====
    decoder_embedding_outputs = decoder_embedding(input_caption)
    decoder_LSTM_outputs, _ , _ = decoder_LSTM(decoder_embedding_outputs, 
                                          initial_state = [initial_state_LSTM,  # hidden state
                                                           initial_state_LSTM]) # cell state
    
    # 3. Time Distributed Layer
    time_distributed = TimeDistributed(Dense(vocab_size, name = 'timedistributed_1'))
    
    # 4. Softmax 
    activation = Activation('softmax')
    
    ## ===== Get time distributed and softmax output =====
    time_distributed_output = time_distributed(decoder_LSTM_outputs)
    decoder_outputs = activation(time_distributed_output)

    # ==============
    #   FULL MODEL
    # ==============   
    model= Model(inputs=[input_img, label_emb, input_caption], outputs=decoder_outputs)
    rmsprop = RMSprop(lr=learning_rate, clipnorm=clip_norm)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop)
    
    print(model.summary())
    
    return encoder, model ## can add to this function

In [5]:
meme_model = build_model(img_shape, vocab_size, embedding_size, maxlen, hidden_units, clip_norm, learning_rate)

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
caption_embeddings (Embedding)  (None, 30, 300)      34455600    input_4[0][0]                    
_____________________________________

In [6]:
def gen(Xd, Xh, batch_size=batch_size, nb_batches=None, model=None, seed=seed):
    c = nb_batches if nb_batches else 0
    while True:
        xds = []
        xhs = []
        if nb_batches and c >= nb_batches:
            c = 0
        new_seed = random.randint(0, sys.maxsize)
        random.seed(c+123456789+seed)
        for b in range(batch_size):
            t = random.randint(0,len(Xd)-1)
            
            #random shuffling of data
            xd = Xd[t]
            s = random.randint(min(maxlend,len(xd)), max(maxlend,len(xd)))
            xds.append(xd[:s])
            
            xh = Xh[t]
            s = random.randint(min(maxlenh,len(xh)), max(maxlenh,len(xh)))
            xhs.append(xh[:s])

        # undo the seeding before we yield inorder not to affect the caller
        c+= 1
        random.seed(new_seed)

        yield conv_seq_labels(xds, xhs)
        
def conv_seq_labels(xds, xhs, nflips=None, model=None):
    """description and hedlines are converted to padded input vectors. headlines are one-hot to label"""
    batch_size = len(xhs)
    
    x = [vocab_fold(lpadd(xd)+xh) for xd,xh in zip(xds,xhs)]  # the input does not have 2nd eos
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
        
    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh) + [eos] + [empty]*maxlenh  # output does have a eos at end
        xh = xh[:maxlenh]
        y[i,:,:] = np_utils.to_categorical(xh, vocab_size)
        
    #The 3 inputs are description, summary starting with eos and a one-hot encoding of the summary categorical variables.
    return [x[:,:maxlend],x[:,maxlend:]], y

NameError: name 'batch_size' is not defined

In [7]:
# Create dictionary of image data
base_fp = os.getcwd() + '/../memes/'
image_dict = {}
for name, fp in zip(captions.image.unique(), captions.file_path.unique()):
    print(base_fp + fp)
    im = cv2.imread(base_fp + fp)
    assert im is not None # check that the image has been read correctly
    image_dict[name] = im

/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/y-u-no.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/bad-luck-brian.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/willy-wonka.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/the-most-interesting-man-in-the-world.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/futurama-fry.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/success-kid.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/one-does-not-simply.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/first-world-problems.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/philosoraptor.jpg
/Users/nicholasstern/Desktop/ac209b/2019-CS109B/final_project/E/nick/../memes/grumpy-cat.jpg
/Users/nicholasst

AssertionError: 

In [24]:
captions.head()

Unnamed: 0,image,above_text,below_text,file_path,full_caption,full_pdded_caption
0,y u no,victoria,y u no tell us your secret?!,y-u-no.jpg,<sos> victoria <break> y u no tell us your sec...,"[0, 7524, 2, 67, 64, 19, 129, 271, 14, 35862, ..."
1,y u no,kony,y u no take justin bieber,y-u-no.jpg,<sos> kony <break> y u no take justin bieber ...,"[0, 687, 2, 67, 64, 19, 149, 305, 507, 1, 1, 1..."
2,y u no,ted,y u no tell us how you met their mother,y-u-no.jpg,<sos> ted <break> y u no tell us how you met t...,"[0, 1289, 2, 67, 64, 19, 129, 271, 76, 4, 778,..."
3,y u no,google,y u no let me finish typing?,y-u-no.jpg,<sos> google <break> y u no let me finish typi...,"[0, 1419, 2, 67, 64, 19, 177, 13, 1224, 76382,..."
4,y u no,universal remote,y u no work on universe?,y-u-no.jpg,<sos> universal remote <break> y u no work on ...,"[0, 10933, 3585, 2, 67, 64, 19, 200, 18, 34583..."


In [26]:
list(zip(captions.image.unique(), captions.file_path.unique()))

[('y u no', 'y-u-no.jpg'),
 ('bad luck brian', 'bad-luck-brian.jpg'),
 ('willy wonka', 'willy-wonka.jpg'),
 ('the most interesting man in the world',
  'the-most-interesting-man-in-the-world.jpg'),
 ('futurama fry', 'futurama-fry.jpg'),
 ('success kid', 'success-kid.jpg'),
 ('one does not simply', 'one-does-not-simply.jpg'),
 ('first world problems', 'first-world-problems.jpg'),
 ('philosoraptor', 'philosoraptor.jpg'),
 ('grumpy cat ', 'grumpy-cat-.jpg'),
 ('winter is coming', 'winter-is-coming.jpg'),
 ('forever alone', 'forever-alone.jpg'),
 ('good guy greg', 'good-guy-greg.jpg'),
 ('scumbag steve', 'scumbag-steve.jpg'),
 ('what if i told you', 'what-if-i-told-you.jpg'),
 ('kermit the frog drinking tea', 'kermit-the-frog-drinking-tea.jpg'),
 ('conspiracy keanu', 'conspiracy-keanu.jpg'),
 ('yo dawg', 'yo-dawg.jpg'),
 ('all the things', 'all-the-things.jpg'),
 ('insanity wolf', 'insanity-wolf.jpg'),
 ('joseph ducreux', 'joseph-ducreux.jpg'),
 ('trollface', 'trollface.jpg'),
 ('pedobear'