# Image Caption  
I'll use pretrained coco dataset to train this image caption model  
the dataset is in [http://www.cs.toronto.edu/~vendrov/order/coco.zip](http://www.cs.toronto.edu/~vendrov/order/coco.zip)

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt

%matplotlib inline

## Load pretrained vgg-19 image data   
The data is in 4096 dimension

In [2]:
path = './data/coco/images/10crop'
train_data = np.load( os.path.join(path, 'train.npy'))
val_data = np.load( os.path.join(path, 'val.npy'))
test_data = np.load( os.path.join(path, 'test.npy'))

In [3]:
print('train data shape: {}'.format(train_data.shape))
print('validation data shape: {}'.format(val_data.shape))
print('test data shape: {}'.format(test_data.shape))

train data shape: (113287, 4096)
validation data shape: (5000, 4096)
test data shape: (5000, 4096)


## Load the caption

In [42]:
def read_text(file_path):
    content = []
    max_length = 0
    with open(file_path, 'rb') as f:
        for line in f:
            content.append(line.strip())
            if len(line.strip().split()) > max_length:
                max_length = len(line.strip().split())
    return np.array(content), max_length

In [43]:
text_path = './data/coco'
train_cap, train_max_length = read_text(os.path.join(text_path, 'train.txt'))
val_cap, val_max_length = read_text(os.path.join(text_path, 'val.txt'))
test_cap, test_max_length = read_text(os.path.join(text_path, 'test.txt'))

In [44]:
print('train caption shape: {0}, max length: {1}'.format(train_cap.shape, train_max_length))
print('validation caption shape: {0}, max length: {1}'.format(val_cap.shape, val_max_length))
print('test caption shape: {0}, max length: {1}'.format(test_cap.shape, test_max_length))

train caption shape: (566435,), max length: 49
validation caption shape: (25000,), max length: 47
test caption shape: (25000,), max length: 43


In [14]:
train_cap[:10]

array([b'a woman wearing a net on her head cutting a cake',
       b'a woman cutting a large white sheet cake',
       b'a woman wearing a hair net cutting a large sheet cake',
       b'there is a woman that is cutting a white cake',
       b'a woman marking a cake with the back of a chefs knife',
       b'a young boy standing in front of a computer keyboard',
       b'a little boy wearing headphones and looking at a computer monitor',
       b'he is listening intently to the computer at school',
       b'a young boy stares up at the computer monitor',
       b'a young kid with head phones on using a computer'],
      dtype='|S246')

each image related to 5 captions

## Prepare caption

In [8]:
from collections import Counter
def get_vocab_int(text):
    text = text.lower()
    vocab = sorted(set(text.split()))
    vocab_counter = Counter(vocab)

    vocab = ['<PAD>','<EOS>','<UNK>','<GO>'] + vocab
    vocab_to_int = {word: index for index, word in enumerate(vocab)}
    int_to_vocab = {index: word for word, index in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab, vocab_counter

In [45]:
# the input should be a list of sentences 
# and the output is the corresponding int value with the same length as the max length of the sentence
# the output of each starts with the <GO> and ends with <EOS>, with <PAD> to fill the empty places
def get_cap_id(sentences, vocab_to_int, max_length):
    output = []
    for sentence in sentences:
        sentence = sentence.decode('utf-8')
        new_line = [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in sentence.split()]
        new_line.insert(0, vocab_to_int['<GO>'])
        new_line.append(vocab_to_int['<EOS>'])
        output.append(new_line)
    output = [sentence + [vocab_to_int['<PAD>']] * (max_length - len(sentence))
                    for sentence in output]
    return output

In [17]:
# get all the text to generate the dictionary
total_text = ''
for sentence in train_cap:
    total_text += sentence.decode('utf-8')
for sentence in val_cap:
    total_text += sentence.decode('utf-8')
for sentence in test_cap:
    total_text += sentence.decode('utf-8')

In [18]:
vocab_to_int, int_to_vocab, vocab_counter = get_vocab_int(total_text)

#### Test one

In [23]:
test = get_cap_id(train_cap[:5], vocab_to_int)

In [26]:
print(test[0])

[3, 696, 90544, 89111, 696, 50817, 52237, 37367, 36987, 21913, 696, 13234, 1]


In [25]:
print([int_to_vocab[i] for i in test[0]])

['<GO>', 'a', 'woman', 'wearing', 'a', 'net', 'on', 'her', 'head', 'cutting', 'a', 'cake', '<EOS>']


## Build the model

In [None]:
import keras
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

### Some parameters

In [None]:
img_input_dim = 4096
max_length = max(train_max_length, val_max_length, test_max_length)
cap_input_dim = len(vocab_to_int)
cap_embed_dim = 512

### image input

In [None]:
image_input = Input(shape=(4096,), name='image_input')

### caption input

In [None]:
cap_input = Input(shape=(max_length,), dtype='int32', name='cap_input')
x = Embedding(output_dim=cap_embed_dim, input_dim=cap_input_dim, input_length=max_length)(cap_input)
lstm_out = LSTM(32)(x)

### concatenate the image input and LSTM input

In [None]:
x = keras.layers.concatenate([lstm_out, image_input])