In [1]:
import os
import json
import h5py
import random
import nltk.tokenize
import numpy as np

In [9]:
def load_json(json_path):
    """Open the file at `json_path` for reading and return its parsed JSON content."""
    with open(json_path, 'r') as src:
        return json.load(src)

def write_json(json_path, json_file):
    """Serialize the object `json_file` as JSON into the file at `json_path`.

    The target is opened in 'w' mode, so any existing content is replaced.
    """
    with open(json_path, 'w') as sink:
        sink.write(json.dumps(json_file))


def process_caption(caption):
    """Lower-case and tokenize `caption`, then wrap the tokens in '<S>' ... '<E>'.

    Returns a new list: the start token, the nltk word tokens of the
    lower-cased caption, and the end token.
    """
    words = nltk.tokenize.word_tokenize(caption.lower())
    return ['<S>'] + list(words) + ['<E>']
    
    
def create_vocab(captions, min_word_count=3):
    """Tokenize every caption and construct a frequency-ordered vocabulary.

    Side effect: each annotation in `captions` gets a 'tokenized_captions'
    key holding one token list (see `process_caption`) per entry of its
    'captions' list.

    Args:
        captions: A list of caption annotations
                    {
                    'image_id': 'int',
                    'image_path': 'string',
                    'captions': 'list'
                    }
                  Each item of 'captions' is indexed with the key 'caption'
                  to obtain the raw caption text.
        min_word_count: Words seen fewer than this many times are left out
                  of the vocabulary. The '<S>' and '<E>' markers are exempt
                  so that they always receive an index when they occur.

    Returns:
        A tuple (vocab, wtoi, itow):
            vocab: list of words, '<UNK>' first, then words by descending count
            wtoi: dict mapping each word to its position in `vocab`
            itow: dict mapping each position in `vocab` to its word
    """
    # Markers added by process_caption; the encoder looks up '<E>' by name,
    # so they must never be filtered out by the frequency threshold.
    special_tokens = ('<S>', '<E>')

    # tokenize the captions
    for cap_ann in captions:
        cap_ann['tokenized_captions'] = [
            process_caption(cap['caption']) for cap in cap_ann['captions']]

    # count the words in all captions
    word_count = {}
    for cap_ann in captions:
        for cap in cap_ann['tokenized_captions']:
            for word in cap:
                word_count[word] = word_count.get(word, 0) + 1
    print('Total words:', len(word_count))

    # discard the words where the number is smaller than min_word_count
    kept = [(word, count) for word, count in word_count.items()
            if count >= min_word_count or word in special_tokens]
    # stable sort: words with equal counts keep their first-seen order
    kept.sort(key=lambda x: x[1], reverse=True)
    vocab = ['<UNK>'] + [word for word, _ in kept]
    print('Words in vocabulary:', len(vocab))
    print('Top 20 words:\n', vocab[:20])

    # construct the vocabulary dictionaries
    wtoi = {w: i for i, w in enumerate(vocab)}
    itow = {i: w for i, w in enumerate(vocab)}

    return vocab, wtoi, itow

def count_captions(captions):
    """Tokenize all captions and return how many there are in total.

    Besides counting, this (re)writes the 'tokenized_captions' list of every
    annotation in `captions`, one `process_caption` result per caption.
    """
    total = 0
    for entry in captions:
        tokenized = [process_caption(item['caption'])
                     for item in entry['captions']]
        entry['tokenized_captions'] = tokenized
        total += len(tokenized)
    print('Number of captions:', total)
    return total

def _encode_caption(tokens, wtoi, max_caption_length):
    """Turn one tokenized caption into at most `max_caption_length` word indices."""
    truncated = len(tokens) >= max_caption_length
    # When the caption fills or overflows the row, the last slot is reserved
    # for the end marker, so only the first max_caption_length-1 words are kept.
    body = tokens[:max_caption_length - 1] if truncated else tokens
    # dict membership is O(1); unknown words fall back to '<UNK>'
    ids = [wtoi[w] if w in wtoi else wtoi['<UNK>'] for w in body]
    if truncated:
        ids.append(wtoi['<E>'])
    return ids


def encode_annotations(anns, vocab, wtoi, itow, datatype, max_caption_length=15,
                       output_dir='resource'):
    """ Encode annotations and save them
    Args:
        anns: annotations, 'list of dictionaries'; each one is read through
            the keys 'image_id', 'image_path' and 'captions', and every
            caption entry through 'caption' and 'id'
        vocab: vocabulary, 'list' (kept for interface compatibility; word
            membership is checked against `wtoi`)
        wtoi: dictionary of word to index of vocabulary; must contain '<E>',
            and '<UNK>' whenever a caption has an out-of-vocabulary word
        itow: dictionary of index to word of vocabulary
        datatype: data type (train|test), 'string'. 'train' encodes every
            caption; any other value encodes one randomly chosen caption per
            image (drawn from the global `random` state, so call
            `random.seed` beforehand for reproducible output)
        max_caption_length : maximum length of each caption when converting
            to the label (must be at least 1)
        output_dir: directory receiving '<datatype>.h5' and '<datatype>.json';
            it must already exist

    Side effects:
        Re-tokenizes `anns` in place via `count_captions`, writes the two
        output files and prints progress messages. Returns None.
    """

    num_captions = count_captions(anns)
    caption_length = np.zeros(num_captions, dtype='uint32')
    caption_labels = np.zeros((num_captions, max_caption_length), dtype='uint32')
    caption_ids = []
    image_ids = []
    image_file_name = []

    for ann in anns:
        img_id = ann['image_id']
        img_path = ann['image_path']
        pairs = list(zip(ann['tokenized_captions'], ann['captions']))

        if datatype != 'train':
            if not pairs:
                # nothing to sample from; randint(0, -1) would raise
                continue
            pairs = [pairs[random.randint(0, len(pairs) - 1)]]

        for cap, info in pairs:
            row = len(caption_ids)

            # numpy data
            ids = _encode_caption(cap, wtoi, max_caption_length)
            caption_length[row] = len(ids)
            caption_labels[row, :len(ids)] = ids

            # json data
            caption_ids.append(info['id'])
            image_file_name.append(img_path)
            image_ids.append(img_id)

    # Outside 'train' mode only one caption per image is written, so drop the
    # unused trailing rows to keep the arrays aligned with the JSON lists.
    num_written = len(caption_ids)
    caption_labels = caption_labels[:num_written]
    caption_length = caption_length[:num_written]

    # save the numpy data
    save_h5py_path = '%s/%s.h5' % (output_dir, datatype)
    # context manager closes the file even if a dataset write fails
    with h5py.File(save_h5py_path, 'w') as f:
        f.create_dataset('caption_labels', dtype='uint32',
                         data=caption_labels)
        f.create_dataset('caption_length', dtype='uint32',
                         data=caption_length)
    print('Save %s h5py file to %s' % (datatype, save_h5py_path))

    # save the list data
    save_json_path = '%s/%s.json' % (output_dir, datatype)
    save_json_file = {}
    save_json_file['wtoi'] = wtoi
    # NOTE: JSON object keys are strings, so the integer keys of itow come
    # back as strings when this file is loaded again.
    save_json_file['itow'] = itow
    save_json_file['caption_ids'] = caption_ids
    save_json_file['image_ids'] = image_ids
    save_json_file['image_path'] = image_file_name
    save_json_file['max_caption_length'] = max_caption_length
    write_json(save_json_path, save_json_file)
    print('Save %s json file to %s' % (datatype, save_json_path))
    print('The number of saved captions: %d\n' % len(save_json_file['image_ids']))

### Load MSCOCO Caption Annotations for train/test

In [3]:
# Paths of the caption-info JSON files for the two splits
train_info_file = 'resource/train_caption_info.json'
test_info_file = 'resource/test_caption_info.json'

# Parse both files; the results are passed to create_vocab and
# encode_annotations in the cells below
train_info = load_json(train_info_file)
test_info = load_json(test_info_file)

### Encoding annotations

In [5]:
# Construct vocabulary for training data
# (only train_info is used for word counts; create_vocab also stores a
# 'tokenized_captions' list on every entry of train_info)
vocab, wtoi, itow = create_vocab(train_info)

('Total words:', 7353)
('Words in vocabulary:', 3244)
('Top 20 words:\n', ['<UNK>', u'a', '<S>', '<E>', u'.', u'on', u'of', u'the', u'in', u'with', u'and', u'is', u'man', u'to', u'sitting', u'an', u'two', u',', u'at', u'standing'])


In [10]:
# encode labels and lengths of captions
# The 'test' call keeps one randomly chosen caption per image, so seed the
# global RNG first to make the saved files reproducible across re-runs.
random.seed(0)
encode_annotations(train_info, vocab, wtoi, itow, 'train')
encode_annotations(test_info, vocab, wtoi, itow, 'test')

('Number of captions:', 25010)
Save train h5py file to resource/train.h5
Save train json file to resource/train.json
The number of saved captions: 25010

('Number of captions:', 5001)
Save test h5py file to resource/test.h5
Save test json file to resource/test.json
The number of saved captions: 1000

