In [39]:
import os
import json
import h5py
import nltk.tokenize
import numpy as np

In [41]:
def load_json(json_path):
    """Parse the JSON file at `json_path` and return the decoded object."""
    with open(json_path, 'r') as handle:
        return json.load(handle)

def write_json(json_path, json_file):
    """Serialize `json_file` as JSON into `json_path`, overwriting any existing file."""
    with open(json_path, 'w') as handle:
        json.dump(json_file, handle)


def process_caption(caption):
    """Lower-case and tokenize `caption`, wrapping the tokens in '<S>' ... '<E>'.

    Returns a new list: the start marker, the NLTK word tokens of the
    lower-cased caption, then the end marker.
    """
    words = nltk.tokenize.word_tokenize(caption.lower())
    return ['<S>', *words, '<E>']
    
    
def create_vocab(captions, min_word_count=3):
    """Construct the vocabulary from a list of caption annotations.

    Each annotation is tokenized in place: a 'tokenized_captions' list is
    added to (or replaced on) every entry of `captions`.

    Args:
        captions: A list of caption annotations
                    {
                    'image_id': 'int',
                    'image_path': 'string',
                    'captions': 'list'
                    }
                  where every element of 'captions' is a dict with a
                  'caption' string.
        min_word_count: Words seen fewer than this many times are left out
                        of the vocabulary.

    Returns:
        vocab: list of words, '<UNK>' first, then the kept words ordered by
               descending frequency
        wtoi: dictionary mapping word -> index in `vocab`
        itow: dictionary mapping index in `vocab` -> word
        num_caption: total number of captions that were tokenized
    """

    # tokenize the captions, counting them as we go so that the returned
    # `num_caption` is always defined (it used to be an unbound name)
    num_caption = 0
    for cap_ann in captions:
        cap_ann['tokenized_captions'] = []
        for cap in cap_ann['captions']:
            cap_ann['tokenized_captions'].append(
                    process_caption(cap['caption']))
            num_caption += 1

    # count the words in all captions
    word_count = {}
    for cap_ann in captions:
        for cap in cap_ann['tokenized_captions']:
            for word in cap:
                word_count[word] = word_count.get(word, 0) + 1
    print('Total words:', len(word_count))

    # discard the words that occur fewer than min_word_count times and
    # order the rest from most to least frequent
    kept = [(word, count) for word, count in word_count.items()
            if count >= min_word_count]
    kept.sort(key=lambda pair: pair[1], reverse=True)
    # index 0 is reserved for the unknown-word token
    vocab = ['<UNK>'] + [word for word, _ in kept]
    print('Words in vocabulary:', len(vocab))
    print('Top 20 words:\n', vocab[:20])

    # construct the vocabulary dictionaries
    wtoi = {w: i for i, w in enumerate(vocab)}
    itow = {i: w for i, w in enumerate(vocab)}

    return vocab, wtoi, itow, num_caption

def count_captions(captions):
    """Tokenize every caption in place and return how many there are.

    Each entry of `captions` gets a fresh 'tokenized_captions' list holding
    the `process_caption` result for each of its 'captions'. The total is
    also printed.
    """
    total = 0
    for entry in captions:
        tokens = entry['tokenized_captions'] = []
        for item in entry['captions']:
            tokens.append(process_caption(item['caption']))
        total += len(tokens)
    print('Number of captions:', total)
    return total

def encode_annotations(anns, vocab, wtoi, itow, datatype, max_question_length=15):
    """Encode captions as vocabulary indices and save them to disk.

    Every annotation is (re-)tokenized in place through `count_captions`,
    then each caption becomes one row of word indices. Two files are written
    relative to the current working directory:

      * '<datatype>.h5'   - datasets 'question_labels' and 'question_length'
      * '<datatype>.json' - 'wtoi', 'itow', 'question_ids', 'image_ids' and
                            'image_path' (one list entry per caption)

    Args:
        anns: annotations, 'list of dictionaries'; each needs 'image_id',
              'image_path' and 'captions' (dicts with 'caption' and 'id')
        vocab: vocabulary, 'list'
        wtoi: dictionary of word to index of vocabulary; must contain
              '<UNK>' and '<E>' whenever they are needed
        itow: dictionary of index to word of vocabulary
        datatype: data type (train|test), 'string'; used as the output file stem
        max_question_length: maximum length of each question when converting
                             to the label. Longer captions are cut and the
                             last slot is forced to the '<E>' index; shorter
                             ones leave the remaining slots at 0.
    """

    num_captions = count_captions(anns)
    question_length = np.zeros(num_captions, dtype='uint32')
    question_labels = np.zeros((num_captions, max_question_length), dtype='uint32')
    question_ids = []
    image_ids = []
    image_file_name = []

    # Membership tests against the vocabulary list are linear per word;
    # build a set once so each lookup is constant time.
    known_words = set(vocab)
    last_slot = max_question_length - 1

    idx_cap = 0
    for ann in anns:
        img_id = ann['image_id']
        img_path = ann['image_path']
        for cap, info in zip(ann['tokenized_captions'], ann['captions']):
            # numpy data
            question_length[idx_cap] = min(len(cap), max_question_length)
            for i, w in enumerate(cap):
                if i == last_slot:
                    # final slot always carries the end token, which
                    # truncates captions longer than max_question_length
                    question_labels[idx_cap, i] = wtoi['<E>']
                    break
                if w in known_words:
                    question_labels[idx_cap, i] = wtoi[w]
                else:
                    question_labels[idx_cap, i] = wtoi['<UNK>']

            # json data
            question_ids.append(info['id'])
            image_file_name.append(img_path)
            image_ids.append(img_id)

            idx_cap += 1

    # save the numpy data; the context manager closes the file even if
    # writing one of the datasets fails
    save_h5py_path = '%s.h5' % datatype
    with h5py.File(save_h5py_path, 'w') as f:
        f.create_dataset('question_labels', dtype='uint32',
                         data=question_labels)
        f.create_dataset('question_length', dtype='uint32',
                         data=question_length)
    print('Save %s h5py file to %s' % (datatype, save_h5py_path))

    # save the list data
    # NOTE: JSON object keys are strings, so the integer keys of `itow`
    # come back as strings when this file is loaded again.
    save_json_path = '%s.json' % datatype
    save_json_file = {
        'wtoi': wtoi,
        'itow': itow,
        'question_ids': question_ids,
        'image_ids': image_ids,
        'image_path': image_file_name,
    }
    write_json(save_json_path, save_json_file)
    print('Save %s json file to %s' % (datatype, save_json_path))

### Load MSCOCO Caption Annotations for train/test

In [3]:
# Caption annotation files for each split, relative to the working directory
train_info_file, test_info_file = 'train_caption_info.json', 'test_caption_info.json'

# Parse each annotation file into memory
train_info = load_json(train_info_file)
test_info = load_json(test_info_file)

### Encoding annotations

In [42]:
# Construct the vocabulary from the training captions only.
# Side effect: every entry of train_info gains a 'tokenized_captions' list.
vocab, wtoi, itow, num_caption = create_vocab(train_info)

Total words: 7353
Words in vocabulary: 3244
Top 20 words:
 ['<UNK>', 'a', '<E>', '<S>', '.', 'on', 'of', 'the', 'in', 'with', 'and', 'is', 'man', 'to', 'sitting', 'an', 'two', ',', 'at', 'standing']


In [43]:
# Encode caption labels and lengths for both splits using the training vocabulary.
# Each call re-tokenizes its annotations and writes '<datatype>.h5' and
# '<datatype>.json' (here train.* and test.*) to the working directory.
encode_annotations(train_info, vocab, wtoi, itow, 'train')
encode_annotations(test_info, vocab, wtoi, itow, 'test')

Number of captions: 25010
Save train h5py file to train.h5
Number of captions: 5001
Save test h5py file to test.h5
