# One-to-Many Network for Image Captioning

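This notebook uses the annotated COCO 2014 dataset: it downloads the training images and their captions, then precomputes a VGG19 feature vector for every image.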

In [None]:
# Data preparation
# Create the data directories.
!mkdir data
!mkdir data/coco
# Download the annotations.
!wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
# Unzip annotations to the coco folder
!unzip annotations_trainval2014.zip -d data/coco
# Delete the zip file
!rm annotations_trainval2014.zip
# Create output folder
!mkdir output
!mkdir output/feature_vectors

In [None]:
# Download the data itself
!wget http://images.cocodataset.org/zips/train2014.zip
# Unzip the dataset into data/coco
!unzip train2014.zip -d data/coco
# Free up some space.
!rm train2014.zip

TENSORFLOW

In [None]:
# Imports
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
import pickle
import gzip
import logging
from pathlib import Path
from tqdm import tqdm
tf.get_logger().setLevel(logging.ERROR)

# Locations of the downloaded COCO data and of the generated feature vectors,
# both relative to the notebook's working directory.
TRAINING_FILE_DIR = Path('data') / 'coco'
OUTPUT_FILE_DIR = Path('output') / 'feature_vectors'

In [None]:
# Preprocessing the image captions.
# Parse the caption annotations; the context manager closes the file again.
captions_path = TRAINING_FILE_DIR / 'annotations/captions_train2014.json'
with open(captions_path) as captions:
  data = json.load(captions)

# Map every image id to a list whose first element is the image filename.
image_dict = {image['id']: [image['file_name']] for image in data['images']}
# Append each caption to the list of the image it belongs to, so every entry
# ends up as [filename, caption, caption, ...].
for annotations in data['annotations']:
  image_id, caption = annotations['image_id'], annotations['caption']
  image_dict[image_id].append(caption)

In [None]:
# The captioning model follows an encoder-decoder architecture.
# The encoder is a VGG19 network with pretrained ImageNet weights.
model = VGG19(weights='imagenet')
# model.summary() lists the layer names; the encoder is cut off at the
# layer named 'block5_conv4' and returns that layer's output.
encoder_output = model.get_layer('block5_conv4').output
encoder = Model(inputs=model.input, outputs=encoder_output)
encoder.summary()

In [None]:
# To save on computation, the VGG19 won't be retrained;
# instead, for each image, the output of the encoder's
# forward pass is stored on disk.
RESIZE_SHORT_SIDE = 256  # Length of the shorter image side after resizing.
CROP_SIZE = 224          # Side length of the square center crop.

# Create the output directory if an earlier cell has not already done so.
OUTPUT_FILE_DIR.mkdir(parents=True, exist_ok=True)

# Passing the total lets tqdm show a percentage and an ETA.
for item in tqdm(image_dict.values(), total=len(image_dict), desc='Progress: '):
  # item[0] is the image filename; the remaining elements are captions.
  filename = TRAINING_FILE_DIR / 'train2014' / item[0]
  # Determine dimensions (PIL reports size as (width, height)).
  image = load_img(filename)
  width, height = image.size
  # Resize so the shortest side is 256 pixels while keeping the aspect
  # ratio. Note that target_size is given as (height, width).
  if height > width:
    target_size = (int(height / width * RESIZE_SHORT_SIDE), RESIZE_SHORT_SIDE)
  else:
    target_size = (RESIZE_SHORT_SIDE, int(width / height * RESIZE_SHORT_SIDE))
  image = load_img(filename, target_size=target_size)
  width, height = image.size
  image_np = img_to_array(image)
  # Crop to the center 224x224 region.
  h_start = (height - CROP_SIZE) // 2
  w_start = (width - CROP_SIZE) // 2
  image_np = image_np[
    h_start:h_start + CROP_SIZE,
    w_start:w_start + CROP_SIZE
  ]
  # Rearrange array to have one more
  # dimension representing batch size = 1.
  image_np = np.expand_dims(image_np, axis=0)
  # Call the model and save the resulting tensor to disk.
  X = preprocess_input(image_np)
  # verbose=0 avoids a Keras progress bar being printed for every image
  # in Keras versions where predict() logs by default.
  y = encoder.predict(X, verbose=0)
  save_filename = OUTPUT_FILE_DIR / f'{item[0]}.pickle.gzip'
  # The context manager closes the file even if pickling fails.
  with gzip.open(save_filename, 'wb') as pickle_file:
    pickle.dump(y[0], pickle_file)

# Save the dictionary containing captions and filenames.
# Path objects are joined with `/`; `Path + str` raises a TypeError.
save_filename = OUTPUT_FILE_DIR / 'caption_file.pickle.gz'
# The context manager closes the file even if pickling fails.
with gzip.open(save_filename, 'wb') as pickle_file:
  pickle.dump(image_dict, pickle_file)