# Image Captioning

Observing that people who are blind have relied on (human-based) image captioning services for nearly a decade to learn about the images they take, we introduce the first image captioning dataset to represent this real use case. This new dataset, which we call VizWiz-Captions, consists of 39,181 images originating from people who are blind, each paired with 5 captions. Our proposed challenge addresses the task of predicting a suitable caption given an image. Ultimately, we hope this work will educate more people about the technological needs of blind people while providing an exciting new opportunity for researchers to develop assistive technologies that eliminate accessibility barriers for blind people. Source: https://vizwiz.org/tasks-and-datasets/image-captioning/

The goal of this challenge is to build a single model, similar to the one described in "Show and Tell: A Neural Image Caption Generator" (https://arxiv.org/pdf/1411.4555.pdf), that achieves reasonable results on this task.

In [None]:
from vizwiz_api.vizwiz import VizWiz
from vizwiz_eval_cap.eval import VizWizEvalCap
import matplotlib.pyplot as plt
from PIL import Image
import skimage.io as io
import pylab
import numpy as np
from sklearn.preprocessing import StandardScaler

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms

import json
from jsonpath_ng import jsonpath, parse
from json import encoder
# Intended to make the json encoder write floats with three decimals.
# NOTE(review): `json.encoder.FLOAT_REPR` is a Python 2 era hook and appears
# to be ignored by the Python 3 encoder - confirm before relying on it.
encoder.FLOAT_REPR = lambda o: format(o, '.3f')

In [None]:
 def get_alloc_dicts(set_name, vizwiz=None):
    # be sure if `vizwiz` is set, that it contains the `set_name` dataset
    if (set_name != 'train') and (set_name != 'val') and (set_name != 'test'):
        raise Exception('only "train", "val" or "test" is a valid `set_name`')
    
    if not isinstance(vizwiz, VizWiz):
        ann_path = './annotations/'+set_name+'.json'
        vizwiz = VizWiz(ann_path, ignore_rejected=True, ignore_precanned=True)
    
    img_path_prefix = './images/'+set_name+'/'
    imgIdx_imgPath = {vizwiz.imgs[i]['id']:img_path_prefix+vizwiz.imgs[i]['file_name'] for i in vizwiz.imgs}
    capIdx_imgIdx = {vizwiz.anns[i]['id']:vizwiz.anns[i]['image_id'] for i in vizwiz.anns}
    enumIdx_capIdx = {idx:vizwiz.anns[i]['id'] for idx, i in enumerate(vizwiz.anns)}
    capIdx_cap = {vizwiz.anns[i]['id']:vizwiz.anns[i]['caption'] for i in vizwiz.anns}
        
    return imgIdx_imgPath, capIdx_imgIdx, enumIdx_capIdx, capIdx_cap

In [None]:
# Load the training annotations once and hand the resulting index to
# `get_alloc_dicts`, so it does not have to read the JSON file a second time.
ann_train = './annotations/train.json'
vizwiz_train = VizWiz(ann_train, ignore_rejected=True, ignore_precanned=True)
imgIdx_imgPath_train, capIdx_imgIdx_train, enumIdx_capIdx_train, capIdx_cap_train = get_alloc_dicts('train', vizwiz_train)

In [None]:
# Sanity check: take the caption at position 0, print it, and show the
# image it belongs to.
cap1_idx = enumIdx_capIdx_train[0]
cap1 = capIdx_cap_train[cap1_idx]
print('caption:', cap1)
# caption id -> image id -> image path
img1_idx = capIdx_imgIdx_train[cap1_idx]
img1 = Image.open(imgIdx_imgPath_train[img1_idx])
# Results are bound to `_` so the notebook does not echo the return values.
_ = plt.figure(figsize=(7,5))
_ = plt.imshow(img1)

In [None]:
# Shared resize transform; `resizer` is reused later when loading all images.
resize_shape = (256,256)
resizer = transforms.Compose([transforms.Resize(resize_shape)])

# Preview the effect of the resize on the sample image opened above.
img1_resized = resizer(img1)
_ = plt.figure(figsize=(7,5))
_ = plt.imshow(img1_resized)

In [None]:
def load_train_channel_means_and_sigmas():
    """Read the stored training-set channel statistics from disk.

    Returns:
        Tuple ``(means, sigmas)`` with the arrays stored in
        ``./images/train_means.npy`` and ``./images/train_sigmas.npy``.
    """
    loaded = []
    for stats_file in ('./images/train_means.npy', './images/train_sigmas.npy'):
        with open(stats_file, 'rb') as handle:
            loaded.append(np.load(handle))
    means, sigmas = loaded
    return means, sigmas

def load_imgs(imgIdx_imgPath, resizer, standardized=False):
    """Load, resize and optionally standardize every image of a split.

    Loading all images takes a while; the original author estimates about
    4GB of RAM for the training set.

    Args:
        imgIdx_imgPath: Mapping from image id to image file path. Images are
            loaded in the mapping's iteration order.
        resizer: Callable applied to each opened PIL image before it is
            converted to an array.
        standardized: If true, subtract the stored training channel means and
            divide by the stored sigmas (both read via
            ``load_train_channel_means_and_sigmas``).

    Returns:
        NumPy array stacking the per-image arrays.
    """
    if standardized:
        train_channel_means, train_channels_sigmas = load_train_channel_means_and_sigmas()
    imgs = []
    for img_idx in imgIdx_imgPath:
        # Read the path from the parameter; the previous code used an
        # undefined name here. The `with` block closes the image file once
        # its pixels have been copied into the array.
        with Image.open(imgIdx_imgPath[img_idx]) as img:
            img_resized = np.asarray(resizer(img))
        if standardized:
            # Standardize per image to avoid a second full-size temporary.
            img_resized = (img_resized - train_channel_means) / train_channels_sigmas
        imgs.append(img_resized)

    return np.array(imgs)
        
# Use the training path table built earlier (`imgIdx_imgPath_train`).
# `standardized=True` needs the stored channel statistics files to exist.
X_train = load_imgs(imgIdx_imgPath_train, resizer, standardized=True)

In [None]:
def calc_train_channel_means_and_sigmas(X_train):
    """Compute per-channel mean and standard deviation and store them as .npy.

    The squared deviations are summed in chunks of 1000 images so the
    deviation array never has to be built for the whole input at once.

    Args:
        X_train: Array reduced over its first three axes; the remaining axis
            is treated as the channel axis.

    Returns:
        Tuple ``(means, sigmas)``; the same arrays are written to
        ``./images/train_means.npy`` and ``./images/train_sigmas.npy``.
    """
    reduce_axes = (0, 1, 2)
    chunk = 1000
    n_images = X_train.shape[0]
    n_values = n_images * X_train.shape[1] * X_train.shape[2]

    channel_means = X_train.mean(axis=reduce_axes)

    # sum() starts from the integer 0, matching a manual accumulator.
    squared_dev_total = sum(
        np.sum((X_train[lo:lo + chunk] - channel_means) ** 2, axis=reduce_axes)
        for lo in range(0, n_images, chunk)
    )
    channel_sigmas = np.sqrt(squared_dev_total / n_values)

    for target, values in (
        ('./images/train_means.npy', channel_means),
        ('./images/train_sigmas.npy', channel_sigmas),
    ):
        with open(target, 'wb') as handle:
            np.save(handle, values)

    return channel_means, channel_sigmas
        
# calc_train_channel_means_and_sigmas(X_train)

In [None]:
# Fetch the caption stored under caption id 1 (the table is keyed by caption
# id, not by position); the bare name below makes the notebook display it.
sentence = capIdx_cap_train[1]
sentence

In [None]:
# Disabled draft: the tokenisation snippet is kept as a string bound to `_`,
# so none of it runs.
_ = """data = []
  
# iterate through each sentence in the file
for i in sent_tokenize(sentence):
    temp = []
      
    # tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())
    data.append(temp)"""

## Model Setup

In [None]:
class ImageCaptioning(Dataset):
    """Torch dataset that serves ``(sample, label)`` pairs by index.

    ``X`` and ``y`` are NumPy arrays. ``y`` is cast to int64 and only
    ``y.T[0]`` is kept as the label vector; the dataset length is the length
    of that vector.
    """

    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        labels = torch.from_numpy(y).long()
        self.y = labels.T[0]

    def __len__(self):
        return self.y.size(0)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]