## Importing Required Libraries

In [7]:
import nltk   ## NLP Library
import pickle
from collections import Counter
from pycocotools.coco import COCO  ## Library for using COCO Dataset
import os
from PIL import Image ## Image Processing
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import os
import pickle
import numpy as np
import nltk
from PIL import Image

## Tokenization and Building Vocabulary from Captions:

In [None]:
class Vocabulary(object):
    """Two-way lookup between words and consecutive integer ids."""

    def __init__(self):
        # `idx` is the next id to hand out; ids are assigned in insertion order.
        self.idx = 0
        self.idx2word = {}
        self.word2idx = {}

    def add_word(self, word):
        """Register `word` under the next free id; known words are left alone."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of `word`, falling back to the id of '<unk>'.

        Raises KeyError when the word is unknown and '<unk>' was never added.
        """
        key = word if word in self.word2idx else '<unk>'
        return self.word2idx[key]

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.word2idx)

def build_vocab(json, threshold):
    """Build a Vocabulary from the captions of a COCO annotation file.

    Every caption is lower-cased and tokenized with NLTK's Treebank
    tokenizer; progress is printed every 1000 captions.

    Args:
        json: path to the COCO caption annotation file.
        threshold: minimum number of occurrences a token needs to be kept.

    Returns:
        A Vocabulary holding '<pad>', '<start>', '<end>' and '<unk>'
        (ids 0-3, in that order) followed by every token whose count is
        at least `threshold`.
    """
    coco = COCO(json)
    counter = Counter()
    # One tokenizer instance is enough; building it per caption is wasted work.
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    total = len(coco.anns)
    for i, ann in enumerate(coco.anns.values(), start=1):
        caption = str(ann['caption'])
        counter.update(tokenizer.tokenize(caption.lower()))

        if i % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i, total))

    # Tokens seen fewer than `threshold` times are discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Special tokens go in first so that they get the lowest ids.
    vocab = Vocabulary()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    # Remaining tokens keep the order in which they were first counted.
    for word in words:
        vocab.add_word(word)
    return vocab

def main():
    """Build the caption vocabulary and pickle it to disk."""
    min_count = 4
    annotations = '/home/navish/Desktop/MSCOCO/annotations/captions_train2014.json'
    destination = '/home/navish/Desktop/MSCOCO/vocab.pkl'

    vocabulary = build_vocab(json=annotations, threshold=min_count)
    with open(destination, 'wb') as sink:
        pickle.dump(vocabulary, sink)

    print("Total vocabulary size: {}".format(len(vocabulary)))
    print("Saved the vocabulary wrapper to '{}'".format(destination))


if __name__ == '__main__':
    main()

In [9]:
# Reload the pickled vocabulary built above. Unpickling needs the Vocabulary
# class to be defined in the current session.
with open('/home/navish/Desktop/MSCOCO/vocab.pkl', mode='rb') as vocab_file:
    vocab = pickle.load(vocab_file)
vocab

<__main__.Vocabulary at 0x7fa0e8298550>

## Resizing Images and Storing them in Parent Directory :  

In [None]:
def resize_image(image, size):
    """Return a copy of `image` resized to `size` (width, height).

    Uses the Lanczos filter. `Image.ANTIALIAS` was an alias for it that newer
    Pillow releases removed, so `Image.LANCZOS` is spelled out here, matching
    the `load_image` helper further down in this file.
    """
    return image.resize(size, Image.LANCZOS)

def resize_images(image_dir, output_dir, size):
    """Resize every file in `image_dir` and save it under `output_dir`.

    Args:
        image_dir: directory containing the source images.
        output_dir: directory to write the resized images to; created if
            it does not exist.
        size: target (width, height) passed to `resize_image`.

    Each output file keeps the source file name. Progress is printed every
    100 images.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    images = os.listdir(image_dir)
    num_images = len(images)
    for i, image in enumerate(images, start=1):
        # Read-only access is sufficient; 'r+b' needlessly required write
        # permission on the source dataset.
        with open(os.path.join(image_dir, image), 'rb') as f:
            with Image.open(f) as img:
                resized = resize_image(img, size)
        # The resized copy has no format of its own, so Pillow chooses the
        # encoder from the file extension (as the original call effectively did).
        resized.save(os.path.join(output_dir, image))
        if i % 100 == 0:
            print ("[{}/{}] Resized the images and saved into '{}'."
                   .format(i, num_images, output_dir))

def main():
    """Resize the raw images to 256x256 and store them in the resized folder."""
    # NOTE(review): the source folder is 'test2014' while the destination is
    # named 'resizedval2014' - confirm that this pairing is intended.
    target_size = (256, 256)
    source_dir = '/home/navish/Desktop/MSCOCO/test2014/'
    destination_dir = '/home/navish/Desktop/MSCOCO/resizedval2014/'
    resize_images(source_dir, destination_dir, target_size)


if __name__ == '__main__':
    main()

## Data Loading | Format required for Pytorch Data Loader

In [50]:
class CocoDataset(data.Dataset):
    """COCO caption dataset usable with torch.utils.data.DataLoader.

    One item corresponds to one caption annotation, so an image appears once
    for each of its captions.
    """

    def __init__(self, root, json, vocab, transform=None):
        """Store the image folder, annotation index and vocabulary.

        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper (callable mapping a token to its id).
            transform: optional image transformer applied to each PIL image.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns)
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """Return the (image, caption ids) pair for annotation `index`."""
        annotation = self.coco.anns[self.ids[index]]
        caption_text = annotation['caption']
        file_name = self.coco.loadImgs(annotation['image_id'])[0]['file_name']

        # Force three channels so that greyscale files match the rest.
        image = Image.open(os.path.join(self.root, file_name)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Caption string -> '<start>' + word ids + '<end>'.
        words = nltk.tokenize.TreebankWordTokenizer().tokenize(caption_text.lower())
        to_id = self.vocab
        word_ids = [to_id('<start>')]
        word_ids += [to_id(word) for word in words]
        word_ids.append(to_id('<end>'))
        return image, torch.Tensor(word_ids)

    def __len__(self):
        """Number of caption annotations."""
        return len(self.ids)


def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (image, caption).

    A custom collate_fn is needed because the default one cannot merge
    captions of different lengths; here they are right-padded with zeros
    (id 0 is the first token `build_vocab` registers, '<pad>').

    Args:
        data: list of tuple (image, caption). The list is NOT modified.
            - image: torch tensor; all images in the batch must share one
              shape, since they are stacked.
            - caption: 1-D torch tensor of variable length.

    Returns:
        images: the images stacked along a new first (batch) dimension.
        targets: long tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.

    All three outputs are ordered by caption length, longest first.
    """
    # sorted() rather than list.sort(): the caller's list must not be
    # reordered as a side effect of collating it.
    ordered = sorted(data, key=lambda pair: len(pair[1]), reverse=True)
    images, captions = zip(*ordered)

    # Merge images (tuple of N-D tensors -> one (N+1)-D tensor).
    images = torch.stack(images, 0)

    # Merge captions (tuple of 1-D tensors -> zero-padded 2-D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for row, (cap, end) in enumerate(zip(captions, lengths)):
        targets[row, :end] = cap[:end]
    return images, targets, lengths

def get_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
    """Build a torch.utils.data.DataLoader over the COCO caption dataset.

    Each iteration of the loader yields what `collate_fn` returns:
    (images, captions, lengths), i.e. the stacked image batch, the padded
    caption id matrix and the list of true caption lengths.
    """
    dataset = CocoDataset(root=root, json=json, vocab=vocab, transform=transform)
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

## Defining Encoder and Decoder Architectures:

In [11]:
import torch
import torch.autograd
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence


class EncoderCNN(nn.Module):
    """Image encoder: pretrained ResNet-152 body plus a trainable projection."""

    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and swap its classifier for a linear layer.

        Args:
            embed_size: size of the feature vector produced per image.
        """
        super().__init__()
        backbone = models.resnet152(pretrained=True)
        # Keep every child module except the final fully-connected layer.
        self.resnet = nn.Sequential(*list(backbone.children())[:-1])
        self.linear = nn.Linear(backbone.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Return one `embed_size` feature vector per input image."""
        # The ResNet body runs without gradient tracking; only the linear and
        # batch-norm layers below take part in backpropagation.
        with torch.no_grad():
            pooled = self.resnet(images)
        flat = pooled.reshape(pooled.size(0), -1)
        return self.bn(self.linear(flat))


class DecoderRNN(nn.Module):
    """LSTM caption decoder that is primed with an image feature vector."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Create the embedding, LSTM and output projection layers.

        `max_seq_length` is the number of tokens `sample` generates; it is
        stored in the attribute `max_seg_length` (spelling kept as is).
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length

    def forward(self, features, captions, lengths):
        """Return vocabulary scores for every packed (non-padded) time step."""
        token_vectors = self.embed(captions)
        # The image feature is fed as the first element of the sequence.
        sequence = torch.cat((features.unsqueeze(1), token_vectors), 1)
        packed = pack_padded_sequence(sequence, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        # hiddens[0] is the flat data tensor of the packed LSTM output.
        return self.linear(hiddens[0])

    def sample(self, features, states=None):
        """Greedily generate `max_seg_length` token ids per feature vector."""
        step_input = features.unsqueeze(1)                        # (batch_size, 1, embed_size)
        picks = []
        for _ in range(self.max_seg_length):
            hiddens, states = self.lstm(step_input, states)       # (batch_size, 1, hidden_size)
            scores = self.linear(hiddens.squeeze(1))              # (batch_size, vocab_size)
            predicted = scores.max(1)[1]                          # (batch_size)
            picks.append(predicted)
            # The chosen token becomes the next LSTM input.
            step_input = self.embed(predicted).unsqueeze(1)       # (batch_size, 1, embed_size)
        return torch.stack(picks, 1)                              # (batch_size, max_seq_length)

## Training the model to reduce Perplexity : 

In [None]:
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
from torch.autograd import Variable
from sklearn.metrics import confusion_matrix,classification_report

# Device configuration
# torch.cuda.current_device() raises when CUDA is unavailable, which made the
# CPU fallback below unreachable; only query it when a GPU is actually usable.
x = torch.cuda.current_device() if torch.cuda.is_available() else None
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def main():
    """Train the encoder/decoder captioning model on the resized COCO images.

    Only the decoder and the encoder's linear and batch-norm layers are
    handed to the optimizer. Loss and perplexity are printed every
    `log_step` batches and both state dicts are saved every `save_step`
    batches.
    """
    # Locations
    model_path = '/home/navish/Desktop/MSCOCO/models/'
    vocab_path = '/home/navish/Desktop/MSCOCO/vocab.pkl'
    image_dir = '/home/navish/Desktop/MSCOCO/resized2014'
    caption_path = '/home/navish/Desktop/MSCOCO/annotations/captions_train2014.json'

    # Model parameters
    embed_size = 256
    hidden_size = 512
    num_layers = 1

    # Training schedule
    crop_size = 224
    num_epochs = 60
    batch_size = 128
    num_workers = 2
    learning_rate = 0.001
    log_step = 10
    save_step = 100

    # Create model directory
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Augmentation followed by the normalization expected by the pretrained resnet
    normalize = transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))
    transform = transforms.Compose([
        transforms.RandomCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(image_dir, caption_path, vocab,
                             transform, batch_size,
                             shuffle=True, num_workers=num_workers)

    # Build the models on the configured device
    placement = dict(device=device, dtype=torch.float32, non_blocking=True)
    encoder = EncoderCNN(embed_size).to(**placement)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).to(**placement)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = [
        *decoder.parameters(),
        *encoder.linear.parameters(),
        *encoder.bn.parameters(),
    ]
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Move the mini-batch to the device and flatten the targets the
            # same way the decoder flattens its outputs.
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            # NOTE(review): this looks redundant, since the decoder output
            # already depends on trainable parameters; kept unchanged.
            outputs.requires_grad_(True)
            loss = criterion(outputs, targets)

            # Backward pass and parameter update
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % log_step == 0:
                loss_value = loss.item()
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, num_epochs, i, total_step, loss_value, np.exp(loss_value)))

            # Save the model checkpoints (decoder first, then encoder)
            if (i + 1) % save_step == 0:
                decoder_file = 'decoder-bid-{}-{}.ckpt'.format(epoch + 1, i + 1)
                encoder_file = 'encoder-bid-{}-{}.ckpt'.format(epoch + 1, i + 1)
                torch.save(decoder.state_dict(), os.path.join(model_path, decoder_file))
                torch.save(encoder.state_dict(), os.path.join(model_path, encoder_file))


if __name__ == '__main__':
    main()

* Some of the images are __greyscale__, so they must be converted to three __RGB channels__ to match the input format the model expects.

In [None]:
#grey_scale to 3 channel conversion
from os import listdir
from os.path import isfile, join
from PIL import Image 
import numpy as np

# The directory is named once so that listdir() and isfile() always look at
# the same place. The original isfile() call used a misspelled path
# ('nsvish'), which made `files` empty and turned the loop into a no-op.
resized_dir = '/home/navish/Desktop/MSCOCO/resizedval2014/'
files = [f for f in listdir(resized_dir) if isfile(join(resized_dir, f))]
for i, file_name in enumerate(files):
    image_path = join(resized_dir, file_name)
    # Read the pixels and release the file handle before overwriting the file.
    with Image.open(image_path) as img:
        A = np.asarray(img)
    # A 2-D (256, 256) array has no channel axis, i.e. a single-channel image.
    if A.shape == (256, 256):
        print(i)
        # Repeat the single channel three times along a new last axis.
        stacked_img = np.stack((A,) * 3, -1)
        nimg = Image.fromarray(stacked_img, 'RGB')
        nimg.save(image_path)

##  Real Caption Generation for Test-Images(Without Language Modelling):

In [None]:
# import torch
import matplotlib.pyplot as plt
import numpy as np 
import pickle 
import os
from torchvision import transforms 
#from build_vocab import Vocabulary
#from model import EncoderCNN, DecoderRNN3
from PIL import Image


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_image(image_path, transform=None):
    """Load an image as 224x224 RGB, optionally transformed into a batch of one.

    Args:
        image_path: path of the image file.
        transform: optional callable applied to the PIL image; its result
            gets a leading batch dimension via `unsqueeze(0)`.

    Returns:
        The resized RGB PIL image when `transform` is None, otherwise the
        transformed image with a leading batch dimension.
    """
    # Convert to RGB like CocoDataset does, so greyscale files do not break
    # the 3-channel normalization; the context manager closes the file.
    with Image.open(image_path) as source:
        image = source.convert('RGB').resize([224, 224], Image.LANCZOS)

    if transform is not None:
        image = transform(image).unsqueeze(0)

    return image

def main():
    """Caption every image in the resized folder and pickle the captions.

    The vocabulary, the preprocessing pipeline and both networks are built
    once, before the loop; the original rebuilt them (including the
    pretrained ResNet and both checkpoint loads) for every single image.
    Captions are stored in directory-listing order in 'out_cap.pkl'.
    """
    from os import listdir
    from os.path import isfile, join

    image_dir = '/home/navish/Desktop/MSCOCO/resizedval2014/'
    encoder_path = '/home/navish/Desktop/MSCOCO/models/encoder-bid-60-3200.ckpt'
    decoder_path = '/home/navish/Desktop/MSCOCO/models/decoder-bid-60-3200.ckpt'
    vocab_path = '/home/navish/Desktop/MSCOCO/vocab.pkl'
    out_caption_path = '/home/navish/Desktop/MSCOCO/out_cap.pkl'

    # Model parameters (must match the ones used for training)
    embed_size = 256
    hidden_size = 512
    num_layers = 1

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models in eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters; map_location lets checkpoints that
    # were saved on a GPU be loaded on whatever `device` is in use here.
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    files = [f for f in listdir(image_dir) if isfile(join(image_dir, f))]
    out_cap = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for i, file_name in enumerate(files):
            # Prepare an image
            image_tensor = load_image(join(image_dir, file_name), transform).to(device)

            # Generate a caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()      # (1, max_seq_length) -> (max_seq_length)

            # Convert word ids to words, stopping at the end marker
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[int(word_id)]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)
            print(sentence)
            out_cap.append(sentence)
            print ("appended", i)

    with open(out_caption_path, 'wb') as g:
        pickle.dump(out_cap, g)


if __name__ == '__main__':
    main()

* __Mapping ann_id to img_id__

In [46]:
# Group annotation ids by image: despite its name, `ann2img` maps
# image_id -> [ann_id, ...].
# NOTE(review): the variable `json` holds a path and hides the json module
# for anything that runs after this cell without re-importing it.
ann2img = {}
json = '/home/sanjeet/Desktop/MSCOCO/annotations/captions_test2014.json'
coco = COCO(json)
ids = list(coco.anns.keys())
for ann_id in ids:
    img_id = coco.anns[ann_id]['image_id']
    if img_id not in ann2img:
        ann2img[img_id] = []
    ann2img[img_id].append(ann_id)
print(ann2img)

loading annotations into memory...
Done (t=0.38s)
creating index...
index created!
{}


## Calculating BLEU Scores:

In [None]:
from os import listdir
from os.path import isfile, join

vocab_path = '/home/navish/Desktop/MSCOCO/out_cap.pkl'
with open(vocab_path, 'rb') as g:
    out_caption = pickle.load(g)

from nltk.translate.bleu_score import sentence_bleu

# sentence_bleu expects token lists. Passing raw strings (as before) makes it
# compare characters instead of words, so both sides are tokenized here.
resized_dir = '/home/navish/Desktop/MSCOCO/resizedval2014/'
bleu_tokenizer = nltk.tokenize.TreebankWordTokenizer()
scores = []
files = [f for f in listdir(resized_dir) if isfile(join(resized_dir, f))]
for i, file_name in enumerate(files):
    # The image id is the number after the last underscore of the file name;
    # parsing it this way does not depend on the length of the name prefix.
    image_id = int(file_name.rsplit('.', 1)[0].rsplit('_', 1)[-1])
    indices = ann2img[image_id]
    reference = []
    for ann_id in indices:
        caption = coco.anns[ann_id]['caption']
        reference.append(bleu_tokenizer.tokenize(caption.lower()))
        print("Reference Caption ", ann_id, " : ", caption)
    # Drop the markers by token rather than by character offsets, so a caption
    # that never produced '<end>' does not lose real text.
    candidate = [w for w in out_caption[i].split() if w not in ('<start>', '<end>')]
    score = sentence_bleu(reference, candidate)
    print(score)
    scores.append(score)
if scores:
    print("Average = ", (sum(scores) / len(scores)))
else:
    print("No captions were scored.")

In [None]:
from os import listdir
from os.path import isfile, join
import pickle
import json 
# Collected in `api_records` rather than `data`, so the `torch.utils.data`
# alias imported as `data` at the top of the notebook is not overwritten.
api_records = []
vocab_path = '/home/navish/Desktop/MSCOCO/out_cap.pkl'
with open(vocab_path, 'rb') as g:
    out_caption = pickle.load(g)

# One directory variable for both listdir() and isfile(): the original
# isfile() call used a misspelled path ('nsvish'), so nothing was exported.
resized_dir = '/home/navish/Desktop/MSCOCO/resizedval2014/'
files = [f for f in listdir(resized_dir) if isfile(join(resized_dir, f))]
for i, file_name in enumerate(files):
    # Image id = number after the last underscore of the file name.
    image_id = int(file_name.rsplit('.', 1)[0].rsplit('_', 1)[-1])
    # Remove the start/end markers by token instead of fixed character offsets.
    candidate = ' '.join(
        w for w in out_caption[i].split() if w not in ('<start>', '<end>'))
    print(image_id)
    print(candidate)
    api_records.append({
        'image_id': image_id,
        'caption': candidate,
    })
with open('apiData.json', 'w') as outfile:
    json.dump(api_records, outfile)

print("END")

## Language Modelling to improve Caption quality

In [None]:
import os
import sys
import subprocess
import tempfile
import itertools

# Path to the Stanford CoreNLP jar. The tokenizer refers to it as
# STANFORD_CORENLP_3_9_1_JAR (matching the jar version), which was never
# defined; the older 3_4_1 name is kept as an alias for existing references.
STANFORD_CORENLP_3_9_1_JAR = '/home/navish/Downloads/stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
STANFORD_CORENLP_3_4_1_JAR = STANFORD_CORENLP_3_9_1_JAR

# Tokens that are removed from the tokenized sentences.
PUNCTUATIONS = [
    "''", "'", "``", "`",
    "-LRB-", "-RRB-", "-LCB-", "-RCB-",
    ".", "?", "!", ",", ":", "-", "--", "...", ";",
]

class PTBTokenizer:
    """Python wrapper of Stanford PTBTokenizer"""

    def tokenize(self, captions_for_image):
        """Tokenize captions by running the Stanford PTBTokenizer via Java.

        Args:
            captions_for_image: dict mapping an image id to a list of
                dicts, each holding the text under the key 'caption'.

        Returns:
            dict mapping each image id to a list of space-joined token
            strings (one per caption, in input order) with every token in
            PUNCTUATIONS removed. The tokenizer runs with '-lowerCase'.

        Raises:
            subprocess.CalledProcessError: if the Java process exits with
                a non-zero status.
        """
        # The jar path is absolute, so no particular working directory is
        # required (the original relied on __file__, which a notebook lacks).
        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
               'edu.stanford.nlp.process.PTBTokenizer',
               '-preserveLines', '-lowerCase']

        # ======================================================
        # prepare data for PTB Tokenizer
        # ======================================================
        final_tokenized_captions_for_image = {}
        # One image id per caption, in the same order as the sentences below.
        image_id = [k for k, v in captions_for_image.items() for _ in v]
        # One caption per line; embedded newlines would break that alignment.
        sentences = '\n'.join(
            c['caption'].replace('\n', ' ')
            for v in captions_for_image.values() for c in v)

        # ======================================================
        # save sentences to temporary file
        # ======================================================
        tmp_file = tempfile.NamedTemporaryFile(
            mode='w', encoding='utf-8', delete=False)
        try:
            with tmp_file:
                tmp_file.write(sentences)

            # ======================================================
            # tokenize sentence
            # ======================================================
            p_tokenizer = subprocess.Popen(cmd + [tmp_file.name],
                                           stdout=subprocess.PIPE)
            token_bytes = p_tokenizer.communicate()[0]
        finally:
            # remove temp file, also when the tokenizer could not be run
            os.remove(tmp_file.name)
        if p_tokenizer.returncode != 0:
            raise subprocess.CalledProcessError(p_tokenizer.returncode, cmd)
        # The pipe delivers bytes; decode before splitting into lines.
        lines = token_bytes.decode('utf-8').split('\n')

        # ======================================================
        # create dictionary for tokenized captions
        # ======================================================
        for k, line in zip(image_id, lines):
            tokenized_caption = ' '.join(
                w for w in line.rstrip().split(' ') if w not in PUNCTUATIONS)
            final_tokenized_captions_for_image.setdefault(k, []).append(
                tokenized_caption)

        return final_tokenized_captions_for_image

In [None]:
#from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

class COCOEvalCap:
    """Runs the caption metrics (BLEU, METEOR, ROUGE_L, CIDEr) and stores results.

    After `evaluate()`, `eval` holds the corpus-level score per metric,
    `imgToEval` the per-image scores keyed by image id, and `evalImgs` the
    same per-image dicts as a list.
    """

    def __init__(self, images, gts, res):
        self.params = {'image_id': images}
        self.gts = gts
        self.res = res
        self.eval = {}
        self.imgToEval = {}
        self.evalImgs = []

    def evaluate(self):
        """Tokenize references and results, then compute every metric."""
        imgIds = self.params['image_id']

        # =================================================
        # Tokenize references and candidates
        # =================================================
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(self.gts)
        res = tokenizer.tokenize(self.res)

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr"),
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print ('computing %s score...'%(scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if type(method) is list:
                # A scorer with several metric names returns parallel lists.
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, imgIds, m)
                    print ("%s: %0.3f"%(m, sc))
                continue
            self.setEval(score, method)
            self.setImgToEvalImgs(scores, imgIds, method)
            print("%s: %0.3f"%(method, score))
        self.setEvalImgs()

    def setEval(self, score, method):
        """Record the corpus-level `score` under the metric name `method`."""
        self.eval[method] = score

    def setImgToEvalImgs(self, scores, imgIds, method):
        """Record per-image scores; `scores` and `imgIds` are paired by position."""
        for imgId, score in zip(imgIds, scores):
            entry = self.imgToEval.setdefault(imgId, {"image_id": imgId})
            entry[method] = score

    def setEvalImgs(self):
        """Refresh `evalImgs` from the per-image score table."""
        self.evalImgs = list(self.imgToEval.values())


def calculate_metrics(rng, datasetGTS, datasetRES):
    """Score generated captions against reference captions.

    Args:
        rng: iterable of image ids to evaluate.
        datasetGTS: dict whose 'annotations' list holds the reference
            annotations (dicts with at least 'image_id').
        datasetRES: dict of the same layout holding the generated captions.

    Returns:
        The `eval` dict of the COCOEvalCap run (metric name -> score).

    Raises:
        KeyError: if an id in `rng` has no annotation in either dataset.
    """
    def group_by_image(annotations):
        # image_id -> list of its annotations, in input order
        grouped = {}
        for ann in annotations:
            grouped.setdefault(ann['image_id'], []).append(ann)
        return grouped

    imgToAnnsGTS = group_by_image(datasetGTS['annotations'])
    imgToAnnsRES = group_by_image(datasetRES['annotations'])

    gts = {}
    res = {}
    for imgId in rng:
        gts[imgId] = imgToAnnsGTS[imgId]
        res[imgId] = imgToAnnsRES[imgId]

    evaluator = COCOEvalCap(rng, gts, res)
    evaluator.evaluate()
    return evaluator.eval


In [None]:
import json
from pycocotools.coco import COCO
import collections

# Generated captions, grouped per image id.
with open('apiData.json') as f:
    d = json.load(f)
da = {}
for entry in d:
    da.setdefault(entry['image_id'], []).append(entry['caption'])

# The path gets its own name so that it does not overwrite the json module
# imported above.
annotation_file = '/home/sanjeet/Desktop/MSCOCO/annotations/captions_val2014.json'
coco = COCO(annotation_file)
ids = list(coco.anns.keys())
imgcap = {}    # image id -> list of reference captions
img_arr = []   # image id of every annotation (one entry per caption)
for ann_id in ids:
    ann = coco.anns[ann_id]
    img_arr.append(ann['image_id'])
    imgcap.setdefault(ann['image_id'], []).append(ann['caption'])

od = collections.OrderedDict(sorted(da.items()))

if __name__ == '__main__':
    # Evaluate exactly the images that have a generated caption, each once.
    # `img_arr` holds one id per annotation (duplicates, plus images without
    # a result), which either raised KeyError in calculate_metrics or
    # misaligned the per-image scores.
    rng = list(od.keys())

    res1 = []
    gts1 = []
    for k, v in od.items():
        res1.append({u'image_id': k, u'caption': v[0]})
        # Up to five references per image, as before, but without failing
        # when an image has fewer than five captions.
        for reference in imgcap[k][:5]:
            gts1.append({u'image_id': k, u'caption': reference})

    datasetRES = {'annotations': res1}
    datasetGTS = {'annotations': gts1}
    print (calculate_metrics(rng, datasetGTS, datasetRES))