In [1]:
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO
import os
import numpy as np
from PIL import Image

In [2]:
class Vocabulary(object):
    """Simple vocabulary wrapper."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

In [3]:
def build_vocab(json, threshold):
    """Build a simple vocabulary wrapper."""
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." %(i, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Creates a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Adds the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

In [18]:
vocab_path='./pytorch-tutorial/tutorials/03-advanced/image_captioning/data/vocab.pkl'
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: %d" %len(vocab))
print("Saved the vocabulary wrapper to '%s'" %vocab_path)

Total vocabulary size: 9956
Saved the vocabulary wrapper to './pytorch-tutorial/tutorials/03-advanced/image_captioning/data/vocab.pkl'


In [17]:
vocab = build_vocab(json='./pytorch-tutorial/tutorials/03-advanced/image_captioning/data/annotations/captions_train2014.json',
                        threshold=4)

loading annotations into memory...
Done (t=1.38s)
creating index...
index created!
[0/414113] Tokenized the captions.
[1000/414113] Tokenized the captions.
[2000/414113] Tokenized the captions.
[3000/414113] Tokenized the captions.
[4000/414113] Tokenized the captions.
[5000/414113] Tokenized the captions.
[6000/414113] Tokenized the captions.
[7000/414113] Tokenized the captions.
[8000/414113] Tokenized the captions.
[9000/414113] Tokenized the captions.
[10000/414113] Tokenized the captions.
[11000/414113] Tokenized the captions.
[12000/414113] Tokenized the captions.
[13000/414113] Tokenized the captions.
[14000/414113] Tokenized the captions.
[15000/414113] Tokenized the captions.
[16000/414113] Tokenized the captions.
[17000/414113] Tokenized the captions.
[18000/414113] Tokenized the captions.
[19000/414113] Tokenized the captions.
[20000/414113] Tokenized the captions.
[21000/414113] Tokenized the captions.
[22000/414113] Tokenized the captions.
[23000/414113] Tokenized the capt

[206000/414113] Tokenized the captions.
[207000/414113] Tokenized the captions.
[208000/414113] Tokenized the captions.
[209000/414113] Tokenized the captions.
[210000/414113] Tokenized the captions.
[211000/414113] Tokenized the captions.
[212000/414113] Tokenized the captions.
[213000/414113] Tokenized the captions.
[214000/414113] Tokenized the captions.
[215000/414113] Tokenized the captions.
[216000/414113] Tokenized the captions.
[217000/414113] Tokenized the captions.
[218000/414113] Tokenized the captions.
[219000/414113] Tokenized the captions.
[220000/414113] Tokenized the captions.
[221000/414113] Tokenized the captions.
[222000/414113] Tokenized the captions.
[223000/414113] Tokenized the captions.
[224000/414113] Tokenized the captions.
[225000/414113] Tokenized the captions.
[226000/414113] Tokenized the captions.
[227000/414113] Tokenized the captions.
[228000/414113] Tokenized the captions.
[229000/414113] Tokenized the captions.
[230000/414113] Tokenized the captions.


[411000/414113] Tokenized the captions.
[412000/414113] Tokenized the captions.
[413000/414113] Tokenized the captions.
[414000/414113] Tokenized the captions.


In [4]:
def resize_image(image, size):
    """Resize an image to the given size."""
    return image.resize(size, Image.ANTIALIAS)

In [5]:
def resize_images(image_dir, output_dir, size):
    """Resize the images in 'image_dir' and save into 'output_dir'."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    images = os.listdir(image_dir)
    num_images = len(images)
    for i, image in enumerate(images):
        with open(os.path.join(image_dir, image), 'r+b') as f:
            with Image.open(f) as img:
                img = resize_image(img, size)
                img.save(os.path.join(output_dir, image), img.format)
        if i % 100 == 0:
            print ("[%d/%d] Resized the images and saved into '%s'."
                   %(i, num_images, output_dir))

In [7]:
splits = ['train', 'val']
for split in splits:
    image_dir = 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/'+split+'2014'
    output_dir = 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/'+split+'_resized2014'
    image_size = [256, 256]
    resize_images(image_dir, output_dir, image_size)

[0/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/dat

[6400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[6500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[6600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[6700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[6800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[6900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[7000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[7100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_capt

[12800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[12900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[13000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[13100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[13200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[13300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[13400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[13500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[19200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[19300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[19400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[19500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[19600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[19700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[19800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[19900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[25600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[25700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[25800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[25900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[26000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[26100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[26200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[26300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[32000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[32100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[32200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[32300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[32400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[32500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[32600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[32700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[38400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[38500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[38600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[38700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[38800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[38900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[39000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[39100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[44800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[44900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[45000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[45100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[45200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[45300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[45400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[45500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[51200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[51300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[51400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[51500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[51600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[51700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[51800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[51900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[57600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[57700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[57800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[57900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[58000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[58100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[58200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[58300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[64000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[64100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[64200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[64300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[64400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[64500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[64600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[64700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[70400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[70500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[70600/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[70700/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[70800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[70900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[71000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[71100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[76800/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[76900/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[77000/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[77100/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[77200/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[77300/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[77400/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'.
[77500/82783] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/im

[400/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[500/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[600/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[700/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[800/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[900/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[1000/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[1100/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resi

[6900/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[7000/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[7100/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[7200/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[7300/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[7400/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[7500/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[7600/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/va

[13400/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[13500/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[13600/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[13700/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[13800/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[13900/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[14000/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[14100/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning

[19900/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[20000/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[20100/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[20200/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[20300/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[20400/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[20500/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[20600/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning

[26400/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[26500/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[26600/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[26700/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[26800/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[26900/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[27000/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[27100/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning

[32900/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[33000/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[33100/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[33200/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[33300/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[33400/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[33500/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[33600/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning

[39400/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[39500/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[39600/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[39700/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[39800/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[39900/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[40000/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/val_resized2014'.
[40100/40504] Resized the images and saved into 'pytorch-tutorial/tutorials/03-advanced/image_captioning

In [6]:
class CocoDataset(data.Dataset):
    """COCO Custom Dataset compatible with torch.utils.data.DataLoader."""
    def __init__(self, root, json, vocab, transform=None):
        """Set the path for images, captions and vocabulary wrapper.
        
        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper.
            transform: image transformer.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """Returns one data pair (image and caption)."""
        coco = self.coco
        vocab = self.vocab
        ann_id = self.ids[index]
        caption = coco.anns[ann_id]['caption']
        img_id = coco.anns[ann_id]['image_id']
        path = coco.loadImgs(img_id)[0]['file_name']

        image = Image.open(os.path.join(self.root, path)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Convert caption (string) to word ids.
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return image, target

    def __len__(self):
        return len(self.ids)


def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (image, caption).
    
    We should build custom collate_fn rather than using default collate_fn, 
    because merging caption (including padding) is not supported in default.
    Args:
        data: list of tuple (image, caption). 
            - image: torch tensor of shape (3, 256, 256).
            - caption: torch tensor of shape (?); variable length.
    Returns:
        images: torch tensor of shape (batch_size, 3, 256, 256).
        targets: torch tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.
    """
    # Sort a data list by caption length (descending order).
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, captions = zip(*data)

    # Merge images (from tuple of 3D tensor to 4D tensor).
    images = torch.stack(images, 0)

    # Merge captions (from tuple of 1D tensor to 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]        
    return images, targets, lengths


def get_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
    """Returns torch.utils.data.DataLoader for custom coco dataset."""
    # COCO caption dataset
    coco = CocoDataset(root=root,
                       json=json,
                       vocab=vocab,
                       transform=transform)
    
    # Data loader for COCO dataset
    # This will return (images, captions, lengths) for every iteration.
    # images: tensor of shape (batch_size, 3, 224, 224).
    # captions: tensor of shape (batch_size, padded_length).
    # lengths: list indicating valid length for each caption. length is (batch_size).
    data_loader = torch.utils.data.DataLoader(dataset=coco, 
                                              batch_size=batch_size,
                                              shuffle=shuffle,
                                              num_workers=num_workers,
                                              collate_fn=collate_fn)
    return data_loader

In [7]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        self.init_weights()
        
    def init_weights(self):
        """Initialize the weights."""
        self.linear.weight.data.normal_(0.0, 0.02)
        self.linear.bias.data.fill_(0)
        
    def forward(self, images):
        """Extract the image feature vectors."""
        features = self.resnet(images)
        features = Variable(features.data)
        features = features.view(features.size(0), -1)
        features = self.bn(self.linear(features))
        return features

In [8]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()
    
    def init_weights(self):
        """Initialize weights."""
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True) 
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs
    
    def sample(self, features, states=None):
        """Samples captions for given image features (Greedy search)."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(20):                                      # maximum sampling length
            hiddens, states = self.lstm(inputs, states)          # (batch_size, 1, hidden_size), 
            outputs = self.linear(hiddens.squeeze(1))            # (batch_size, vocab_size)
            predicted = outputs.max(1)[1]
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)
            inputs = inputs.unsqueeze(1)                         # (batch_size, 1, embed_size)
        sampled_ids = torch.cat(sampled_ids, 1)                  # (batch_size, 20)
        return sampled_ids.squeeze()

In [9]:
def to_var(x, volatile=False):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)

In [12]:
if not os.path.exists('pytorch-tutorial/tutorials/03-advanced/image_captioning/models/'):
    os.makedirs('pytorch-tutorial/tutorials/03-advanced/image_captioning/models/')

In [10]:
transform = transforms.Compose([ 
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])

In [11]:
with open('pytorch-tutorial/tutorials/03-advanced/image_captioning/data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [12]:
img_path='pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014'
caption_path='pytorch-tutorial/tutorials/03-advanced/image_captioning/data/annotations/captions_train2014.json'
batch_size=128
data_loader = get_loader(img_path,caption_path, vocab, 
                             transform, batch_size,
                             shuffle=True, num_workers=2)

loading annotations into memory...
Done (t=1.18s)
creating index...
index created!


In [13]:
embed_size=256
hidden_size=512
num_layers=1
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)

In [14]:
if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

In [15]:
learning_rate=0.001
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)

In [16]:
log_step=10
save_step=1000
embed_size=256
hidden_size=512
num_layers=1
num_epochs=5
batch_size=128
num_workers=2
learning_rate=0.001
model_path='pytorch-tutorial/tutorials/03-advanced/image_captioning/models/'

In [17]:
total_step = len(data_loader)

In [18]:
for epoch in range(num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, num_epochs, i, total_step, 
                        loss.data[0], np.exp(loss.data[0]))) 
                
            # Save the models
            if (i+1) % save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(), 
                           os.path.join(model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))

Epoch [0/5], Step [0/3236], Loss: 9.1974, Perplexity: 9871.5627
Epoch [0/5], Step [10/3236], Loss: 6.6615, Perplexity: 781.6876
Epoch [0/5], Step [20/3236], Loss: 5.4381, Perplexity: 230.0063
Epoch [0/5], Step [30/3236], Loss: 5.1369, Perplexity: 170.1890
Epoch [0/5], Step [40/3236], Loss: 4.7731, Perplexity: 118.2828
Epoch [0/5], Step [50/3236], Loss: 4.5814, Perplexity: 97.6540
Epoch [0/5], Step [60/3236], Loss: 4.5598, Perplexity: 95.5642
Epoch [0/5], Step [70/3236], Loss: 4.5302, Perplexity: 92.7799
Epoch [0/5], Step [80/3236], Loss: 4.3768, Perplexity: 79.5837
Epoch [0/5], Step [90/3236], Loss: 4.2877, Perplexity: 72.7971
Epoch [0/5], Step [100/3236], Loss: 4.1852, Perplexity: 65.7062
Epoch [0/5], Step [110/3236], Loss: 4.1310, Perplexity: 62.2426
Epoch [0/5], Step [120/3236], Loss: 4.1538, Perplexity: 63.6772
Epoch [0/5], Step [130/3236], Loss: 3.8402, Perplexity: 46.5370
Epoch [0/5], Step [140/3236], Loss: 3.9060, Perplexity: 49.7013
Epoch [0/5], Step [150/3236], Loss: 3.7705, P

Epoch [0/5], Step [1280/3236], Loss: 2.4650, Perplexity: 11.7639
Epoch [0/5], Step [1290/3236], Loss: 2.4901, Perplexity: 12.0625
Epoch [0/5], Step [1300/3236], Loss: 2.6340, Perplexity: 13.9296
Epoch [0/5], Step [1310/3236], Loss: 2.5871, Perplexity: 13.2907
Epoch [0/5], Step [1320/3236], Loss: 2.4211, Perplexity: 11.2581
Epoch [0/5], Step [1330/3236], Loss: 2.5264, Perplexity: 12.5086
Epoch [0/5], Step [1340/3236], Loss: 2.4691, Perplexity: 11.8124
Epoch [0/5], Step [1350/3236], Loss: 2.4604, Perplexity: 11.7100
Epoch [0/5], Step [1360/3236], Loss: 2.5480, Perplexity: 12.7812
Epoch [0/5], Step [1370/3236], Loss: 2.6407, Perplexity: 14.0233
Epoch [0/5], Step [1380/3236], Loss: 2.5918, Perplexity: 13.3533
Epoch [0/5], Step [1390/3236], Loss: 2.4208, Perplexity: 11.2549
Epoch [0/5], Step [1400/3236], Loss: 2.4570, Perplexity: 11.6693
Epoch [0/5], Step [1410/3236], Loss: 2.3073, Perplexity: 10.0476
Epoch [0/5], Step [1420/3236], Loss: 2.3398, Perplexity: 10.3790
Epoch [0/5], Step [1430/3

Epoch [0/5], Step [2550/3236], Loss: 2.4379, Perplexity: 11.4487
Epoch [0/5], Step [2560/3236], Loss: 2.2336, Perplexity: 9.3338
Epoch [0/5], Step [2570/3236], Loss: 2.4801, Perplexity: 11.9420
Epoch [0/5], Step [2580/3236], Loss: 2.4114, Perplexity: 11.1493
Epoch [0/5], Step [2590/3236], Loss: 2.2525, Perplexity: 9.5120
Epoch [0/5], Step [2600/3236], Loss: 2.2871, Perplexity: 9.8460
Epoch [0/5], Step [2610/3236], Loss: 2.0944, Perplexity: 8.1206
Epoch [0/5], Step [2620/3236], Loss: 2.1983, Perplexity: 9.0098
Epoch [0/5], Step [2630/3236], Loss: 2.2431, Perplexity: 9.4229
Epoch [0/5], Step [2640/3236], Loss: 2.2250, Perplexity: 9.2538
Epoch [0/5], Step [2650/3236], Loss: 2.2118, Perplexity: 9.1322
Epoch [0/5], Step [2660/3236], Loss: 2.3679, Perplexity: 10.6752
Epoch [0/5], Step [2670/3236], Loss: 2.3916, Perplexity: 10.9314
Epoch [0/5], Step [2680/3236], Loss: 2.2225, Perplexity: 9.2304
Epoch [0/5], Step [2690/3236], Loss: 2.2422, Perplexity: 9.4139
Epoch [0/5], Step [2700/3236], Loss

Epoch [1/5], Step [600/3236], Loss: 2.1908, Perplexity: 8.9420
Epoch [1/5], Step [610/3236], Loss: 2.1502, Perplexity: 8.5866
Epoch [1/5], Step [620/3236], Loss: 2.0879, Perplexity: 8.0683
Epoch [1/5], Step [630/3236], Loss: 2.2484, Perplexity: 9.4723
Epoch [1/5], Step [640/3236], Loss: 2.1152, Perplexity: 8.2915
Epoch [1/5], Step [650/3236], Loss: 2.1993, Perplexity: 9.0187
Epoch [1/5], Step [660/3236], Loss: 2.0332, Perplexity: 7.6385
Epoch [1/5], Step [670/3236], Loss: 2.1780, Perplexity: 8.8286
Epoch [1/5], Step [680/3236], Loss: 2.0692, Perplexity: 7.9181
Epoch [1/5], Step [690/3236], Loss: 2.2624, Perplexity: 9.6057
Epoch [1/5], Step [700/3236], Loss: 2.1921, Perplexity: 8.9542
Epoch [1/5], Step [710/3236], Loss: 2.1205, Perplexity: 8.3351
Epoch [1/5], Step [720/3236], Loss: 2.2174, Perplexity: 9.1834
Epoch [1/5], Step [730/3236], Loss: 2.1950, Perplexity: 8.9802
Epoch [1/5], Step [740/3236], Loss: 2.2597, Perplexity: 9.5802
Epoch [1/5], Step [750/3236], Loss: 2.2222, Perplexity:

Epoch [1/5], Step [1890/3236], Loss: 2.0883, Perplexity: 8.0713
Epoch [1/5], Step [1900/3236], Loss: 2.0891, Perplexity: 8.0777
Epoch [1/5], Step [1910/3236], Loss: 2.2313, Perplexity: 9.3120
Epoch [1/5], Step [1920/3236], Loss: 2.1863, Perplexity: 8.9020
Epoch [1/5], Step [1930/3236], Loss: 2.1473, Perplexity: 8.5615
Epoch [1/5], Step [1940/3236], Loss: 2.1688, Perplexity: 8.7480
Epoch [1/5], Step [1950/3236], Loss: 2.0900, Perplexity: 8.0851
Epoch [1/5], Step [1960/3236], Loss: 2.1276, Perplexity: 8.3947
Epoch [1/5], Step [1970/3236], Loss: 2.0612, Perplexity: 7.8551
Epoch [1/5], Step [1980/3236], Loss: 2.1858, Perplexity: 8.8974
Epoch [1/5], Step [1990/3236], Loss: 2.1422, Perplexity: 8.5183
Epoch [1/5], Step [2000/3236], Loss: 2.1503, Perplexity: 8.5873
Epoch [1/5], Step [2010/3236], Loss: 2.1082, Perplexity: 8.2331
Epoch [1/5], Step [2020/3236], Loss: 2.0870, Perplexity: 8.0610
Epoch [1/5], Step [2030/3236], Loss: 2.0126, Perplexity: 7.4827
Epoch [1/5], Step [2040/3236], Loss: 2.1

Epoch [1/5], Step [3170/3236], Loss: 2.0404, Perplexity: 7.6936
Epoch [1/5], Step [3180/3236], Loss: 2.1099, Perplexity: 8.2473
Epoch [1/5], Step [3190/3236], Loss: 2.0794, Perplexity: 7.9998
Epoch [1/5], Step [3200/3236], Loss: 2.0744, Perplexity: 7.9595
Epoch [1/5], Step [3210/3236], Loss: 1.9252, Perplexity: 6.8568
Epoch [1/5], Step [3220/3236], Loss: 1.9894, Perplexity: 7.3113
Epoch [1/5], Step [3230/3236], Loss: 2.0755, Perplexity: 7.9682
Epoch [2/5], Step [0/3236], Loss: 1.9660, Perplexity: 7.1419
Epoch [2/5], Step [10/3236], Loss: 1.9332, Perplexity: 6.9116
Epoch [2/5], Step [20/3236], Loss: 2.0087, Perplexity: 7.4539
Epoch [2/5], Step [30/3236], Loss: 2.0289, Perplexity: 7.6059
Epoch [2/5], Step [40/3236], Loss: 1.9663, Perplexity: 7.1441
Epoch [2/5], Step [50/3236], Loss: 1.9469, Perplexity: 7.0067
Epoch [2/5], Step [60/3236], Loss: 1.9641, Perplexity: 7.1286
Epoch [2/5], Step [70/3236], Loss: 1.9639, Perplexity: 7.1274
Epoch [2/5], Step [80/3236], Loss: 2.0503, Perplexity: 7.

Epoch [2/5], Step [1230/3236], Loss: 1.8755, Perplexity: 6.5238
Epoch [2/5], Step [1240/3236], Loss: 2.0026, Perplexity: 7.4085
Epoch [2/5], Step [1250/3236], Loss: 1.8757, Perplexity: 6.5256
Epoch [2/5], Step [1260/3236], Loss: 2.0238, Perplexity: 7.5669
Epoch [2/5], Step [1270/3236], Loss: 1.9193, Perplexity: 6.8159
Epoch [2/5], Step [1280/3236], Loss: 1.9799, Perplexity: 7.2421
Epoch [2/5], Step [1290/3236], Loss: 1.9843, Perplexity: 7.2740
Epoch [2/5], Step [1300/3236], Loss: 2.1427, Perplexity: 8.5220
Epoch [2/5], Step [1310/3236], Loss: 1.9503, Perplexity: 7.0308
Epoch [2/5], Step [1320/3236], Loss: 1.8490, Perplexity: 6.3538
Epoch [2/5], Step [1330/3236], Loss: 1.9690, Perplexity: 7.1633
Epoch [2/5], Step [1340/3236], Loss: 2.0228, Perplexity: 7.5593
Epoch [2/5], Step [1350/3236], Loss: 1.9500, Perplexity: 7.0284
Epoch [2/5], Step [1360/3236], Loss: 1.9670, Perplexity: 7.1492
Epoch [2/5], Step [1370/3236], Loss: 1.9975, Perplexity: 7.3704
Epoch [2/5], Step [1380/3236], Loss: 2.0

Epoch [2/5], Step [2520/3236], Loss: 1.9720, Perplexity: 7.1848
Epoch [2/5], Step [2530/3236], Loss: 1.9733, Perplexity: 7.1943
Epoch [2/5], Step [2540/3236], Loss: 1.9476, Perplexity: 7.0120
Epoch [2/5], Step [2550/3236], Loss: 2.0523, Perplexity: 7.7856
Epoch [2/5], Step [2560/3236], Loss: 1.9395, Perplexity: 6.9552
Epoch [2/5], Step [2570/3236], Loss: 1.8199, Perplexity: 6.1714
Epoch [2/5], Step [2580/3236], Loss: 1.9884, Perplexity: 7.3035
Epoch [2/5], Step [2590/3236], Loss: 1.9812, Perplexity: 7.2514
Epoch [2/5], Step [2600/3236], Loss: 1.9954, Perplexity: 7.3553
Epoch [2/5], Step [2610/3236], Loss: 2.1358, Perplexity: 8.4637
Epoch [2/5], Step [2620/3236], Loss: 1.9617, Perplexity: 7.1113
Epoch [2/5], Step [2630/3236], Loss: 1.9563, Perplexity: 7.0733
Epoch [2/5], Step [2640/3236], Loss: 2.0707, Perplexity: 7.9304
Epoch [2/5], Step [2650/3236], Loss: 2.0198, Perplexity: 7.5365
Epoch [2/5], Step [2660/3236], Loss: 2.0577, Perplexity: 7.8277
Epoch [2/5], Step [2670/3236], Loss: 2.0

Epoch [3/5], Step [580/3236], Loss: 1.8707, Perplexity: 6.4927
Epoch [3/5], Step [590/3236], Loss: 1.7555, Perplexity: 5.7863
Epoch [3/5], Step [600/3236], Loss: 1.8388, Perplexity: 6.2891
Epoch [3/5], Step [610/3236], Loss: 1.9408, Perplexity: 6.9646
Epoch [3/5], Step [620/3236], Loss: 1.9399, Perplexity: 6.9580
Epoch [3/5], Step [630/3236], Loss: 1.8508, Perplexity: 6.3648
Epoch [3/5], Step [640/3236], Loss: 1.8379, Perplexity: 6.2832
Epoch [3/5], Step [650/3236], Loss: 1.7673, Perplexity: 5.8551
Epoch [3/5], Step [660/3236], Loss: 1.9311, Perplexity: 6.8971
Epoch [3/5], Step [670/3236], Loss: 1.8684, Perplexity: 6.4776
Epoch [3/5], Step [680/3236], Loss: 1.8523, Perplexity: 6.3746
Epoch [3/5], Step [690/3236], Loss: 1.9509, Perplexity: 7.0347
Epoch [3/5], Step [700/3236], Loss: 1.9160, Perplexity: 6.7938
Epoch [3/5], Step [710/3236], Loss: 1.8813, Perplexity: 6.5621
Epoch [3/5], Step [720/3236], Loss: 1.9371, Perplexity: 6.9388
Epoch [3/5], Step [730/3236], Loss: 1.8326, Perplexity:

Epoch [3/5], Step [1870/3236], Loss: 1.8695, Perplexity: 6.4850
Epoch [3/5], Step [1880/3236], Loss: 1.9073, Perplexity: 6.7350
Epoch [3/5], Step [1890/3236], Loss: 1.8712, Perplexity: 6.4959
Epoch [3/5], Step [1900/3236], Loss: 1.8483, Perplexity: 6.3491
Epoch [3/5], Step [1910/3236], Loss: 1.9054, Perplexity: 6.7218
Epoch [3/5], Step [1920/3236], Loss: 1.9446, Perplexity: 6.9906
Epoch [3/5], Step [1930/3236], Loss: 1.9718, Perplexity: 7.1835
Epoch [3/5], Step [1940/3236], Loss: 1.9725, Perplexity: 7.1889
Epoch [3/5], Step [1950/3236], Loss: 1.8449, Perplexity: 6.3275
Epoch [3/5], Step [1960/3236], Loss: 1.8302, Perplexity: 6.2352
Epoch [3/5], Step [1970/3236], Loss: 1.8940, Perplexity: 6.6460
Epoch [3/5], Step [1980/3236], Loss: 1.8010, Perplexity: 6.0559
Epoch [3/5], Step [1990/3236], Loss: 1.9487, Perplexity: 7.0196
Epoch [3/5], Step [2000/3236], Loss: 1.8530, Perplexity: 6.3792
Epoch [3/5], Step [2010/3236], Loss: 2.0282, Perplexity: 7.6002
Epoch [3/5], Step [2020/3236], Loss: 1.8

Epoch [3/5], Step [3160/3236], Loss: 2.0365, Perplexity: 7.6640
Epoch [3/5], Step [3170/3236], Loss: 1.8792, Perplexity: 6.5481
Epoch [3/5], Step [3180/3236], Loss: 1.8888, Perplexity: 6.6114
Epoch [3/5], Step [3190/3236], Loss: 1.9623, Perplexity: 7.1156
Epoch [3/5], Step [3200/3236], Loss: 1.9113, Perplexity: 6.7616
Epoch [3/5], Step [3210/3236], Loss: 2.0712, Perplexity: 7.9342
Epoch [3/5], Step [3220/3236], Loss: 1.9748, Perplexity: 7.2049
Epoch [3/5], Step [3230/3236], Loss: 1.8391, Perplexity: 6.2906


OSError: Traceback (most recent call last):
  File "/home/zhihao/miniconda2/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 40, in _worker_loop
  File "/home/zhihao/miniconda2/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 40, in <listcomp>
  File "<ipython-input-6-7e1e986e5b47>", line 27, in __getitem__
    image = Image.open(os.path.join(self.root, path)).convert('RGB')
  File "/home/zhihao/miniconda2/lib/python3.6/site-packages/PIL/Image.py", line 2519, in open
OSError: cannot identify image file 'pytorch-tutorial/tutorials/03-advanced/image_captioning/data/train_resized2014/COCO_train2014_000000510671.jpg'
