# Deep Learning Course Project

## Project: Image Captioning

---


- [Step 1](#step1): Explore the Data Loader
- [Step 2](#step2): Use the Data Loader to Obtain Batches
- [Step 3](#step3): Experiment with the CNN Encoder
- [Step 4](#step4): Implement the RNN Decoder

<a id='step1'></a>
## Step 1: Explore the Data Loader


In [1]:

import nltk
#nltk.download('all') # Download all the resources for NLTK to avoid errors 
from data_loader import get_loader
from torchvision import transforms

# Pre-processing pipeline applied to the training images.
transform_train = transforms.Compose([
    transforms.Resize(size=256),          # scale the smaller edge of the image to 256
    transforms.RandomCrop(size=224),      # cut a 224x224 patch at a random location
    transforms.RandomHorizontalFlip(),    # mirror the image horizontally with probability 0.5
    transforms.ToTensor(),                # PIL Image -> tensor
    transforms.Normalize(                 # per-channel normalization for the pre-trained model
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

# Minimum word count threshold handed to the loader.
vocab_threshold = 5

# Number of samples per batch.
batch_size = 10

# Build the training data loader.
data_loader = get_loader(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=False,
)

Number of images: 8091
Number of captions: 8091
Obtaining caption lengths...
Obtaining caption lengths...Done and caption lengths: 8091


In [2]:
# Example caption used to demonstrate the caption pre-processing steps.
sample_caption = "A person doing a trick on a rail while riding a skateboard."

In [3]:
import nltk

# Lower-case the caption, then split it into word and punctuation tokens.
sample_tokens = nltk.word_tokenize(str(sample_caption).lower())
print(sample_tokens)

['a', 'person', 'doing', 'a', 'trick', 'on', 'a', 'rail', 'while', 'riding', 'a', 'skateboard', '.']


In [12]:
# Look up the vocabulary's special start word.
start_word = data_loader.dataset.vocab.start_word
print('Special start word:', start_word)

# Begin the encoded caption with the integer the vocabulary returns for the start word.
sample_caption = [data_loader.dataset.vocab(start_word)]
print(sample_caption)

Special start word: <start>
[0]


In [13]:
# Convert every token to its vocabulary integer and append it to the caption.
sample_caption.extend(map(data_loader.dataset.vocab, sample_tokens))
print(sample_caption)

[0, 3, 179, 397, 3, 394, 38, 3, 388, 77, 593, 3, 392, 16]


In [14]:
# Look up the vocabulary's special end word.
end_word = data_loader.dataset.vocab.end_word
print('Special end word:', end_word)

# Close the encoded caption with the integer the vocabulary returns for the end word.
sample_caption += [data_loader.dataset.vocab(end_word)]
print(sample_caption)

Special end word: <end>
[0, 3, 179, 397, 3, 394, 38, 3, 388, 77, 593, 3, 392, 16, 1]


In [16]:
import torch
# Example: pad the encoded caption to a fixed length of max_len=20.
max_len = 20
# 2 is the vocabulary index of '<unk>'; it is reused here as the padding value.
# If the caption already has max_len entries or more, nothing is appended.
padded_sample_caption = sample_caption + [2] * (max_len - len(sample_caption))
# Build the integer tensor directly rather than creating a float tensor and
# casting it, so the indices never pass through float32.
padded_sample_caption = torch.tensor(padded_sample_caption, dtype=torch.long)
print(padded_sample_caption)

tensor([  0,   3, 179, 397,   3, 394,  38,   3, 388,  77, 593,   3, 392,  16,
          1,   2,   2,   2,   2,   2])


In [18]:
# Show the first 20 (word, index) pairs of the word2idx mapping.
{word: idx for word, idx in list(data_loader.dataset.vocab.word2idx.items())[:20]}

{'<start>': 0,
 '<end>': 1,
 '<unk>': 2,
 'a': 3,
 'child': 4,
 'in': 5,
 'pink': 6,
 'dress': 7,
 'is': 8,
 'climbing': 9,
 'up': 10,
 'set': 11,
 'of': 12,
 'stairs': 13,
 'an': 14,
 'way': 15,
 '.': 16,
 'girl': 17,
 'going': 18,
 'into': 19}

In [20]:
# Print the vocabulary size, i.e. the value len() reports for the vocab object.
# NOTE(review): presumably this equals the number of keys in word2idx — confirm in the vocabulary class.
print('Total number of tokens in vocabulary:', len(data_loader.dataset.vocab))

Total number of tokens in vocabulary: 3004


In [21]:
# Lower the minimum word count threshold (previously 5).
vocab_threshold = 4

# Re-create the training data loader with the new threshold.
data_loader = get_loader(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_threshold=vocab_threshold,
    vocab_from_file=False,
)

Number of images: 8091
Number of captions: 8091
Obtaining caption lengths...
Obtaining caption lengths...Done and caption lengths: 8091


In [22]:
# Print the vocabulary size of the re-created loader, i.e. the value len() reports for the vocab object.
# NOTE(review): presumably this equals the number of keys in word2idx — confirm in the vocabulary class.
print('Total number of tokens in vocabulary:', len(data_loader.dataset.vocab))

Total number of tokens in vocabulary: 3444


In [23]:
# Look up the special token the vocabulary uses for unknown words.
unk_word = data_loader.dataset.vocab.unk_word
print('Special unknown word:', unk_word)

# Show the integer the vocabulary returns for that token.
print('All unknown words are mapped to this integer:', data_loader.dataset.vocab(unk_word))

Special unknown word: <unk>
All unknown words are mapped to this integer: 2


In [24]:
# Made-up strings that are not real words: print the integer the vocabulary returns for each.
for made_up_word in ('jfkafejw', 'ieowoqjf'):
    print(data_loader.dataset.vocab(made_up_word))

2
2


In [25]:
# Re-create the data loader with vocab_from_file=True (from file).
# Note that it runs much faster than before!
data_loader = get_loader(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_from_file=True,
)

Number of images: 8091
Number of captions: 8091
Obtaining caption lengths...
Obtaining caption lengths...Done and caption lengths: 8091


<a id='step2'></a>
## Step 2: Use the Data Loader to Obtain Batches


In [26]:
from collections import Counter

# Count how many training captions there are of each length.
counter = Counter(data_loader.dataset.caption_lengths)
# most_common() lists (length, count) pairs from the highest count to the lowest.
lengths = counter.most_common()
for value, count in lengths:
    print('value: %2d --- count: %5d' % (value, count))

value: 72 --- count:   320
value: 73 --- count:   313
value: 77 --- count:   296
value: 74 --- count:   294
value: 70 --- count:   293
value: 66 --- count:   289
value: 71 --- count:   281
value: 69 --- count:   277
value: 67 --- count:   273
value: 75 --- count:   266
value: 68 --- count:   264
value: 78 --- count:   235
value: 79 --- count:   229
value: 81 --- count:   223
value: 80 --- count:   223
value: 76 --- count:   223
value: 65 --- count:   222
value: 64 --- count:   208
value: 63 --- count:   206
value: 83 --- count:   193
value: 82 --- count:   185
value: 85 --- count:   171
value: 62 --- count:   169
value: 84 --- count:   167
value: 60 --- count:   162
value: 61 --- count:   150
value: 86 --- count:   124
value: 59 --- count:   124
value: 57 --- count:   122
value: 87 --- count:   122
value: 58 --- count:   116
value: 89 --- count:    96
value: 88 --- count:    87
value: 90 --- count:    85
value: 55 --- count:    84
value: 91 --- count:    75
value: 56 --- count:    72
v

In [27]:
# Print the image tensor and caption of the first 10 dataset items.
for i in range(10):
    img, caption = data_loader.dataset[i]
    print(f"Image {i}: {img}", f"Caption {i}: {caption}", sep="\n")


Image 0: tensor([[[-0.8335, -0.9020, -0.9705,  ...,  2.2489,  2.2147,  2.1290],
         [-1.5014, -1.0562, -1.0219,  ...,  2.2489,  2.1804,  2.1119],
         [-1.6213, -1.3644, -1.1418,  ...,  2.2318,  2.2489,  2.1975],
         ...,
         [-0.8678, -1.3815, -1.2788,  ..., -0.6794, -0.7308, -0.9705],
         [-0.7822, -1.1418, -1.1589,  ..., -0.0116,  0.1083, -0.0629],
         [-1.3302, -0.8849, -0.9877,  ...,  0.5193,  0.0569, -0.1999]],

        [[-1.0028, -1.1604, -1.1604,  ...,  2.4111,  2.3060,  2.2010],
         [-1.3529, -1.1954, -1.1954,  ...,  2.4111,  2.2710,  2.1835],
         [-1.3354, -1.2304, -1.0903,  ...,  2.3936,  2.3936,  2.3761],
         ...,
         [-0.9328, -1.4405, -1.4755,  ..., -0.4251, -0.4426, -0.7927],
         [-0.9328, -1.2129, -1.2829,  ...,  0.1352,  0.1702, -0.0399],
         [-1.5455, -1.1604, -1.1078,  ...,  0.2402, -0.1975, -0.2150]],

        [[-1.4559, -1.4733, -1.4733,  ...,  1.6640,  1.3328,  1.0714],
         [-1.4559, -1.4036, -1.5081,

In [28]:
# Pull one batch from the data loader and unpack it into images and captions.
images, captions = next(iter(data_loader))

In [31]:
import numpy as np
import torch.utils.data as data

# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)

# Create a sampler restricted to the sampled indices and install it on the
# loader's batch sampler, so the next batch is drawn from those indices.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler
    
# Obtain the batch.
images, captions = next(iter(data_loader))
    
print('images.shape:', images.shape)
print('captions.shape:', captions.shape)

# Print the pre-processed images and captions (verbose; comment these two lines out to keep the output short).
print('images:', images)
print('captions:', captions)

sampled indices: [np.int64(7887), np.int64(1487), np.int64(4600), np.int64(6489), np.int64(6848), np.int64(2836), np.int64(7739), np.int64(829), np.int64(889), np.int64(1947)]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 85])
images: tensor([[[[-2.1008, -2.1179, -2.0837,  ..., -1.4843, -1.5014, -1.5185],
          [-2.1179, -2.0837, -2.1008,  ..., -1.5185, -1.4843, -1.4329],
          [-2.0837, -2.1008, -2.1008,  ..., -1.5014, -1.3987, -1.3815],
          ...,
          [-1.9809, -2.0152, -2.0323,  ..., -1.6042, -1.2617, -0.9020],
          [-1.9809, -2.0152, -1.9980,  ..., -1.6213, -1.2788, -0.9363],
          [-1.9980, -1.9809, -1.9980,  ..., -1.6213, -1.2445, -0.9192]],

         [[-2.0182, -2.0357, -2.0007,  ..., -1.3880, -1.4055, -1.4055],
          [-2.0357, -2.0007, -2.0182,  ..., -1.4055, -1.3704, -1.3179],
          [-2.0007, -2.0182, -2.0182,  ..., -1.4055, -1.3004, -1.2829],
          ...,
          [-1.8957, -1.9482, -1.9482,  ..., -1.4405, -1

<a id='step3'></a>
## Step 3: Experiment with the CNN Encoder


In [38]:
# Watch for any changes in model.py, and re-load it automatically.
#%load_ext autoreload
#%autoreload 2   # Mode 2 reloads all imported modules (such as model.py) before each cell is executed; mode 1 reloads only the modules registered with %aimport.

# Import EncoderCNN and DecoderRNN. 
from model import EncoderCNN, DecoderRNN

In [40]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [41]:
# Specify the dimensionality of the image embedding.
embed_size = 256

# Initialize the encoder.
encoder = EncoderCNN(embed_size)

# Move the encoder to GPU if CUDA is available.
encoder.to(device)

# Move last batch of images (from Step 2) to GPU if CUDA is available.
images = images.to(device)

# Pass the images through the encoder.
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)

# Check that your encoder satisfies some requirements of the project! :D
# The output must be a tensor whose first two dimensions are (batch_size, embed_size).
assert isinstance(features, torch.Tensor), "Encoder output needs to be a PyTorch Tensor."
assert features.shape[0] == batch_size and features.shape[1] == embed_size, "The shape of the encoder output is incorrect."

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\Marey/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:49<00:00, 2.08MB/s]


type(features): <class 'torch.Tensor'>
features.shape: torch.Size([10, 256])


<a id='step4'></a>
## Step 4: Implement the RNN Decoder


In [42]:
# Specify the number of features in the hidden state of the RNN decoder.
hidden_size = 512  # size of the hidden layer of neurons
num_layers = 1

# Store the size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the decoder.
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

# Move the decoder to GPU if CUDA is available.
decoder.to(device)

# Move last batch of captions (from Step 2) to GPU if CUDA is available.
captions = captions.to(device)

# Pass the encoder output and captions through the decoder.
outputs = decoder(features, captions)

print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)

# Check that your decoder satisfies some requirements of the project! :D
# The output must be a tensor whose first three dimensions are
# (batch_size, caption length, vocab_size).
assert isinstance(outputs, torch.Tensor), "Decoder output needs to be a PyTorch Tensor."
assert (
    outputs.shape[0] == batch_size
    and outputs.shape[1] == captions.shape[1]
    and outputs.shape[2] == vocab_size
), "The shape of the decoder output is incorrect."

type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([10, 85, 3444])



![Encoder->Decoder](./encoder_decoder_diag.jpg)