## Project: Image Captioning

---

- [Step 1](#step1): Training Setup
- [Step 2](#step2): Train your Model

<a id='step1'></a>
## Step 1: Training Setup


In [6]:
import torch
import torch.nn as nn
from torchvision import transforms
import sys
from data_loader import get_loader
from model import EncoderCNN, DecoderRNN
import math


# Hyperparameters.
batch_size = 64          # batch size
vocab_threshold = 5        # minimum word count threshold
vocab_from_file = True    # if True, load existing vocab file
embed_size = 256         # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
num_epochs = 2             # number of training epochs
save_every = 1             # determines frequency of saving model weights
print_every = 100          # determines window for printing average loss
log_file = 'training_log.txt'       # name of file with saved training loss and perplexity

# Transform Images.
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# Get the vocab size.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function. 
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# Specify the learnable parameters of the model.
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# Define the optimizer.
optimizer = torch.optim.Adam(params, lr=0.001)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size) #100 captions / 32 batch_size = 3.125 -> 4

Number of images: 8091
Number of captions: 8091
Obtaining caption lengths...
Obtaining caption lengths...Done and caption lengths: 8091


In [2]:
print(total_step)

127


<a id='step2'></a>
## Step 2: Train your Model


In [8]:
import torch.utils.data as data
import numpy as np
import os

# Open the training log file.
f = open(log_file, 'w')

for epoch in range(1, num_epochs+1):
    
    # Randomly sample a caption length, and sample indices with that length.
    sampled_indices = data_loader.dataset.get_train_indices()
    # Create and assign a batch sampler to retrieve a batch with the sampled indices.
    new_sampler = data.sampler.SubsetRandomSampler(indices=sampled_indices)
    data_loader.batch_sampler.sampler = new_sampler
    
    for i_step in range(1, total_step+1):
        
        # Obtain the batch.
        images, captions = next(iter(data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)
        
        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()
        
        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions)
        
        # Calculate the batch loss.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
        
        # Backward pass.
        loss.backward()
        
        # Update the parameters in the optimizer.
        optimizer.step()
            
        # Get training statistics.
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' %(epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))
                                                         #perplecity = exp(loss) -> 2.718^loss means how many times the model is confused to predict the next word 
        
        # Print training statistics (on same line).
        print('\r' + stats, end="")
        sys.stdout.flush() #flush the buffer After printing the output, we need to flush the buffer to ensure that the output is printed immediately.
        
        # Print training statistics to file. 
        f.write(stats + '\n')
        f.flush() #flush the buffer #flush() method is used to clear the internal buffer of a file object.
        
        # Print training statistics (on different line).
        if i_step % print_every == 0:
            print('\r' + stats)
            
    # Save the weights.
    if epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' %epoch))
        torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' %epoch))

# Close the training log file.
f.close()

Epoch [1/2], Step [100/127], Loss: 1.5349, Perplexity: 4.64099
Epoch [2/2], Step [100/127], Loss: 0.3695, Perplexity: 1.4470
Epoch [2/2], Step [127/127], Loss: 0.2428, Perplexity: 1.2749