## Training



Batch size = 256  --- We trained on GTX1070 8GB memory and the max batch size is this as we can run the training with 6GB usage of gpu

Vocab_threshold = 5  --- As one of the suggestion from the handout we just pick the 5

Embed_size = Hidden_size = 512  --- This variable we have looked over mulitple similar project on git and compared their performance. Based on 

our hardware performance, we decided to choose a higher value for acurrate result as 512.

In [1]:
#quick test before real train

import torch
import torch.nn as nn
from torchvision import transforms
import sys
sys.path.append('../cocoapi/PythonAPI')
from pycocotools.coco import COCO
from data_loader import get_loader
from model import EncoderCNN, DecoderRNN
import math
import os

batch_size = 256           # batch size
vocab_threshold = 5        # minimum word count threshold
vocab_from_file = True     # if True, load existing vocab file
embed_size = 512           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
num_epochs = 3             # number of training epochs
save_every = 1             # determines frequency of saving model weights
print_every = 100          # determines window for printing average loss
log_file = 'training_log.txt'       # name of file with saved training loss and perplexity

#Amend the image transform below.
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    #transforms.ColorJitter(brightness=0.2, contrast=0, saturation=0, hue=0),
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc="../")

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function. 
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# Specify the learnable parameters of the model.
params = list(decoder.parameters()) + list(encoder.embed.parameters()) 

# Define the optimizer.
learning_rate = 0.001
optimizer = torch.optim.Adam(params, lr=learning_rate, betas=(0.9, 0.999), eps=1e-08)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=1.07s)
creating index...
index created!
Obtaining caption lengths...


100%|███████████████████████████████████████████████████████████████████████| 414113/414113 [00:31<00:00, 13093.07it/s]


In [2]:
import sys
print(sys.executable)

C:\Users\wengyunfan\Anaconda3\python.exe


In [4]:
import torch.utils.data as data
import numpy as np
#import os
import requests
import time


print(total_step)
print(num_epochs)

# Open the training log file.
f = open(log_file, 'w')

old_time = time.time()


for epoch in range(1, num_epochs+1):
    print (time.strftime("%H:%M:%S"))
    for i_step in range(1, total_step+1):
        
        if time.time() - old_time > 60:
            old_time = time.time()

        
        # Randomly sample a caption length, and sample indices with that length.
        indices = data_loader.dataset.get_train_indices()
        
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        data_loader.batch_sampler.sampler = new_sampler
        
        # Obtain the batch.
        images, captions = next(iter(data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)
        
        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()
        
        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions)
        
        # Calculate the batch loss.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
        
        # Backward pass.
        loss.backward()
        
        # Update the parameters in the optimizer.
        optimizer.step()
            
        # Get training statistics.
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))
        
        # Print training statistics (on same line).
        print('\r' + stats, end="")
        sys.stdout.flush()
        
        # Print training statistics to file.
        f.write(stats + '\n')
        f.flush()
        
        # Print training statistics (on different line).
        if i_step % print_every == 0:
            print('\r' + stats)
            
    # Save the weights.
    if epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join('./models',
                                                     'decoder_'+str(epoch)+'_'+
                                                      str(batch_size)+'_'+
                                                      str(learning_rate)+'_'+
                                                      str(num_epochs)+'_'+
                                                      str(total_step)+'_'+
                                                      str(embed_size)+'_'+
                                                      str(hidden_size)+'_'+
                                                      str(vocab_threshold)+'.pkl'))

        torch.save(encoder.state_dict(), os.path.join('./models', 
                                                        'encoder_'+str(epoch)+'_'+
                                                          str(batch_size)+'_'+
                                                          str(learning_rate)+'_'+
                                                          str(num_epochs)+'_'+
                                                          str(total_step)+'_'+
                                                          str(embed_size)+'_'+
                                                          str(hidden_size)+'_'+
                                                          str(vocab_threshold)+'.pkl'))


f.close()

1618
3
21:51:03
Epoch [1/3], Step [100/1618], Loss: 3.8302, Perplexity: 46.0703
Epoch [1/3], Step [200/1618], Loss: 3.4757, Perplexity: 32.32196
Epoch [1/3], Step [300/1618], Loss: 3.3111, Perplexity: 27.4159
Epoch [1/3], Step [400/1618], Loss: 3.2181, Perplexity: 24.9818
Epoch [1/3], Step [500/1618], Loss: 2.9056, Perplexity: 18.2761
Epoch [1/3], Step [600/1618], Loss: 2.8794, Perplexity: 17.8030
Epoch [1/3], Step [700/1618], Loss: 2.9077, Perplexity: 18.3142
Epoch [1/3], Step [800/1618], Loss: 2.7328, Perplexity: 15.3762
Epoch [1/3], Step [900/1618], Loss: 2.6793, Perplexity: 14.57448
Epoch [1/3], Step [1000/1618], Loss: 2.4688, Perplexity: 11.8080
Epoch [1/3], Step [1100/1618], Loss: 2.4819, Perplexity: 11.9637
Epoch [1/3], Step [1200/1618], Loss: 2.4770, Perplexity: 11.9054
Epoch [1/3], Step [1300/1618], Loss: 2.3506, Perplexity: 10.4916
Epoch [1/3], Step [1400/1618], Loss: 2.5041, Perplexity: 12.2324
Epoch [1/3], Step [1500/1618], Loss: 2.3544, Perplexity: 10.5321
Epoch [1/3], Ste

PermissionError: [WinError 5] Access is denied

## BLEU

In [8]:
import data_loader_val
import os


# Build data loader.
val_loader = data_loader_val.get_loader(transform=transform_train,
                         mode='val',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc="../")

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder. 
encoder_val = EncoderCNN(embed_size)
encoder_val.eval()
decoder_val = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder_val.eval()

encoder_file = 'legit_model/encoder_1.pkl' 
decoder_file = 'legit_model/decoder_1.pkl'

!ls ./models/decoder-3.pkl

# Load the trained weights.
encoder_val.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
decoder_val.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

# Move models to GPU if CUDA is available. 
encoder_val.to(device)
decoder_val.to(device)

Vocabulary successfully loaded from vocab.pkl file!


'ls' is not recognized as an internal or external command,
operable program or batch file.


DecoderRNN(
  (embed): Embedding(8856, 512)
  (lstm): LSTM(512, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=8856, bias=True)
  (dropout): Dropout(p=0.1)
)

In [9]:
def clean_sentence(output):
    result = []
    for part in output[1:]:
        if part == 1:
            break
        result.append(data_loader.dataset.vocab.idx2word[part])
        
    return " ".join(result)

In [13]:
import  sys
sys.path.append('../cocoapi/PythonAPI')
from pycocotools.coco import COCO

annotation_file = '../cocoapi/annotations/captions_val2014.json'

coco_caps = COCO(annotation_file)

def get_image_captions(image_id):
    annIds = coco_caps.getAnnIds(imgIds=image_id);
    anns = coco_caps.loadAnns(annIds)    
    return [item['caption'].split() for item in anns]

loading annotations into memory...
Done (t=0.31s)
creating index...
index created!


In [14]:
import nltk

smoother = nltk.translate.bleu_score.SmoothingFunction()
total = 0.0
for item in captions:
    reference = get_image_captions(item['image_id'])
    hypothesis = item['caption'].split()

    blue_score = nltk.translate.bleu_score.sentence_bleu(reference, hypothesis,smoothing_function=smoother.method4)

    total += blue_score
    
print("Averaged bleu {}".format(total / len(captions)))

Averaged bleu 0.26430586169179154
