# Train the Model

## Import Libraries

In [15]:
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from models import Encoder, Decoder

from pathlib import Path
from DatasetInterface import MSCOCOInterface
import json
import numpy as np
import time

###### run code below if nltk hasn't been set up in clound instance yet
# !python -m nltk.downloader -d /usr/local/share/nltk_data all

###### run code below to save pre-trained weights if needed
# !wget https://download.pytorch.org/models/resnet152-394f9c45.pth
# !mv resnet152-394f9c45.pth resnet152_model.pth

## Load Dataset Interface and DataLoader

In [16]:
#########
# paths for Khalil
#########
# root = Path('Data')

#captions_path = root/'annotations'/'captions_train2017.json'

# train_captions_path = root/'annotations_trainval2017'/'annotations'/'sports_captions_train.json'
# val_captions_path = root/'annotations_trainval2017'/'annotations'/'sports_captions_val.json'
# test_captions_path = root/'annotations_trainval2017'/'annotations'/'sports_captions_test.json'

#########
# paths for Alex
#########

root = Path('Datasets/coco')
imgs_path = root/'images'/'train2017'
imgs_path_test = root/'images'/'val2017'

train_captions_path = root/'annotations'/'sports_captions_train.json'
val_captions_path = root/'annotations'/'sports_captions_val.json'
test_captions_path = root/'annotations'/'sports_captions_test.json'

"""
# load vocab
with open('vocabulary/idx_to_string.json') as json_file:
    idx_to_string_json = json.load(json_file)
        
idx_to_string = dict()
for key in idx_to_string_json:
    idx_to_string[int(key)] = idx_to_string_json[key]
    
with open('vocabulary/string_to_index.json') as json_file:
    string_to_index = json.load(json_file)
"""

"\n# load vocab\nwith open('vocabulary/idx_to_string.json') as json_file:\n    idx_to_string_json = json.load(json_file)\n        \nidx_to_string = dict()\nfor key in idx_to_string_json:\n    idx_to_string[int(key)] = idx_to_string_json[key]\n    \nwith open('vocabulary/string_to_index.json') as json_file:\n    string_to_index = json.load(json_file)\n"

In [17]:
# to boost the performence of CUDA use:
# torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
train_interface_params = {
    'imgs_path': imgs_path,
    'captions_path': train_captions_path,
    'freq_threshold': 5,
    'sequence_length': 20,
    'caps_per_img': 1,
    'stage': "train",
    'idx_to_string': None,
    'string_to_index': None,
}

val_interface_params = {
    'imgs_path': imgs_path,
    'captions_path': val_captions_path,
    'freq_threshold': 5,
    'sequence_length': 20,
    'caps_per_img': 1,
    'stage': "validation",
    'idx_to_string': None,
    'string_to_index': None,
}

test_interface_params = {
    'imgs_path': imgs_path_test,
    'captions_path': test_captions_path,
    'freq_threshold': 5,
    'sequence_length': 20,
    'caps_per_img': 1,
    'stage': "test",
    'idx_to_string': None,
    'string_to_index': None,
}


# Training Interface
coco_interface_train = MSCOCOInterface(**train_interface_params)

# Validation Interface
coco_interface_val = MSCOCOInterface(**val_interface_params)

# Testing Interface
coco_interface_test = MSCOCOInterface(**test_interface_params)


In [19]:
print("Lenght of training image: {}, Lenght of Validation image: {} Lenght of Testing image: {}"\
      .format(len(coco_interface_train), len(coco_interface_val), len(coco_interface_test)))

print(f"Lenght of vocabulary: {len(coco_interface_train.idx_to_string)}")

Lenght of training image: 500, Lenght of Validation image: 100 Lenght of Testing image: 4939
Lenght of vocabulary: 619


In [20]:
batch_size = 1
train_loader = data.DataLoader(coco_interface_train, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(coco_interface_val, batch_size=batch_size, shuffle=False)
test_loader = data.DataLoader(coco_interface_test, batch_size=batch_size, shuffle=False)

## Parameters

In [21]:
embed_size = 512
hidden_size = 512
vocab_size = len(coco_interface_train.idx_to_string)
num_layers = 1

## Encoder and Decoder

In [22]:
encoder = Encoder(embed_size=embed_size, pretrained=False, model_weight_path="./model/resnet152_model.pth")
decoder = Decoder(embed_size=embed_size, hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers)
print("########################################READY########################################")

########################################READY########################################


In [23]:
# the loss is a cross entropy loss and ignore the index of <PAD> since it doesn't make any difference
criterion = nn.CrossEntropyLoss(ignore_index=coco_interface_train.string_to_index["<PAD>"])

# combine the paramters of decoder and ecnoder
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# Adam optimizer
opt_pars = {'lr':1e-5, 'weight_decay':1e-3, 'betas':(0.9, 0.999), 'eps':1e-08}
optimizer = optim.Adam(params, **opt_pars)

## Train

In [24]:
def save_model(epoch, encoder, decoder, training_loss, validation_loss, checkpoint_path):
    torch.save({
        'epoch': epoch,
        'encoder_state_dict': encoder.state_dict(),
        'decoder_state_dict': decoder.state_dict(),
        'training_loss': training_loss,
        'validation_loss': validation_loss
    }, checkpoint_path)

def load_model(encoder, decoder, checkpoint_path):

    checkpoint = torch.load(checkpoint_path)
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])
    training_loss = checkpoint['training_loss']
    validation_loss = checkpoint['validation_loss']

    return encoder, decoder, training_loss, validation_loss


In [25]:
CHECKPOINT = './model/image_captioning_model_v1.pth'
if Path(CHECKPOINT).exists():
    encoder, decoder, training_loss, validation_loss = load_model(encoder, decoder, CHECKPOINT)
else:
    print(f'{CHECKPOINT} file does not exist, training startging from scratch')

./model/image_captioning_model_v1.pth file does not exist, training startging from scratch


In [30]:
def train(encoder, decoder, criterion, optimizer, train_loader, val_loader, total_epoch, checkpoint_path):
    
    encoder.to(device)
    decoder.to(device)
    
    training_loss = []
    validation_loss = []
    
    start_time = time.time()
    for epoch in range(total_epoch):
        train_epoch_loss = 0
        val_epoch_loss = 0
        
        
        # Training phase
        encoder.train()
        decoder.train()
        
        for i, batch in enumerate(train_loader):
            idx, images, captions = batch
            images, captions = images.to(device), captions.to(device)
            
            # Zero the gradients.
            encoder.zero_grad()
            decoder.zero_grad()
            
            features = encoder(images)
            outputs = decoder(features, captions)
            
            loss = criterion(outputs.view(-1, vocab_size), captions.contiguous().view(-1))
            
            loss.backward()
            optimizer.step()
        
            train_epoch_loss += loss.item()
            if i % 100 == 0:
                print('Training: ', i, ' ', loss.item())
                
        train_epoch_loss /= len(train_loader)
        training_loss.append(train_epoch_loss)
        
        # validation phase
        encoder.eval()
        decoder.eval()
        
        for id, batch in enumerate(val_loader):
            idx, images, captions = batch
            images, captions = images.to(device), captions.to(device)
            features = encoder(images)
            outputs = decoder(features, captions)
            loss = criterion(outputs.view(-1, vocab_size), captions.contiguous().view(-1))
            val_epoch_loss += loss.item()
            if id % 100 == 0:
                print('Validation: ', id, ' ', loss.item())
            
        val_epoch_loss /= len(val_loader)
        validation_loss.append(val_epoch_loss)
    
        epoch_time = (time.time() - start_time) /60**1

        save_model(epoch, encoder, decoder, training_loss, validation_loss, checkpoint_path)

        print("Epoch: {1:}. Training Loss = {1:.4f}, Training Perplexity: {2:.4f}. Validation Loss: {3:.4f}, Validation Perplexity: {4:.4f}. Time: {5:f}" \
          .format(epoch, train_epoch_loss, np.exp(train_epoch_loss), val_epoch_loss, np.exp(val_epoch_loss), epoch_time))
    
    return training_loss, validation_loss
    

In [None]:
train_params = {
    'encoder': encoder,
    'decoder': decoder,
    'criterion': criterion,
    'optimizer': optimizer,
    'train_loader': train_loader,
    'val_loader': val_loader,
    'total_epoch': 10,
    'checkpoint_path': './model/image_captioning_model_v0.pth'
}

training_loss, validation_loss = train(**train_params) 

Training:  0   3.445712089538574
Training:  100   4.557940483093262
Training:  200   4.148390293121338
Training:  300   4.806350231170654
Training:  400   4.394772529602051
Validation:  0   3.6761958599090576
Epoch: 4.1024859943389895. Training Loss = 4.1025, Training Perplexity: 60.4905. Validation Loss: 4.2559, Validation Perplexity: 70.5187. Time: 0.523141
Training:  0   4.025753974914551
Training:  100   4.109526634216309
Training:  200   4.796710014343262
Training:  300   3.686384439468384
Training:  400   4.2791852951049805
Validation:  0   3.5996787548065186
Epoch: 3.924295334815979. Training Loss = 3.9243, Training Perplexity: 50.6174. Validation Loss: 4.2515, Validation Perplexity: 70.2116. Time: 0.960140
Training:  0   3.89064621925354
Training:  100   3.271575689315796
Training:  200   3.811802387237549
Training:  300   3.7648942470550537
Training:  400   3.4575552940368652
Validation:  0   3.5623321533203125
Epoch: 3.8280628042221068. Training Loss = 3.8281, Training Perple