# Train the Model

## Import Libraries

In [15]:
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from models import Encoder, Decoder

from pathlib import Path
from DatasetInterface import MSCOCOInterface
from utils import train, save_model, load_model, plot_loss
import json
import numpy as np
import time

###### run code below if nltk hasn't been set up in clound instance yet
# !python -m nltk.downloader -d /usr/local/share/nltk_data all

###### run code below to save pre-trained weights if needed
# !wget https://download.pytorch.org/models/resnet152-394f9c45.pth
# !mv resnet152-394f9c45.pth resnet152_model.pth

## Load Dataset Interface and DataLoader

In [16]:
#########
# paths for Khalil
#########
# root = Path('Data')

#captions_path = root/'annotations'/'captions_train2017.json'

# train_captions_path = root/'annotations_trainval2017'/'annotations'/'sports_captions_train.json'
# val_captions_path = root/'annotations_trainval2017'/'annotations'/'sports_captions_val.json'
# test_captions_path = root/'annotations_trainval2017'/'annotations'/'sports_captions_test.json'

#########
# paths for Alex
#########

root = Path('Datasets/coco')
imgs_path = root/'images'/'train2017'
imgs_path_test = root/'images'/'val2017'

train_captions_path = root/'annotations'/'sports_captions_train.json'
val_captions_path = root/'annotations'/'sports_captions_val.json'
test_captions_path = root/'annotations'/'sports_captions_test.json'

"""
# load vocab
with open('vocabulary/idx_to_string.json') as json_file:
    idx_to_string_json = json.load(json_file)
        
idx_to_string = dict()
for key in idx_to_string_json:
    idx_to_string[int(key)] = idx_to_string_json[key]
    
with open('vocabulary/string_to_index.json') as json_file:
    string_to_index = json.load(json_file)
"""

"\n# load vocab\nwith open('vocabulary/idx_to_string.json') as json_file:\n    idx_to_string_json = json.load(json_file)\n        \nidx_to_string = dict()\nfor key in idx_to_string_json:\n    idx_to_string[int(key)] = idx_to_string_json[key]\n    \nwith open('vocabulary/string_to_index.json') as json_file:\n    string_to_index = json.load(json_file)\n"

In [17]:
# to boost the performence of CUDA use:
# torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
train_interface_params = {
    'imgs_path': imgs_path,
    'captions_path': train_captions_path,
    'freq_threshold': 5,
    'sequence_length': 20,
    'caps_per_img': 1,
    'mode': 'train',
    'idx_to_string': None,
    'string_to_index': None,
    "seed": 706
}

val_interface_params = {
    'imgs_path': imgs_path,
    'captions_path': val_captions_path,
    'freq_threshold': 6,
    'sequence_length': 20,
    'caps_per_img': 1,
    'mode': 'validation',
    'idx_to_string': None,
    'string_to_index': None,
    "seed": 706
}

# Training Interface
coco_interface_train = MSCOCOInterface(**train_interface_params)

# Validation Interface
coco_interface_val = MSCOCOInterface(**val_interface_params)

In [19]:
print("Length of training image: {}, Length of Validation image: {}"\
      .format(len(coco_interface_train), len(coco_interface_val)))

print(f"Length of vocabulary: {len(coco_interface_train.idx_to_string)}")

Lenght of training image: 500, Lenght of Validation image: 100 Lenght of Testing image: 4939
Lenght of vocabulary: 619


In [20]:
batch_size = 32
train_loader = data.DataLoader(coco_interface_train, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(coco_interface_val, batch_size=batch_size, shuffle=False)

## Parameters

In [21]:
embed_size = 512
hidden_size = 512
vocab_size = len(coco_interface_train.idx_to_string)
num_layers = 1

## Encoder and Decoder

In [22]:
encoder = Encoder(embed_size=embed_size, pretrained=False, model_weight_path="./model/resnet152_model.pth")
decoder = Decoder(embed_size=embed_size, hidden_size=hidden_size, vocab_size=vocab_size, num_layers=num_layers)
print("########################################READY########################################")

########################################READY########################################


In [23]:
# the loss is a cross entropy loss and ignore the index of <PAD> since it doesn't make any difference
criterion = nn.CrossEntropyLoss(ignore_index=coco_interface_train.string_to_index["<PAD>"])

# combine the parameters of decoder and encoder
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# Adam optimizer
opt_pars = {'lr':1e-3, 'weight_decay':1e-3, 'betas':(0.9, 0.999), 'eps':1e-08}
optimizer = optim.Adam(params, **opt_pars)

## Train

In [None]:
train_params = {
    'encoder': encoder,
    'decoder': decoder,
    'criterion': criterion,
    'optimizer': optimizer,
    'train_loader': train_loader,
    'val_loader': val_loader,
    'total_epoch': 10,
    'checkpoint_path': './model/image_captioning_model_v0.pth'
}

training_loss, validation_loss = train(**train_params) 

Training:  0   4.385537147521973
Training:  100   2.824275255203247
Training:  200   4.237329483032227
Training:  300   4.2739362716674805
Training:  400   3.1215097904205322
Validation:  0   3.4859657287597656
Epoch: 0. Training Loss = 3.4319, Training Perplexity: 30.9363. Validation Loss: 4.3009, Validation Perplexity: 73.7627. Time: 0.593844
Training:  0   3.7339859008789062
Training:  100   4.200594425201416
Training:  200   3.744450092315674
Training:  300   3.726269483566284
Training:  400   3.8239715099334717
Validation:  0   3.493844985961914
Epoch: 1. Training Loss = 3.3969, Training Perplexity: 29.8704. Validation Loss: 4.3056, Validation Perplexity: 74.1118. Time: 1.006393
Training:  0   3.647059440612793
Training:  100   3.0379064083099365
Training:  200   2.3731160163879395
Training:  300   3.128211259841919
Training:  400   2.9506795406341553
Validation:  0   3.4830162525177
Epoch: 2. Training Loss = 3.3635, Training Perplexity: 28.8893. Validation Loss: 4.3214, Validatio