# Training Notebook

## Import libraries

In [1]:
from get_loader import get_loader
from models import Encoder, Decoder
import torch
import torch.nn as nn
from utils import *

## Load train and validation loaders

In [2]:
# Dataset locations, relative to this notebook.
# Alternative directory layout, kept for reference:
#   image_path = '../../CW/Data/train2017'
#   captions_path = '../../CW/Data/annotations_trainval2017/annotations/captions_train2017.json'
captions_path = '../../Data/annotations/captions_train2017.json'
image_path = '../../Data/train2017/train2017'

# Settings forwarded to get_loader when the data loaders are built.
batch_size, shuffle = 32, True
freq_threshold = caps_per_image = 5

In [3]:
# Training loader: created without a vocabulary (idx2word / word2idx are None);
# the vocabulary attached to the resulting dataset is reused for validation below.
train_loader_params = {
    'images_path': image_path,
    'captions_path': captions_path,
    'freq_threshold': freq_threshold,
    # Read the configured value instead of repeating the literal, so that
    # changing `caps_per_image` in the config cell actually takes effect.
    'caps_per_image': caps_per_image,
    'batch_size': batch_size,
    'shuffle': shuffle,
    'mode': 'train',
    'idx2word': None,
    'word2idx': None
}

train_loader, train_dataset = get_loader(**train_loader_params)

# Validation loader: shares the training vocabulary mappings so that token
# indices agree between the two datasets.
# NOTE(review): this points at the same image directory and caption file as
# the training loader; presumably get_loader selects a held-out subset when
# mode == 'validation' -- confirm against get_loader.
val_loader_params = {
    'images_path': image_path,
    'captions_path': captions_path,
    'freq_threshold': freq_threshold,
    # Validation deliberately uses fewer captions per image than training.
    'caps_per_image': 3,
    'batch_size': batch_size,
    'shuffle': shuffle,
    'mode': 'validation',
    'idx2word': train_dataset.vocab.idx2word,
    'word2idx': train_dataset.vocab.word2idx
}

val_loader, val_dataset = get_loader(**val_loader_params)

In [4]:
# Export the vocabulary built for the training dataset; export_vocab is given
# '../vocabulary' as its target (presumably so the same word<->index mapping
# can be reloaded outside this notebook -- confirm against export_vocab).
train_dataset.vocab.export_vocab('../vocabulary')

In [5]:
# Number of entries in the training vocabulary; the same expression is used
# for `vocab_size` in the hyperparameter cell. Left as a bare trailing
# expression so the notebook displays the value.
len(train_dataset.vocab.idx2word)

3387

## Training

In [6]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [7]:
# for data loader
# Mirror the settings the loaders were actually built with (see the loader
# config cell) instead of repeating the literals, so the two sets of
# constants cannot drift apart.
BATCH_SIZE = batch_size
CAPS_PER_IMAGE = caps_per_image  # how many captions for each image to include in data set

# for encoder and decoder
EMBED_SIZE = 512  # dimension of vocab embedding vector (passed to both Encoder and Decoder)
HIDDEN_SIZE = 512  # passed to Decoder as hidden_size
NUM_LAYERS = 1  # hidden layers in LSTM
vocab_size = len(train_dataset.vocab.idx2word)

# training parameters
TOTAL_EPOCH = 1000
CHECKPOINT = '../model/model_v2/model_v2_0.pth'
# Forwarded to train() as print_every; presumably the number of batches
# between progress log lines -- confirm against train().
PRINT_EVERY = 500

In [8]:
# Build the encoder first, then the decoder; both are configured with the
# same EMBED_SIZE.
encoder = Encoder(
    pretrained=True,
    embed_size=EMBED_SIZE,
)
decoder = Decoder(
    vocab_size=vocab_size,
    num_layers=NUM_LAYERS,
    hidden_size=HIDDEN_SIZE,
    embed_size=EMBED_SIZE,
)

In [9]:
# Cross-entropy loss that ignores targets equal to the <PAD> index, so padded
# positions do not contribute to the loss. The module is placed on the
# `device` chosen earlier (CUDA when available, CPU otherwise) rather than
# duplicating the constructor around a `.cuda()` conditional.
criterion = nn.CrossEntropyLoss(
    ignore_index=train_dataset.vocab.word2idx["<PAD>"]
).to(device)

# Parameters handed to the optimizer: every decoder parameter, plus only the
# parameters of the encoder's `embed` sub-module (the rest of the encoder is
# not optimized here).
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# Adam optimizer. Referenced through `torch.optim` so this cell does not rely
# on a bare `optim` name leaking in via `from utils import *`.
opt_pars = {'lr': 1e-3, 'weight_decay': 1e-3, 'betas': (0.9, 0.999), 'eps': 1e-08}
optimizer = torch.optim.Adam(params, **opt_pars)

In [None]:
# Gather every keyword argument for train() in one mapping, then start the
# run. train() hands back two values, unpacked here as the training and
# validation loss histories.
train_params = dict(
    encoder=encoder,
    decoder=decoder,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    total_epoch=TOTAL_EPOCH,
    device=device,
    checkpoint_path=CHECKPOINT,
    print_every=PRINT_EVERY,
    load_checkpoint=False,
)

training_loss, validation_loss = train(**train_params)

Epoch: [0/1000] || Step: [0/1563] || Average Training Loss: 8.1233
