In [1]:
# Start with an empty mapping; nothing in the visible cells reads it
# (NOTE(review): looks platform-generated — confirm before removing).
mountedDB = dict()

In [2]:
!pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
!pip install -U nltk

Writing to /home/mw/.config/pip/pip.conf
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [3]:
!mkdir /home/mw/project/coco2014
!ln -s /home/mw/input/020614521/test2014/test2014 /home/mw/project/coco2014
!ln -s /home/mw/input/02066579/val2014/* /home/mw/project/coco2014
!ln -s /home/mw/input/020628093/train2014-1/train2014-1 /home/mw/project/coco2014
!ln -s /home/mw/input/020631458/train2014-2/train2014-2 /home/mw/project/coco2014

mkdir: cannot create directory ‘/home/mw/project/coco2014’: File exists
ln: failed to create symbolic link '/home/mw/project/coco2014/test2014': File exists
ln: failed to create symbolic link '/home/mw/project/coco2014/val2014': File exists
ln: failed to create symbolic link '/home/mw/project/coco2014/train2014-1': File exists
ln: failed to create symbolic link '/home/mw/project/coco2014/train2014-2': File exists


In [4]:
import json
import os

import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torchvision.transforms as transforms

from models import Encoder, DecoderWithRNN, DecoderWithAttention
from datasets import *
from solver import *

In [5]:
# Single configuration mapping handed to the decoder constructors and to the
# train/validate routines.
cfg = dict(
    # ---- Data ----
    data_folder='/home/mw/work/work_dir',
    data_name='coco_5_cap_per_img_5_min_word_freq',  # base name shared by data files

    # ---- Model ----
    embed_dim=512,      # dimension of word embeddings
    attention_dim=512,  # dimension of attention linear layers
    decoder_dim=512,    # dimension of decoder RNN
    dropout=0.5,
    # first CUDA device when available, CPU otherwise
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),

    # ---- Training ----
    start_epoch=0,
    epochs=10,                   # upper bound on epochs (early stopping may end training sooner)
    epochs_since_improvement=0,  # epochs elapsed since validation BLEU last improved
    batch_size=32,
    workers=1,                   # data-loading workers; right now, only 1 works with h5py
    encoder_lr=1e-4,             # learning rate for encoder if fine-tuning
    decoder_lr=4e-4,             # learning rate for decoder
    grad_clip=5.,                # clip gradients at an absolute value of
    alpha_c=1.,                  # regularization parameter for 'doubly stochastic attention', as in the paper
    best_bleu4=0.,               # best BLEU-4 score so far
    print_freq=100,              # print training/validation stats every __ batches
    fine_tune_encoder=True,      # fine-tune encoder or not
    # path to checkpoint, None if none
    checkpoint='/home/mw/work/work_dir/coco2014/checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar',
    attention=True,              # train decoder with attention or not
)

# Enable only when model inputs have a fixed size; otherwise it adds a lot of computational overhead
cudnn.benchmark = True

In [6]:
# Read the word map JSON stored alongside the other data files and record its
# number of entries in the shared config as the vocabulary size.
word_map_file = os.path.join(cfg['data_folder'], 'WORDMAP_' + cfg['data_name'] + '.json')
# JSON text is UTF-8 by specification; state the encoding explicitly so the
# read does not depend on the platform's locale default.
with open(word_map_file, 'r', encoding='utf-8') as word_map_fp:
    word_map = json.load(word_map_fp)
cfg['vocab_size'] = len(word_map)

In [7]:
# Export TORCH_HOME for this process (presumably so torch caches downloaded
# pretrained weights under the work directory — confirm).
os.environ.update(TORCH_HOME='/home/mw/work/work_dir/coco2014')

In [8]:
# Either build fresh models and optimizers, or restore all of them (plus the
# training progress counters) from the checkpoint named in cfg.
if cfg['checkpoint'] is None:
    # Fresh start: new encoder, optionally with fine-tuning enabled
    encoder = Encoder()
    encoder.fine_tune(cfg['fine_tune_encoder'])
    # The encoder only gets an optimizer when it is being fine-tuned
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                         lr=cfg['encoder_lr']) if cfg['fine_tune_encoder'] else None
    # Decoder flavour is selected by the 'attention' switch
    if not cfg['attention']:
        decoder = DecoderWithRNN(cfg)
    else:
        decoder = DecoderWithAttention(cfg)
    decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                         lr=cfg['decoder_lr'])
else:
    # SECURITY: torch.load unpickles arbitrary Python objects (whole modules and
    # optimizers are stored here) — only load checkpoints from a trusted source.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # rejects pickled modules like these; pass weights_only=False there — confirm
    # against the installed version.
    # map_location remaps the stored tensors onto the configured device, so a
    # checkpoint written on a GPU machine can still be resumed on a CPU-only one.
    checkpoint = torch.load(cfg['checkpoint'], map_location=cfg['device'])
    # Resume from the epoch after the one that was saved
    cfg['start_epoch'] = checkpoint['epoch'] + 1
    cfg['epochs_since_improvement'] = checkpoint['epochs_since_improvement']
    cfg['best_bleu4'] = checkpoint['bleu-4']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    # The checkpoint may have been saved without encoder fine-tuning; if it is
    # requested now, enable it and create the missing optimizer.
    if cfg['fine_tune_encoder'] is True and encoder_optimizer is None:
        encoder.fine_tune(cfg['fine_tune_encoder'])
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=cfg['encoder_lr'])

In [9]:
# Place both networks on the configured device (GPU if available, else CPU);
# the decoder is moved first, then the encoder.
decoder, encoder = (net.to(cfg['device']) for net in (decoder, encoder))

In [10]:
# Cross-entropy loss, moved to the configured device in a second step
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(cfg['device'])

In [11]:
# Custom dataloaders for the TRAIN and VAL splits.
# Per-channel mean/std normalisation (NOTE(review): these look like the usual
# ImageNet statistics expected by pretrained torchvision encoders — confirm).
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
image_transform = transforms.Compose([normalize])
# Settings common to both loaders
loader_kwargs = dict(batch_size=cfg['batch_size'], num_workers=cfg['workers'], pin_memory=True)

# Training batches are reshuffled every epoch
train_loader = torch.utils.data.DataLoader(
    CaptionDataset(cfg['data_folder'], cfg['data_name'], 'TRAIN', transform=image_transform),
    shuffle=True, **loader_kwargs)
# Validation is iterated in a fixed order so per-batch validation logs are
# reproducible from run to run; shuffling brings no benefit at evaluation time.
val_loader = torch.utils.data.DataLoader(
    CaptionDataset(cfg['data_folder'], cfg['data_name'], 'VAL', transform=image_transform),
    shuffle=False, **loader_kwargs)

In [12]:
# Main loop: one training pass followed by one validation pass per epoch
for epoch in range(cfg['start_epoch'], cfg['epochs']):

    stale_epochs = cfg['epochs_since_improvement']

    # Stop early once 20 epochs have gone by without an improvement
    if stale_epochs == 20:
        break
    # After every 8 epochs without improvement, decay the learning rate(s)
    # by calling adjust_learning_rate with a factor of 0.8
    if stale_epochs > 0 and stale_epochs % 8 == 0:
        adjust_learning_rate(decoder_optimizer, 0.8)
        if cfg['fine_tune_encoder']:
            adjust_learning_rate(encoder_optimizer, 0.8)

    # Train for one epoch
    train(train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion,
          encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer,
          epoch=epoch, cfg=cfg)

    # Validate; the returned value is this epoch's BLEU-4
    recent_bleu4 = validate(val_loader=val_loader, encoder=encoder, decoder=decoder,
                            criterion=criterion, word_map=word_map, cfg=cfg)

    # Compare against the best score seen so far, then update the bookkeeping
    is_best = recent_bleu4 > cfg['best_bleu4']
    cfg['best_bleu4'] = max(recent_bleu4, cfg['best_bleu4'])
    if is_best:
        cfg['epochs_since_improvement'] = 0
    else:
        cfg['epochs_since_improvement'] += 1
        print("\nEpochs since last improvement: %d\n" % (cfg['epochs_since_improvement'],))

    # Persist models, optimizers and progress counters for this epoch
    save_checkpoint(cfg['data_name'], epoch, cfg['epochs_since_improvement'],
                    encoder, decoder, encoder_optimizer, decoder_optimizer,
                    recent_bleu4, is_best)

  alpha = F.softmax(e)


Epoch: [4][0/17702]	Batch Time 3.541 (3.541)	Data Load Time 0.359 (0.359)	Loss 3.0340 (3.0340)	Top-5 Accuracy 80.175 (80.175)
Epoch: [4][100/17702]	Batch Time 0.737 (0.767)	Data Load Time 0.000 (0.004)	Loss 3.2242 (3.0431)	Top-5 Accuracy 74.713 (78.795)
Epoch: [4][200/17702]	Batch Time 0.782 (0.765)	Data Load Time 0.000 (0.002)	Loss 3.1111 (3.0387)	Top-5 Accuracy 77.635 (78.651)
Epoch: [4][300/17702]	Batch Time 0.796 (0.770)	Data Load Time 0.000 (0.001)	Loss 3.4397 (3.0337)	Top-5 Accuracy 72.423 (78.666)
Epoch: [4][400/17702]	Batch Time 0.790 (0.773)	Data Load Time 0.000 (0.001)	Loss 2.9034 (3.0369)	Top-5 Accuracy 81.301 (78.619)
Epoch: [4][500/17702]	Batch Time 0.793 (0.777)	Data Load Time 0.000 (0.001)	Loss 3.1251 (3.0367)	Top-5 Accuracy 78.415 (78.601)
Epoch: [4][600/17702]	Batch Time 0.793 (0.779)	Data Load Time 0.000 (0.001)	Loss 3.2902 (3.0381)	Top-5 Accuracy 75.419 (78.581)
Epoch: [4][700/17702]	Batch Time 0.789 (0.781)	Data Load Time 0.000 (0.001)	Loss 2.8632 (3.0460)	Top-5 Acc