In [None]:
from models import *
from solver import *
import sys

# Silence all warnings for the whole session, but only when the interpreter
# was started without any explicit warning options.
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter(action="ignore")

# 0. Create input files

In [None]:
import gzip
import shutil
# Decompress the gzipped caption JSON into a plain JSON file in the working
# directory. Both handles are managed by one `with`, so each is closed even if
# the copy fails part-way.
with gzip.open('dataset_coco.json.gz', 'rb') as f_in, open('dataset_coco.json', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
from datasets import create_input_files


# Generate the input files (and the word map) from the decompressed caption
# JSON and the COCO image folder; results are written to ./inputData/.
create_input_files(
    dataset='coco',
    karpathy_json_path='./dataset_coco.json',
    image_folder='/datasets/COCO-2015/',
    captions_per_image=5,
    min_word_freq=5,
    output_folder='./inputData/',
    max_len=50,
)


In [None]:
# Explicit imports: `os` and `json` are otherwise only in scope if one of the
# wildcard imports at the top of the notebook happens to re-export them.
import json
import os

# Data parameters
data_folder = './inputData/'  # folder with data files saved by create_input_files.py
data_name = 'coco_5_cap_per_img_5_min_word_freq'  # base name shared by data files

# Read word map (its length is used as the decoders' vocabulary size below)
word_map_file = os.path.join(data_folder, 'WORDMAP_' + data_name + '.json')
with open(word_map_file, 'r') as j:
    word_map = json.load(j)

# Forwarded unchanged to backprop_deep as `benchmark`.
# Use False when the GPU is an RTX 2080, True otherwise.
benchmark = False

# 1. NIC (decoder without attention)

In [None]:
# Explicit import: `torch` is otherwise only in scope if one of the wildcard
# imports at the top of the notebook happens to re-export it.
import torch

# Model parameters
embed_encoder_dim = 512  # dimension of word embeddings
hidden_dim = 512  # dimension of decoder RNN
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## 1.1 VGG16 and NIC

In [None]:
# Image encoder built with the Encoder class's default arguments.
encoder = Encoder()

# Decoder without attention; its image input size is taken from the encoder.
decoder = DecoderWithoutAttention(
    image_output_dim=encoder.output_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(word_map),
    embed_encoder_dim=embed_encoder_dim,
    device=device,
)

# No checkpoint is passed by default; uncomment the second line to pass a saved one.
checkpoint = None
# checkpoint = 'BEST_checkpoint_vgg16_NIC_coco_5_cap_per_img_5_min_word_freq.pth.tar'

In [None]:

# Run backprop_deep on the current encoder/decoder pair:
# 120 epochs, decoder learning rate 4e-4.
backprop_deep(
    encoder,
    decoder,
    data_folder,
    data_name,
    word_map,
    epochs=120,
    decoder_lr=4e-4,
    checkpoint=checkpoint,
    device=device,
    benchmark=benchmark,
)

## 1.2 DenseNet161 and NIC

In [None]:
# Image encoder selected by the 'densenet161' argument.
encoder = Encoder('densenet161')

# Decoder without attention; its image input size is taken from the encoder.
decoder = DecoderWithoutAttention(
    image_output_dim=encoder.output_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(word_map),
    embed_encoder_dim=embed_encoder_dim,
    device=device,
)

# No checkpoint is passed by default; uncomment the second line to pass a saved one.
checkpoint = None
# checkpoint = 'BEST_checkpoint_densenet161_NIC_coco_5_cap_per_img_5_min_word_freq.pth.tar'

In [None]:

# Run backprop_deep on the current encoder/decoder pair:
# 120 epochs, decoder learning rate 4e-4.
backprop_deep(
    encoder,
    decoder,
    data_folder,
    data_name,
    word_map,
    epochs=120,
    decoder_lr=4e-4,
    checkpoint=checkpoint,
    device=device,
    benchmark=benchmark,
)

## 1.3 ResNet101 and NIC

In [None]:
# Image encoder selected by the 'resnet101' argument.
encoder = Encoder('resnet101')

# Decoder without attention; its image input size is taken from the encoder.
decoder = DecoderWithoutAttention(
    image_output_dim=encoder.output_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(word_map),
    embed_encoder_dim=embed_encoder_dim,
    device=device,
)

# No checkpoint is passed by default; uncomment the second line to pass a saved one.
checkpoint = None
# checkpoint = 'BEST_checkpoint_resnet101_NIC_coco_5_cap_per_img_5_min_word_freq.pth.tar'

In [None]:

# Run backprop_deep on the current encoder/decoder pair:
# 120 epochs, decoder learning rate 4e-4.
backprop_deep(
    encoder,
    decoder,
    data_folder,
    data_name,
    word_map,
    epochs=120,
    decoder_lr=4e-4,
    checkpoint=checkpoint,
    device=device,
    benchmark=benchmark,
)

# 2. NICA (decoder with attention)

In [None]:
# Explicit import: `torch` is otherwise only in scope if one of the wildcard
# imports at the top of the notebook happens to re-export it.
import torch

# Model parameters
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
hidden_dim = 512  # dimension of decoder RNN
dropout = 0.5  # passed to DecoderWithAttention as `dropout`
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 2.1 VGG16 and NICA

In [None]:
# Image encoder built with the Encoder class's default arguments.
encoder = Encoder()

# Decoder with attention; its encoder feature size is taken from the encoder.
decoder = DecoderWithAttention(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(word_map),
    encoder_dim=encoder.output_dim,
    dropout=dropout,
    device=device,
)

# No checkpoint is passed by default; uncomment the second line to pass a saved one.
checkpoint = None
# checkpoint = 'BEST_checkpoint_vgg16_NICA_coco_5_cap_per_img_5_min_word_freq.pth.tar'

In [None]:

# Run backprop_deep on the current encoder/decoder pair:
# 120 epochs, decoder learning rate 4e-4.
backprop_deep(
    encoder,
    decoder,
    data_folder,
    data_name,
    word_map,
    epochs=120,
    decoder_lr=4e-4,
    checkpoint=checkpoint,
    device=device,
    benchmark=benchmark,
)

## 2.2 DenseNet161 and NICA

In [None]:
# Image encoder selected by the 'densenet161' argument.
encoder = Encoder('densenet161')

# Decoder with attention; its encoder feature size is taken from the encoder.
decoder = DecoderWithAttention(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(word_map),
    encoder_dim=encoder.output_dim,
    dropout=dropout,
    device=device,
)

# NOTE(review): unlike the cells that leave this path commented out, this cell
# passes a saved checkpoint to backprop_deep. The file presumably has to exist
# in the working directory already - TODO confirm that this is intended.
# Replace the value with None to run without a checkpoint.
# (The former `checkpoint = None` line was a dead store, overwritten immediately.)
checkpoint = 'BEST_checkpoint_densenet161_NICA_coco_5_cap_per_img_5_min_word_freq.pth.tar'

In [None]:

# Run backprop_deep on the current encoder/decoder pair:
# 120 epochs, decoder learning rate 4e-4.
backprop_deep(
    encoder,
    decoder,
    data_folder,
    data_name,
    word_map,
    epochs=120,
    decoder_lr=4e-4,
    checkpoint=checkpoint,
    device=device,
    benchmark=benchmark,
)

## 2.3 ResNet101 and NICA

In [None]:
# Image encoder selected by the 'resnet101' argument.
encoder = Encoder('resnet101')

# Decoder with attention; its encoder feature size is taken from the encoder.
decoder = DecoderWithAttention(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    hidden_dim=hidden_dim,
    vocab_size=len(word_map),
    encoder_dim=encoder.output_dim,
    dropout=dropout,
    device=device,
)

# NOTE(review): unlike the cells that leave this path commented out, this cell
# passes a saved checkpoint to backprop_deep. The file presumably has to exist
# in the working directory already - TODO confirm that this is intended.
# Replace the value with None to run without a checkpoint.
# (The former `checkpoint = None` line was a dead store, overwritten immediately.)
checkpoint = 'BEST_checkpoint_resnet101_NICA_coco_5_cap_per_img_5_min_word_freq.pth.tar'

In [None]:

# Run backprop_deep on the current encoder/decoder pair:
# 120 epochs, decoder learning rate 4e-4.
backprop_deep(
    encoder,
    decoder,
    data_folder,
    data_name,
    word_map,
    epochs=120,
    decoder_lr=4e-4,
    checkpoint=checkpoint,
    device=device,
    benchmark=benchmark,
)