In [1]:
!pip install pycocotools



In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
import sys
from pycocotools.coco import COCO
from vocabulary import Vocabulary
from data_loader import get_loader
import math


## TODO #1: Select appropriate values for the Python variables below.

batch_size = 16         # batch size
vocab_threshold = 5        # minimum word count threshold
vocab_from_file = True    # if True, load existing vocab file

# (Optional) TODO #2: Amend the image transform below.
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)
print(vocab_size)




Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...


  0%|          | 0/414113 [00:00<?, ?it/s]

Done (t=0.56s)
creating index...
index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:37<00:00, 10970.18it/s]


8856


In [2]:
import torch
import torch.nn as nn
import torchvision.models as models


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet50(pretrained=True)
        for param in resnet.parameters():
            param.requires_grad_(False)
        
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)
        self.train_params = list(self.embed.parameters())

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.embed(features)
        return features
    

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embed_size)
        
        self.lstm = nn.LSTM(input_size = embed_size,hidden_size = hidden_size,
                            num_layers = num_layers, batch_first = True)
        
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.train_params = list(self.parameters())
    
    def forward(self, features, captions):
        captions = captions[:, :-1]
        embed = self.embedding_layer(captions)
        embed = torch.cat((features.unsqueeze(1), embed), dim = 1)
        lstm_outputs, _ = self.lstm(embed)
        out = self.linear(lstm_outputs)
        
        return out

    def sample(self, inputs, states=None, max_len=20):
        " accepts pre-processed image tensor (inputs) and returns predicted sentence (list of tensor ids of length max_len) "
        output_sentence = []
        for i in range(max_len):
            lstm_outputs, states = self.lstm(inputs, states)
            lstm_outputs = lstm_outputs.squeeze(1)
            out = self.linear(lstm_outputs)
            last_pick = out.max(1)[1]
            output_sentence.append(last_pick.item())
            inputs = self.embedding_layer(last_pick).unsqueeze(1)
        
        return output_sentence
    
    def beam_sample(self, inputs, states=None, max_len=20, k=1):
        " accepts pre-processed image tensor (inputs) and returns predicted sentence (list of tensor ids of length max_len) "
        possible_seq = [(1, inputs, states)]
        for i in range(max_len):
            to_pick = []
            for probs,seq,states in possible_seq:
                inputs = self.embedding_layer(seq[-1])
                lstm_outputs, states = self.lstm(inputs, states)
                out = self.linear(lstm_outputs).squeeze(0)
                sorted_out, indices = torch.sort(out, 1)
                
                for j in range(k):
                    to_pick.append((probs + nn.functional.log_softmax(sorted_out[i]), inputs + [indices[i]], states) )
                 
            to_pick.sort(reverse=True)
            possible_seq = to_pick[:k]
            
        return to_pick[0]


class EncoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, context_size, num_layers=1):
        super().__init__()
        #self.embedding_layer = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(input_size = embed_size,hidden_size = hidden_size,
                            num_layers = num_layers, batch_first = True)
        self.linear = nn.Linear(hidden_size, context_size)
        self.train_params = list(self.parameters())
    
    def forward(self, captions):
        #captions = captions[:, :-1]
        #embed = self.embedding_layer(captions)
        lstm_outputs, _ = self.lstm(captions)
        out = self.linear(lstm_outputs[:,-1,:].squeeze(1))
        return out

    
class Generator(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, context_size, num_layers=1):
        super(Generator, self).__init__()
        self.cnn = EncoderCNN(context_size)
        self.rnn = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers=num_layers)
        self.train_params = self.cnn.train_params + self.rnn.train_params 

    def forward(self, images, captions):
        features = self.cnn(images)
        output = self.rnn(features, captions)
        return output, features


class Discriminator(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, context_size, num_layers=1):
        super(Discriminator, self).__init__()
        self.embed = nn.Linear(vocab_size, embed_size)
        self.rnn = EncoderRNN(embed_size, hidden_size, vocab_size, context_size, num_layers=num_layers)
        #self.crit = nn.CosineSimilarity(dim=0, eps=1e-6)
        self.train_params = list(self.embed.parameters()) + self.rnn.train_params
        

    def forward(self, image_feat, captions):
        bs = image_feat.size(0)
        embed = self.embed(captions)
        cap_feat = self.rnn(embed)
        return cap_feat



In [3]:
context_size = 300
embed_size = 300           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
num_epochs = 3             # number of training epochs
save_every = 1000             # determines frequency of saving model weights
print_every = 100          # determines window for printing average loss
log_file = 'training_log.txt'       # name of file with saved training loss and perplexity

# Initialize the encoder and decoder. 
generator = Generator(embed_size, hidden_size, vocab_size, context_size, num_layers=2)
discriminator = Discriminator(embed_size, hidden_size, vocab_size, context_size, num_layers=2)


# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator.to(device)
discriminator.to(device)

# Define the loss function.
cosineSim = nn.CosineSimilarity(dim=0, eps=1e-6)
pos_weight = torch.Tensor([batch_size-1])
criterion_A = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()
criterion_B = nn.BCELoss(pos_weight).cuda()


# TODO #3: Specify the learnable parameters of the model.
params = generator.train_params + discriminator.train_params

# TODO #4: Define the optimizer.
optimizer = torch.optim.Adam(params = params, lr = 0.0001)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

In [5]:
import torch.utils.data as data
import numpy as np
import os
import time
import random


generator.train()
discriminator.train()

for epoch in range(1, num_epochs+1):
    
    for i_step in range(1, total_step+1):
        
        # Randomly sample a caption length, and sample indices with that length.
        indices = data_loader.dataset.get_train_indices()
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        data_loader.batch_sampler.sampler = new_sampler
        
        # Obtain the batch.
        images, captions = next(iter(data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)
        
        # Zero the gradients.
        generator.zero_grad()
        discriminator.zero_grad()
        
        # Pass the inputs through the CNN-RNN model.
        out, img_feats = generator(images, captions)
        
        rho = 0.25
        if random.random() < rho:
            gen_captions = nn.functional.softmax(out)
        else:
            gen_captions = nn.functional.gumbel_softmax(out, hard=True)
                
        cap_feats = discriminator(img_feats, gen_captions).to(device)
        
        max_loss_1 = 0
        max_loss_2 = 0
        pos_pair_term = cosineSim(cap_feats[0], img_feats[0])
        for i in range(1,batch_size):
            max_loss_1 += max(0, 1 - pos_pair_term + cosineSim(cap_feats[0], img_feats[i]))
            max_loss_2 += max(0, 1 - pos_pair_term + cosineSim(cap_feats[i], img_feats[0]))
        
        
        
        # Calculate the batch loss.
        loss_A = criterion_A(out.view(-1, vocab_size), captions.view(-1))
        loss_B = (max_loss_1 + max_loss_2)/(batch_size-1)
        
        B_weight = 0.5
        loss = loss_A + B_weight*loss_B
        
        # Backward pass.
        loss.backward()
        
        # Update the parameters in the optimizer.
        optimizer.step()
            
        # Get training statistics.
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: (%.4f, %.4f, %.4f), Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), loss_A.item(), B_weight*loss_B.item(), np.exp(loss.item()))
        
        # Print training statistics (on same line).
        print('\r' + stats, end="")
        sys.stdout.flush()
        
        # Print training statistics (on different line).
        if i_step % print_every == 0:
            print('\r' + stats)
            
        # Save the weights.
        if i_step % save_every == 0:
            torch.save(generator.state_dict(), os.path.join('/media/test/UNTITLED/scratch', f'coop-generator-{epoch}-{i_step}.pkl'))
            torch.save(discriminator.state_dict(), os.path.join('/media/test/UNTITLED/scratch', f'coop-discriminator-{epoch}-{i_step}.pkl'))





Epoch [1/3], Step [3/25883], Loss: (4.5959, 3.6289, 0.9670), Perplexity: 99.07645



Epoch [1/3], Step [100/25883], Loss: (4.9586, 3.9503, 1.0082), Perplexity: 142.3885
Epoch [1/3], Step [200/25883], Loss: (4.9526, 3.9594, 0.9932), Perplexity: 141.5416
Epoch [1/3], Step [300/25883], Loss: (4.8839, 3.8979, 0.9860), Perplexity: 132.1391
Epoch [1/3], Step [400/25883], Loss: (5.3078, 4.2650, 1.0428), Perplexity: 201.9135
Epoch [1/3], Step [500/25883], Loss: (4.6696, 3.6394, 1.0302), Perplexity: 106.6579
Epoch [1/3], Step [600/25883], Loss: (5.1971, 4.1472, 1.0499), Perplexity: 180.7523
Epoch [1/3], Step [700/25883], Loss: (4.6846, 3.6907, 0.9939), Perplexity: 108.2631
Epoch [1/3], Step [800/25883], Loss: (4.6077, 3.6290, 0.9787), Perplexity: 100.2547
Epoch [1/3], Step [900/25883], Loss: (4.3824, 3.3477, 1.0347), Perplexity: 80.02904
Epoch [1/3], Step [1000/25883], Loss: (4.8732, 3.8323, 1.0409), Perplexity: 130.7389
Epoch [1/3], Step [1100/25883], Loss: (4.9259, 3.9434, 0.9825), Perplexity: 137.8186
Epoch [1/3], Step [1200/25883], Loss: (4.2270, 3.2264, 1.0006), Perplexity