In [1]:
import requests
import os
import zipfile
import sys

import json
import numpy as np
import pickle

from collections import Counter

import torch
from torch import nn
import torch.nn.functional as F

import tqdm
from tqdm import tqdm_notebook
from IPython.display import clear_output

from torch.utils.data import Dataset, DataLoader

from random import choice

In [2]:
def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            total_length = r.headers.get('content-length')
            if total_length is not None:
                dl = 0
                total_length = int(total_length)
            for chunk in r.iter_content(chunk_size=8192): 
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk: 
                f.write(chunk)
                if total_length is not None:
                    dl += len(chunk)
                    done = int(50 * dl / total_length)
                    sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )    
                    sys.stdout.flush()
    return local_filename

In [3]:
if not os.path.exists('data'):
    link = 'https://github.com/Neznakomec/ml-project-transformers/releases/download/release_1/1_preprocessed_files.zip'
    download_file(link)
    with zipfile.ZipFile('1_preprocessed_files.zip', 'r') as zip_ref:
        zip_ref.extractall('./data')
    os.remove('1_preprocessed_files.zip')



# 1. Building a vocabulary of words

In [4]:
captions_dict = json.load(open('data/captions_tokenized.json'))
captions = list(captions_dict.values())

In [5]:
# Build a Vocabulary

word_counts = Counter()

# Compute word frequencies for each word in captions. See code above for data structure

for img_i in range(len(captions)):
    for caption_i in range(len(captions[img_i])):
        sentence = captions[img_i][caption_i]
        for token in sentence:
          word_counts[token] += 1

In [None]:
word_counts

Counter({'#START#': 40460,
         'a': 69581,
         'man': 8295,
         'in': 18975,
         'street': 971,
         'racer': 66,
         'armor': 2,
         'be': 13268,
         'examine': 22,
         'the': 10724,
         'tire': 123,
         'of': 6713,
         'another': 956,
         "'s": 400,
         'motorbike': 46,
         '.': 36579,
         '#END#': 40460,
         'two': 5636,
         'drive': 147,
         'white': 3941,
         'bike': 982,
         'down': 1825,
         'road': 389,
         'motorist': 1,
         'ride': 1638,
         'along': 527,
         'on': 10746,
         'their': 693,
         'vehicle': 101,
         'that': 397,
         'oddly': 2,
         'design': 6,
         'and': 8855,
         'color': 56,
         'person': 3973,
         'small': 1345,
         'race': 656,
         'car': 494,
         'by': 1248,
         'green': 1226,
         'hill': 478,
         'uniform': 385,
         'firefighter': 10,
         'extin

In [6]:
vocab  = ['#UNK#', '#START#', '#END#', '#PAD#']
vocab += [k for k, v in word_counts.items() if v >= 5 if k not in vocab]
n_tokens = len(vocab)

word_to_index = {w: i for i, w in enumerate(vocab)}

In [7]:
eos_ix = word_to_index['#END#']
unk_ix = word_to_index['#UNK#']
pad_ix = word_to_index['#PAD#'] # before idx=3 (PAD) are special tokens, after idx=3 - real words

def as_matrix(sequences, max_len=None):
    """ Convert a list of tokens into a matrix with padding """
    max_len = max_len or max(map(len,sequences))
    
    matrix = np.zeros((len(sequences), max_len), dtype='int32') + pad_ix
    for i,seq in enumerate(sequences):
        row_ix = [word_to_index.get(word, unk_ix) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix
    
    return matrix

In [8]:
# Try it out on several descriptions of a random image
as_matrix(captions_dict['1305564994_00513f9a5b'])

array([[ 1,  4,  5,  6,  7,  8,  0,  9, 10, 11, 12, 13, 14,  8, 15, 16,
        17,  2],
       [ 1, 18,  8, 19,  4, 20, 21, 22,  4, 23, 17,  2,  3,  3,  3,  3,
         3,  3],
       [ 1, 18,  0,  9, 24, 25, 26, 27, 28, 29,  9,  0, 30, 31, 32, 17,
         2,  3],
       [ 1, 18, 33,  9,  6,  4, 34, 35, 36, 19, 37,  4, 38, 39, 17,  2,
         3,  3],
       [ 1, 18, 33,  6, 35, 40,  6,  4,  7, 36, 17,  2,  3,  3,  3,  3,
         3,  3]], dtype=int32)

In [9]:
as_matrix(captions[0])

array([[ 1,  4,  5,  6,  7,  8,  0,  9, 10, 11, 12, 13, 14,  8, 15, 16,
        17,  2],
       [ 1, 18,  8, 19,  4, 20, 21, 22,  4, 23, 17,  2,  3,  3,  3,  3,
         3,  3],
       [ 1, 18,  0,  9, 24, 25, 26, 27, 28, 29,  9,  0, 30, 31, 32, 17,
         2,  3],
       [ 1, 18, 33,  9,  6,  4, 34, 35, 36, 19, 37,  4, 38, 39, 17,  2,
         3,  3],
       [ 1, 18, 33,  6, 35, 40,  6,  4,  7, 36, 17,  2,  3,  3,  3,  3,
         3,  3]], dtype=int32)

In [10]:
class CaptionNet(nn.Module):
    
    def __init__(self, n_tokens=n_tokens, emb_size=128, lstm_units=256, cnn_feature_size=2048):
        """ A recurrent 'head' network for image captioning. See scheme above. """
        super().__init__()
        
        # a layer that converts conv features to initial_h (h_0) and initial_c (c_0)
        self.cnn_to_h0 = nn.Linear(cnn_feature_size, lstm_units)
        self.cnn_to_c0 = nn.Linear(cnn_feature_size, lstm_units)

        # create embedding for input words. Use the parameters (e.g. emb_size).
        self.embedding = nn.Embedding(num_embeddings=n_tokens, embedding_dim=emb_size)
            
        # lstm: create a recurrent core of your network. Use either LSTMCell or just LSTM. 
        # In the latter case (nn.LSTM), make sure batch_first=True
        self.lstm = nn.LSTM(input_size=emb_size, num_layers = 10,
                            hidden_size = lstm_units, batch_first = True)
            
        # create logits: linear layer that takes lstm hidden state as input and computes one number per token
        self.logits = nn.Linear(in_features = lstm_units, out_features=n_tokens)
        
    def forward(self, image_vectors, captions_ix):
        """ 
        Apply the network in training mode. 
        :param image_vectors: torch tensor containing inception vectors. shape: [batch, cnn_feature_size]
        :param captions_ix: torch tensor containing captions as matrix. shape: [batch, word_i]. 
            padded with pad_ix
        :returns: logits for next token at each tick, shape: [batch, word_i, n_tokens]
        """

        self.lstm.flatten_parameters()

        initial_cell = self.cnn_to_c0(image_vectors)
        initial_hid = self.cnn_to_h0(image_vectors)
        #print(initial_cell.shape, initial_hid.shape)
        initial_hid = initial_hid.repeat(10, 1, 1)
        initial_cell = initial_cell.repeat(10, 1, 1)
        #print(initial_cell.shape, initial_hid.shape, initial_cell[None].shape)
        
        # compute embeddings for captions_ix
        caption_embedding = self.embedding(captions_ix)

        # apply recurrent layer to captions_emb. 
        # 1. initialize lstm state with initial_* from above
        # 2. feed it with captions. Mind the dimension order in docstring
        # 3. compute logits for next token probabilities
        # Note: if you used nn.LSTM, you can just give it (initial_cell[None], initial_hid[None]) as second arg

        # lstm_out should be lstm hidden state sequence of shape [batch, caption_length, lstm_units]
        lstm_out, (hn, cn) = self.lstm.forward(caption_embedding, (initial_cell, initial_hid))

        # compute logits from lstm_out
        out = self.logits(lstm_out)
        return out

In [11]:
network = CaptionNet(n_tokens)

In [12]:
dummy_img_vec = torch.randn(len(captions[0]), 2048)
dummy_capt_ix = torch.tensor(as_matrix(captions[0]), dtype=torch.int64)

dummy_logits = network.forward(dummy_img_vec, dummy_capt_ix)

print('shape:', dummy_logits.shape)
assert dummy_logits.shape == (dummy_capt_ix.shape[0], dummy_capt_ix.shape[1], n_tokens)

shape: torch.Size([5, 18, 2363])


In [13]:
def compute_loss(network, image_vectors, captions_ix):
    """
    :param image_vectors: torch tensor containing inception vectors. shape: [batch, cnn_feature_size]
    :param captions_ix: torch tensor containing captions as matrix. shape: [batch, word_i]. 
        padded with pad_ix
    :returns: crossentropy (neg llh) loss for next captions_ix given previous ones. Scalar float tensor
    """
    
    # captions for input - all except last because we don't know next token for last one.
    captions_ix_inp = captions_ix[:, :-1].contiguous()
    captions_ix_next = captions_ix[:, 1:].contiguous()
    
    # apply the network, get predictions for captions_ix_next
    logits_for_next = network.forward(image_vectors, captions_ix_inp)
    
    # compute the loss function between logits_for_next and captions_ix_next
    # Use the mask!
    # Make sure that predicting next tokens after EOS do not contribute to loss
    # You can do that either by multiplying elementwise loss by (captions_ix_next != pad_ix)
    # or by using ignore_index in some losses.
    
    criterion = nn.CrossEntropyLoss(ignore_index=pad_ix)
    rebased_logits = logits_for_next.view(logits_for_next.shape[0] * logits_for_next.shape[1], logits_for_next.shape[2])
    rebased_captions = captions_ix_next.view(-1)

    res = criterion(rebased_logits, rebased_captions)
    return res

# 2. Loading img_codes and captions, for both train and test

In [14]:
train_img_codes = pickle.load(open('data/train_image_extracted.pkl', 'rb'))
train_img_codes.shape

torch.Size([6000, 2048])

In [15]:
js = json.load(open('data/train_image_captions.json'))
train_names = [path.split('/')[-1].split('.')[0] for path in js]
train_captions = [captions_dict[name] for name in train_names]

if not type(train_captions) is np.ndarray:
    train_captions = np.array(train_captions, dtype=object)

In [16]:
val_img_codes = pickle.load(open('data/test_image_extracted.pkl', 'rb'))
val_img_codes.shape

torch.Size([1000, 2048])

In [17]:
js = json.load(open('data/test_image_captions.json'))
val_names = [path.split('/')[-1].split('.')[0] for path in js]
val_captions = [captions_dict[name] for name in val_names]

if not type(val_captions) is np.ndarray:
    val_captions = np.array(val_captions, dtype=object)

# 3. Training network

In [18]:
def generate_batch(img_codes, captions, batch_size, max_caption_len=None):
    
    # sample random numbers for image/caption indicies
    random_image_ix = np.random.randint(0, len(img_codes), size=batch_size)
    
    # get images
    batch_images = img_codes[random_image_ix]
    
    # 5-7 captions for each image
    captions_for_batch_images = captions[random_image_ix]
    
    # pick one from a set of captions for each image
    batch_captions = list(map(choice,captions_for_batch_images))
    
    # convert to matrix
    batch_captions_ix = as_matrix(batch_captions,max_len=max_caption_len)
    
    return torch.tensor(batch_images, dtype=torch.float32), \
        torch.tensor(batch_captions_ix, dtype=torch.int64)

In [19]:
def caption_codes(captions, max_caption_len=None):
    # pick one from a set of captions for each image
    batch_captions = list(map(choice,captions))
    
    # convert to matrix
    captions_ix = as_matrix(batch_captions, max_len=max_caption_len)
    
    return torch.tensor(captions_ix, dtype=torch.int64)

In [20]:
DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
network = CaptionNet(n_tokens).to(DEVICE)
optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

batch_size = 128
n_epochs = 100
#n_batches_per_epoch = 50
n_validation_batches = 5  # how many batches are used for validation after each epoch

In [23]:
np.random.seed(0)
for epoch in range(n_epochs):
    
    train_loss = 0
    network.train()
    
    # get 1 random sentence from 5-7 candidates
    train_caption_codes = caption_codes(train_captions)
    loader_captions = torch.utils.data.DataLoader(train_caption_codes, batch_size=batch_size)
    loader_img_codes = torch.utils.data.DataLoader(train_img_codes, batch_size=batch_size)
    N = len(train_caption_codes) // batch_size + 1
    
    for batch_captions, batch_img_codes, _ in zip(loader_captions, loader_img_codes, tqdm.notebook.tqdm(range(N))):
        images, captions = batch_img_codes, batch_captions
        images = images.to(DEVICE)
        captions = captions.to(DEVICE)

        loss_t = compute_loss(network, images, captions)
        
        # clear old gradients; do a backward pass to get new gradients; then train with opt
        optimizer.zero_grad()
        loss_t.backward()
        optimizer.step()
        
        train_loss += loss_t.detach().cpu().numpy()
        
    train_loss /= len(train_caption_codes)
    
    val_loss = 0
    network.eval()
    for _ in range(n_validation_batches):
        images, captions = generate_batch(val_img_codes, val_captions, batch_size)
        images = images.to(DEVICE)
        captions = captions.to(DEVICE)
        
        with torch.no_grad():
            loss_t = compute_loss(network, images, captions)

        val_loss += loss_t.detach().cpu().numpy()

    val_loss /= n_validation_batches
    
    clear_output()
    print('\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch + 1, train_loss, val_loss))

print("Finished!")


Epoch: 100, train loss: 0.026456695914268493, val loss: 4.158072090148925
Finished!


In [24]:
if True:
    torch.save(network.state_dict(), 'model_10layers.pt')
    
if False:
    network.load_state_dict(torch.load('model_10layers.pt', map_location=DEVICE))