## Imports

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as m
from torch.utils.data import DataLoader
from transforms import *
from torchvision.transforms import Compose
from torchsummary import summary
from repeat_image_dataset import RepeatImageDataset
from random_caption_dataset import RandomCaptionDataset
from text_preprocessing import *

# Report the runtime environment; CUDA availability is queried once and reused.
gpu_available = torch.cuda.is_available()
print(f'PyTorch version: {torch.__version__}')
if gpu_available:
    print("GPU found :)")
else:
    print("No GPU :(")

# Device on which the models and tensors below are placed
device = torch.device('cuda') if gpu_available else torch.device('cpu')

PyTorch version: 1.7.1
GPU found :)


In [3]:
# Size passed to RandomCrop in the data pipeline (the commented-out summary call
# uses it as both height and width of the CNN input).
IMAGE_SIZE: int = 224
# Output width of the CNN's final Linear layer and of the word embedding vectors.
EMBEDDING_SIZE: int = 300

## Data section

In [4]:
# Caption pre-processing helper, built from the ';'-separated annotations CSV.
# Later cells read its vocab_size and pass it to the OneHotEncode transform.
text_preprocessor = TextPreprocessor(
    './flickr8k/annotations/annotations_image_id.csv',
    sep=';',
)

In [10]:
# Per-sample pipeline, applied in the listed order to every dataset item.
transforms = Compose([
    Rescale(256),
    RandomCrop(IMAGE_SIZE),
    ToTensor(),
    Normalize(),
    OneHotEncode(text_preprocessor),
])

# Both training datasets read the same image folder and share the pipeline above.
train_images_dir = './flickr8k/images/train/'
train_repeat_dataset = RepeatImageDataset(train_images_dir, transform=transforms)
train_random_dataset = RandomCaptionDataset(train_images_dir, transform=transforms)

array([0., 0., 0., ..., 0., 0., 0.])

In [7]:
# Number of samples per batch; also used later to shape the LSTM inputs/state.
batch_size = 1

# One loader per training dataset, both with the same batch size
train_repeat_loader, train_random_loader = (
    DataLoader(dataset, batch_size=batch_size)
    for dataset in (train_repeat_dataset, train_random_dataset)
)

## Model section

In [6]:
# Load a ResNet-18 with pretrained weights; its layers are reused below as the
# image feature extractor.
resnet = m.resnet18(pretrained=True)

In [7]:
# Image encoder: every ResNet child except the last two, followed by a flatten
# and a linear projection down to EMBEDDING_SIZE.
backbone_layers = list(resnet.children())[:-2]
cnn = nn.Sequential(
    *backbone_layers,
    nn.Flatten(),
    # 512 * 7 * 7 == 25088 flattened features (for a 224x224 input image)
    nn.Linear(512 * 7 * 7, EMBEDDING_SIZE),
)
cnn = cnn.to(device, dtype=torch.float)
#summary(cnn, (3, IMAGE_SIZE, IMAGE_SIZE))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

## Build LSTM + Embedding

In [8]:
# Vocabulary size reported by the text pre-processor; used below as the number
# of embedding rows and as the LSTM hidden size.
vocab_size = text_preprocessor.vocab_size

# Recurrent captioning model: a single-layer LSTM
class LSTMCaptioning(nn.Module):
    """Single-layer LSTM that also returns a softmax over its output features.

    Args:
        input_size: size of each input vector fed to the LSTM.
        hidden_size: size of the LSTM hidden state, which is also the size of
            the distribution returned by ``forward``.
    """

    def __init__(self, input_size, hidden_size):
        super(LSTMCaptioning, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size)

    def forward(self, x, previous_state):
        """Run the LSTM on ``x`` starting from ``previous_state``.

        Args:
            x: input of shape (seq_len, batch, input_size), the default
                ``nn.LSTM`` layout (``batch_first=False``).
            previous_state: tuple ``(h, c)``, each of shape
                (1, batch, hidden_size).

        Returns:
            ``(out, (hn, cn), pt)`` where ``out`` holds the LSTM outputs for
            every step, ``(hn, cn)`` is the final state and ``pt`` is ``out``
            normalised with a softmax over the feature axis.
        """
        # Get hidden states for each t (out), and latest one (h = (ht, ct))
        out, (hn, cn) = self.lstm(x, previous_state)

        # Compute probability distribution over all words for this t.
        # The feature axis is the LAST one: axis 1 of `out` is the batch axis,
        # so normalising over it would mix samples (and yield all ones when
        # the batch size is 1).
        pt = F.softmax(out, dim=-1)

        return out, (hn, cn), pt

# Word-index -> vector lookup table. Its weights are never handed to the
# optimizer, so they keep their initial values (not trained).
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=EMBEDDING_SIZE)
embedding = embedding.to(device)

## Train model

In [9]:
# --- Model, optimizer and loss ------------------------------------------------
input_size = EMBEDDING_SIZE
# NOTE(review): the LSTM output is used directly as per-word scores, hence
# hidden_size == vocab_size. The recurrent weight matrix alone then holds
# 4 * vocab_size**2 values, which is a likely cause of CUDA out-of-memory
# errors; consider a smaller hidden size followed by a Linear projection.
hidden_size = vocab_size

model = LSTMCaptioning(input_size, hidden_size).to(device, dtype=torch.float)

learning_rate = 0.001
# Only the LSTM is optimised; the CNN encoder and the word embedding stay fixed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_epoch = 10
# NLLLoss expects log-probabilities of shape (batch, classes) and class-index
# targets of shape (batch,).
loss_function = nn.NLLLoss()

# Random init the lstm state
h0 = torch.rand((1, batch_size, hidden_size)).to(device, dtype=torch.float)
c0 = torch.rand((1, batch_size, hidden_size)).to(device, dtype=torch.float)

# Number of batches per epoch, and how often (in batches) progress is printed:
# roughly ten log lines per epoch, at least every batch for tiny datasets.
step_count = len(train_random_loader)
log_every = max(1, step_count // 10)

# --- Training -----------------------------------------------------------------
for epoch in range(num_epoch):
    for i, sample in enumerate(train_random_loader):

        image = sample['image'].to(device, dtype=torch.float)
        caption = sample['caption'].to(device, dtype=torch.long)

        # The DataLoader adds a leading batch axis.
        # assumes caption is (batch, num_words, vocab_size) one-hot — TODO confirm
        targets = caption.argmax(dim=-1)      # (batch, num_words) word indices
        num_words = targets.shape[1]

        # Reset grad
        optimizer.zero_grad()

        # Get the input image embedding. The CNN is not optimised, so skip
        # building its autograd graph to save memory.
        with torch.no_grad():
            image_embedding = cnn(image).view(1, batch_size, EMBEDDING_SIZE)

        # Forward pass for t=-1: image
        out, (hn, cn), _ = model(image_embedding, (h0, c0))

        # Compute loss for 1st word prediction
        log_probs = F.log_softmax(out.view(batch_size, -1), dim=-1)
        loss = loss_function(log_probs, targets[:, 0])

        # Forward pass for t>=0: n - 1 first words of the sentence
        for j in range(num_words - 1):

            # Encode word j to hidden space (embedding is not trained)
            with torch.no_grad():
                word_embedding = embedding(targets[:, j]).view(1, batch_size, EMBEDDING_SIZE)

            # Feed the rnn
            out, (hn, cn), _ = model(word_embedding, (hn, cn))

            # Add the loss for predicting word j + 1
            log_probs = F.log_softmax(out.view(batch_size, -1), dim=-1)
            loss = loss + loss_function(log_probs, targets[:, j + 1])

        # Backprop the summed per-word loss and update the LSTM
        loss.backward()
        optimizer.step()

        # Progress report
        if (i + 1) % log_every == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epoch}]"
                f", step [{i + 1}/{step_count}]"
                f", loss: {loss.item():.4f}"
            )
        

RuntimeError: CUDA out of memory. Tried to allocate 1.38 GiB (GPU 0; 1.96 GiB total capacity; 661.49 MiB already allocated; 649.00 MiB free; 688.00 MiB reserved in total by PyTorch)

In [None]:
# Save the LSTM weights.
# torch.save expects a file path (or file-like object), not a directory, so the
# checkpoint gets an explicit file name. The ./models/ directory must exist.
torch.save(model.state_dict(), './models/lstm_captioning.pt')