## Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as m
from transforms import *
from torchvision.transforms import Compose
from torchsummary import summary
from repeat_image_dataset import RepeatImageDataset
from random_caption_dataset import RandomCaptionDataset

# Report the installed PyTorch version and whether CUDA is usable.
print(f'PyTorch version: {torch.__version__}')
if torch.cuda.is_available():
    print("GPU found :)")
else:
    print("No GPU :(")

# Device configuration: use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

PyTorch version: 1.7.1
GPU found :)


In [10]:
# Crop size passed to RandomCrop in the data section
# (presumably the side length, in pixels, of the square network input — confirm).
IMAGE_SIZE = 224

## Data section

In [13]:
# Preprocessing pipeline shared by both training datasets.
# NOTE(review): Rescale / RandomCrop / ToTensor come from the project-local
# `transforms` module; presumably resize -> random IMAGE_SIZE crop -> tensor.
transforms = Compose([
    Rescale(256),
    RandomCrop(IMAGE_SIZE),
    ToTensor(),
])

# Two dataset views built over the same training image directory.
train_repeat_dataset = RepeatImageDataset(
    './flickr8k/images/train/',
    transform=transforms,
)
train_random_dataset = RandomCaptionDataset(
    './flickr8k/images/train/',
    transform=transforms,
)

In [25]:
# Take the first training sample, add a leading batch dimension of 1 and move
# it to the compute device as float32. The spatial size comes from IMAGE_SIZE
# (the crop size used by the transform pipeline) instead of a hard-coded 224,
# so the reshape stays valid if the crop size is changed.
im = train_repeat_dataset[0]['image'].reshape((1, 3, IMAGE_SIZE, IMAGE_SIZE)).to(device, dtype=torch.float)
im

tensor([[[[0.3001, 0.3394, 0.1218,  ..., 0.5300, 0.4467, 0.4237],
          [0.3563, 0.5572, 0.2906,  ..., 0.1595, 0.1078, 0.1156],
          [0.4119, 0.6038, 0.2315,  ..., 0.1693, 0.2155, 0.1227],
          ...,
          [0.3216, 0.0864, 0.1305,  ..., 0.1257, 0.1305, 0.1331],
          [0.5192, 0.0628, 0.0767,  ..., 0.0713, 0.0581, 0.0590],
          [0.2118, 0.0631, 0.0618,  ..., 0.1037, 0.1267, 0.2224]],

         [[0.2874, 0.2998, 0.1138,  ..., 0.5834, 0.4115, 0.3902],
          [0.3603, 0.5125, 0.2800,  ..., 0.1868, 0.1723, 0.1322],
          [0.3821, 0.5239, 0.2523,  ..., 0.2341, 0.2368, 0.1680],
          ...,
          [0.4719, 0.0370, 0.0904,  ..., 0.1049, 0.0763, 0.0841],
          [0.6379, 0.0890, 0.0509,  ..., 0.0794, 0.0614, 0.0678],
          [0.2560, 0.0490, 0.0575,  ..., 0.0678, 0.1212, 0.1889]],

         [[0.2510, 0.2600, 0.1221,  ..., 0.5579, 0.3617, 0.2730],
          [0.3363, 0.4564, 0.2899,  ..., 0.0463, 0.0929, 0.0680],
          [0.3506, 0.4818, 0.2495,  ..., 0

## Model section

In [16]:
# Retrieve the pretrained model used for feature extraction.
# It is switched to eval mode so that its BatchNorm layers use the pretrained
# running statistics and do not update them on forward passes (including the
# random-input pass that torchsummary performs).
# NOTE(review): call .train() on it explicitly if fine-tuning the CNN is intended.
resnet = m.resnet18(pretrained=True).to(device).eval()

In [28]:
# Build the feature extractor: every child module of the pretrained network
# except the last two, followed by a Flatten so each image yields one vector.
# (For torchvision's ResNet-18 the two dropped children are presumably the
# average-pool and fully-connected layers — confirm with the summary below.)
features = nn.Sequential(
    *list(resnet.children())[:-2],
    nn.Flatten(),
)
summary(features, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

## Build LSTM + Embedding

In [30]:
# Number of entries in the word-embedding table (one row per vocabulary index).
# NOTE(review): hard-coded; presumably the size of the caption vocabulary — confirm.
vocab_size = 9631

class LSTMCaptioning(nn.Module):
    """Single-layer LSTM that processes a sequence of input vectors.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of features in the LSTM hidden state.
    """

    def __init__(self, input_size, hidden_size):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call `self.lstm = ...` raises AttributeError.
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size)

    def forward(self, inputs):
        """Run the LSTM over ``inputs`` treated as one sequence with batch size 1.

        Args:
            inputs: Tensor whose first dimension is the sequence length; all
                remaining dimensions are flattened into the feature axis.

        Returns:
            A tuple ``(out, ht, pt)`` where
            ``out`` holds the hidden state at every step, shape
            ``(len(inputs), 1, hidden_size)``;
            ``ht`` is the ``(h_n, c_n)`` tuple of final hidden and cell states;
            ``pt`` is a softmax over the final hidden state, shape
            ``(1, hidden_size)``.
        """
        # Get hidden states for each t (out), and the latest one (ht = (h_n, c_n))
        out, ht = self.lstm(inputs.view(len(inputs), 1, -1))

        # nn.LSTM returns its final state as a (h_n, c_n) tuple; softmax must be
        # applied to the hidden-state tensor, not to the tuple itself.
        h_n, _ = ht

        # Normalise the last layer's final hidden state over its feature axis.
        # NOTE(review): there is no projection to the vocabulary here, so this
        # is a distribution over hidden_size units, not over words.
        pt = F.softmax(h_n[-1], dim=1)

        return out, ht, pt

# Lookup table mapping each of the vocab_size word indices to a 25088-dim vector.
# NOTE(review): 25088 presumably equals 512*7*7, the flattened output of
# `features` for 224x224 inputs — confirm against the model summary.
# With float32 weights this table holds 9631 * 25088 (about 241.6M) values, ~0.97 GB.
embedding = nn.Embedding(vocab_size, 25088)

## Train model

In [None]:
# Width of each vector fed to the LSTM. nn.LSTM needs an int here, not an
# image shape: the model receives flattened vectors, and the word embeddings
# are built with this same dimensionality.
input_size = embedding.embedding_dim
hidden_size = ...  # TODO: choose the LSTM hidden size (still a placeholder)

model = LSTMCaptioning(input_size, hidden_size)

# The loop below calls optimizer.step(), so an optimizer must exist.
# NOTE(review): Adam with default settings is a placeholder choice — confirm.
optimizer = optim.Adam(model.parameters())

num_epoch = 10
# NOTE(review): nn.NLLLoss expects log-probabilities, but `out` below is the
# raw LSTM output — confirm the intended loss input.
loss_function = nn.NLLLoss()

for epoch in range(num_epoch):
    # NOTE(review): `dataset` is not defined in this notebook; presumably one
    # of train_repeat_dataset / train_random_dataset — confirm what it yields.
    for image, caption in dataset:

        # Reset grad
        model.zero_grad()

        # Get the input image embedding
        image_embedding = features(image)

        # Forward pass for t=-1: image
        out, hidden, probs = model(image_embedding)

        # Forward pass for t>=0: words
        # NOTE(review): `preprocess` is not defined in this notebook, and the
        # word embeddings are computed but never passed to the model.
        for word in caption:
            word_vector = preprocess(word)
            word_embedding = embedding(word_vector)

        # Compute loss and backprop
        # NOTE(review): `targets` is not defined in this notebook.
        loss = loss_function(out, targets)
        loss.backward()
        optimizer.step()
        

In [None]:
# Save the model weights. torch.save needs a file path rather than a
# directory; the ./models/ directory itself must already exist.
torch.save(model.state_dict(), './models/lstm_captioning.pt')