## Imports

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as m
from torch.utils.data import DataLoader
from transforms import *
from torchvision.transforms import Compose
from torchsummary import summary
from random_caption_dataset import RandomCaptionDataset
from text_preprocessing import *

print(f'PyTorch version: {torch.__version__}')
print("GPU found :)" if torch.cuda.is_available() else "No GPU :(")

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device='cpu'

PyTorch version: 1.7.1
GPU found :)


In [32]:
IMAGE_SIZE = 224
EMBEDDING_SIZE = 512
train_annotations_file = './flickr8k/annotations/annotations_image_id_train.csv'
test_annotations_file = './flickr8k/annotations/annotations_image_id_test.csv'

## Datas section

In [33]:
# Init text preprocessing class
tp = TextPreprocessor(train_annotations_file, sep=';')

In [34]:
transforms = Compose([Rescale(256), 
                      RandomCrop(IMAGE_SIZE), 
                      ToTensor(), Normalize(),
                      OneHotEncode(tp)])

train_random_dataset = RandomCaptionDataset('./flickr8k/images/train/', train_annotations_file, transform=transforms)

print(f'Random dataset size: {len(train_random_dataset)}')

Random dataset size: 6000


In [35]:
batch_size = 1

# Build data loader
train_random_loader = DataLoader(train_random_dataset, batch_size=batch_size)

## Model section

In [36]:
# retrieve pretrained model for features extraction
base_cnn = m.resnet18(pretrained=True)
#base_cnn

In [37]:
# Keep only the feature extraction layers of the model
cnn = nn.Sequential(*(list(base_cnn.children())[:-1])).to(device, dtype=torch.float)
#summary(cnn, (3, IMAGE_SIZE, IMAGE_SIZE))

## Build LSTM + Embedding

In [38]:
vocab_size = tp.vocab_size
print(vocab_size)

# RNN with LSTM of  layer
class LSTMCaptioning(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMCaptioning, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, previous_state):
         
        # Get hidden states for each t (out) , and latest one (h = (ht, ct))
        lstm_out, (hn, cn) = self.lstm(x, previous_state)
        
        # Convert output of rnn to output targeted size
        out = self.hidden2out(lstm_out.view(1, -1))
        
        # Compute probability distribution over all words for this t
        pt = F.log_softmax(out, dim=1)
                           
        return (hn, cn), pt

# Not trained embedding layer to encode words to hidden space
embedding = nn.Embedding(vocab_size, EMBEDDING_SIZE).to(device)

8255


## Train model

In [39]:
input_size = EMBEDDING_SIZE
hidden_size = 300

model = LSTMCaptioning(input_size, hidden_size, vocab_size).to(device, dtype=torch.float)

In [11]:
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_epoch = 10
step_count = len(train_random_loader)
loss_function = nn.NLLLoss()

# Random init the lstm state
h0 = torch.rand((1, batch_size, hidden_size)).to(device, dtype=torch.float)
c0 = torch.rand((1, batch_size, hidden_size)).to(device, dtype=torch.float)


for epoch in range(num_epoch):
    for i, sample in enumerate(train_random_loader):
        
        image = sample['image'].to(device, dtype=torch.float)
        caption = sample['caption'].to(device, dtype=torch.long)
        
        # Reset grad
        model.zero_grad()
        
        # Get the input image embedding 
        image_embedding = cnn(image).view(-1, batch_size, EMBEDDING_SIZE)
        
        
        # Forward pass for t=-1: image
        (hn, cn), probs = model(image_embedding, (h0, c0))
        
        del image_embedding
        del image
        
        target = tp.target_from_vect(caption[:, 0]).to(device)
        
        # Compute loss for 1st word prediction
        loss = loss_function(probs, target)
        
        # Forward pass for t>=0: n - 1 first words of the sentence
        for j, word in enumerate(caption[:, :-1]):

            
            # Get index of the word in embedding matrix
            idxs = torch.argmax(word)
            
            # Encode word to hidden space
            word_embedding = embedding(idxs).view(1, batch_size, EMBEDDING_SIZE)
            
            # Feed the rnn
            (hn, cn), probs = model(word_embedding, (hn, cn))
            
            target = tp.target_from_vect(caption[:, j+1]).to(device)
            
            # Add current word's loss
            loss += loss_function(probs, target)

        
        # Compute loss and backprop
        loss.backward()
        optimizer.step()
        
        
        # Debug
        if((i+1) % (step_count/10) == 0):
            print(
                        f"Epoch [{epoch + 1}/{num_epoch}]"
                        f", step [{i + 1}/{step_count}]"
                        f", loss: {loss.item():.4f}"
                    )
        

Epoch [1/10], step [600/6000], loss: 0.9465
Epoch [1/10], step [1200/6000], loss: 1.8174
Epoch [1/10], step [1800/6000], loss: 0.3157
Epoch [1/10], step [2400/6000], loss: 0.3442
Epoch [1/10], step [3000/6000], loss: 0.4863
Epoch [1/10], step [3600/6000], loss: 0.4348
Epoch [1/10], step [4200/6000], loss: 2.2075
Epoch [1/10], step [4800/6000], loss: 16.5890
Epoch [1/10], step [5400/6000], loss: 2.9704
Epoch [1/10], step [6000/6000], loss: 2.3771
Epoch [2/10], step [600/6000], loss: 0.5536
Epoch [2/10], step [1200/6000], loss: 1.7598
Epoch [2/10], step [1800/6000], loss: 0.3626
Epoch [2/10], step [2400/6000], loss: 17.1973
Epoch [2/10], step [3000/6000], loss: 0.4023
Epoch [2/10], step [3600/6000], loss: 0.5249
Epoch [2/10], step [4200/6000], loss: 2.1344
Epoch [2/10], step [4800/6000], loss: 16.1560
Epoch [2/10], step [5400/6000], loss: 0.5574
Epoch [2/10], step [6000/6000], loss: 0.4632
Epoch [3/10], step [600/6000], loss: 0.5931
Epoch [3/10], step [1200/6000], loss: 1.6696
Epoch [3/1

In [13]:
# Save model 
torch.save(model.state_dict(), './models/model_v1_random')

## Test model performances

In [40]:
# Load model for evaluation
trained_model = LSTMCaptioning(input_size, hidden_size, vocab_size)
trained_model.load_state_dict(torch.load('./models/model_v1_random'))
trained_model.to(device)

LSTMCaptioning(
  (lstm): LSTM(512, 300)
  (hidden2out): Linear(in_features=300, out_features=8255, bias=True)
)

In [41]:
# Create test loaders for datasets

# Only preprocess images
test_transforms = Compose([Rescale(256), 
                      RandomCrop(IMAGE_SIZE), 
                      ToTensor(), 
                      Normalize()])

test_random_dataset = RandomCaptionDataset('./flickr8k/images/test', test_annotations_file, transform=test_transforms)

test_random_loader = DataLoader(test_random_dataset, batch_size=batch_size)

In [43]:
with torch.no_grad():
    
    for sample in test_random_loader:
        
        caption = list()

        # Random init the lstm state
        h0 = torch.rand((1, batch_size, hidden_size)).to(device, dtype=torch.float)
        c0 = torch.rand((1, batch_size, hidden_size)).to(device, dtype=torch.float)

        # Encode input image
        image = sample['image'].to(device, dtype=torch.float)
        image_embedding = cnn(image).view(-1, batch_size, EMBEDDING_SIZE).to(device)

        # Get first word prediction probabilities
        (hn, cn), probs = model(image_embedding, (h0, c0))

        # Extract predicted word
        pred_idx = torch.argmax(probs)
        pred_word_vect = tp.encoding_matrix[pred_idx]
        predicted_word = tp.vect_to_word(pred_word_vect)

        caption.append(predicted_word)
        
        print(predicted_word)
        print(pred_idx)
        
        i = 0
        # Build caption until model outputs stop word
        while predicted_word != '<stop>' and i < 20:

            word_embedding = embedding(pred_idx).view(1, batch_size, EMBEDDING_SIZE).to(device)

            (hn, cn), probs = model(word_embedding, (hn, cn))

            pred_idx = torch.argmax(probs)
            pred_word_vect = tp.encoding_matrix[pred_idx]
            predicted_word = tp.vect_to_word(pred_word_vect)

            caption.append(predicted_word)

            print(predicted_word)
            print(pred_idx)
            
            i += 1

        caption = " ".join(caption)

        print(caption)
        
        break

situated
tensor(6631, device='cuda:0')
rodent
tensor(6167, device='cuda:0')
cheered
tensor(2094, device='cuda:0')
window
tensor(8119, device='cuda:0')
enclosure
tensor(3022, device='cuda:0')
checker
tensor(2085, device='cuda:0')
kite
tensor(4354, device='cuda:0')
hind
tensor(3988, device='cuda:0')
Green
tensor(382, device='cuda:0')
footprints
tensor(3443, device='cuda:0')
autumn
tensor(1260, device='cuda:0')
bodyboard
tensor(1640, device='cuda:0')
Christ
tensor(216, device='cuda:0')
catc
tensor(2010, device='cuda:0')
glowing
tensor(3641, device='cuda:0')
locking
tensor(4591, device='cuda:0')
bubbles
tensor(1803, device='cuda:0')
collar
tensor(2278, device='cuda:0')
acrobatics
tensor(981, device='cuda:0')
Time
tensor(861, device='cuda:0')
array
tensor(1189, device='cuda:0')
situated rodent cheered window enclosure checker kite hind Green footprints autumn bodyboard Christ catc glowing locking bubbles collar acrobatics Time array
