In [1]:
# Codeblock 1
import torch  #(1)
import torch.nn as nn  #(2)
import torchvision.models as models  #(3)
from torchvision.models import GoogLeNet_Weights  #(4)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Codeblock 2
# --- Model dimensions -------------------------------------------------------
# Width shared by the encoder's final linear layer, the token embeddings and the LSTM input.
EMBED_DIM = 512  #(1)
# Size of the LSTM hidden state.
LSTM_HIDDEN_DIM = 512  #(2)
# Number of stacked LSTM layers.
NUM_LSTM_LAYERS = 1  #(3)

# --- Dummy image batch ------------------------------------------------------
# Height and width of the random test images.
IMAGE_SIZE = 224  #(4)
# Channel count of the random test images.
IN_CHANNELS = 3  #(5)

# --- Captions ---------------------------------------------------------------
# Length of the random test captions; also the iteration cap used by generate().
SEQ_LENGTH = 30  #(6)
# Number of distinct token ids (embedding rows / output scores per step).
VOCAB_SIZE = 10_000  #(7)

# Number of samples in every dummy batch built below.
BATCH_SIZE = 1

In [3]:
# Codeblock 3
# Build GoogLeNet with default arguments (no pretrained weights requested) purely so
# that its layer structure is shown as the cell output; the instance is not kept.
models.googlenet()



GoogLeNet(
  (conv1): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception3a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)

In [9]:
# Codeblock 4a
class InceptionEncoder(nn.Module):
    """Image encoder: ImageNet-pretrained GoogLeNet with a new EMBED_DIM-wide head.

    The network's original ``fc`` layer is swapped for a fresh ``nn.Linear`` whose
    output width is ``EMBED_DIM``. That new layer is always trainable; whether the
    rest of the network is trainable is controlled by ``fine_tune``.
    """

    def __init__(self, fine_tune):  #(1)
        """Load the pretrained backbone and set which parameters receive gradients.

        Args:
            fine_tune: Truthy to train every GoogLeNet parameter; falsy to freeze
                everything except the replacement ``fc`` layer.
        """
        super().__init__()
        self.googlenet = models.googlenet(weights=GoogLeNet_Weights.IMAGENET1K_V1)  #(2)
        self.googlenet.fc = nn.Linear(in_features=self.googlenet.fc.in_features,  #(3)
                                      out_features=EMBED_DIM)  #(4)

        # Apply the flag to the whole network in one call ...
        # NOTE(review): requires_grad=False only stops gradient updates. The BatchNorm
        # layers still refresh their running statistics while the module is in train
        # mode -- confirm whether a fully frozen backbone is expected when fine_tune
        # is falsy.
        self.googlenet.requires_grad_(bool(fine_tune))  #(5)

        # ... then re-enable the new head, which has to be learned in either case.
        self.googlenet.fc.requires_grad_(True)

# Codeblock 4b
    def forward(self, images):
        """Run ``images`` through GoogLeNet and return its output.

        In the recorded run an input of size (1, 3, 224, 224) produced an output of
        size (1, 512), i.e. one EMBED_DIM-wide vector per image.
        """
        return self.googlenet(images)

In [5]:
# Codeblock 5
# Smoke-test the encoder on a single random batch.
# NOTE(review): forward() contains no active print calls, so the shape lines recorded
# under this cell will not reappear when the notebook is re-run.
inception_encoder = InceptionEncoder(fine_tune=True)

# Random stand-in for an image batch: (BATCH_SIZE, IN_CHANNELS, IMAGE_SIZE, IMAGE_SIZE).
images = torch.randn(BATCH_SIZE, IN_CHANNELS, IMAGE_SIZE, IMAGE_SIZE)
features = inception_encoder(images)

original	: torch.Size([1, 3, 224, 224])
after googlenet	: torch.Size([1, 512])


In [10]:
# Codeblock 6a
class LSTMDecoder(nn.Module):
    """Caption decoder: token embedding -> LSTM -> per-step vocabulary scores.

    All layer sizes come from the module-level constants VOCAB_SIZE, EMBED_DIM,
    LSTM_HIDDEN_DIM and NUM_LSTM_LAYERS.
    """

    def __init__(self):
        super().__init__()

        #(1) Lookup table mapping each token id to an EMBED_DIM-wide vector.
        self.embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)
        #(2) Recurrent core; batch_first=True means inputs are (batch, step, feature).
        self.lstm = nn.LSTM(EMBED_DIM, LSTM_HIDDEN_DIM, NUM_LSTM_LAYERS, batch_first=True)
        #(3) Projects every LSTM output vector onto VOCAB_SIZE scores.
        self.linear = nn.Linear(LSTM_HIDDEN_DIM, VOCAB_SIZE)

# Codeblock 6b
    def forward(self, features, captions):  #(1)
        """Score the vocabulary at every step of the image-prefixed caption.

        Args:
            features: One vector per sample; it is given a step axis and placed in
                front of the caption embeddings, so its width must match theirs.
            captions: Integer token ids, one row per sample.

        Returns:
            Tensor with one more step than ``captions`` (the image vector occupies
            the first step) and VOCAB_SIZE scores per step. The recorded run maps
            features (1, 512) and captions (1, 30) to an output of (1, 31, 10000).
        """
        image_step = features.unsqueeze(1)  #(2) add the step axis
        token_vectors = self.embedding(captions)  #(3) ids -> vectors
        decoder_input = torch.cat((image_step, token_vectors), dim=1)  #(4) image first, then tokens
        hidden_states, _ = self.lstm(decoder_input)  #(5) final (h, c) state is not needed here
        return self.linear(hidden_states)  #(6)

In [7]:
# Codeblock 7
# Smoke-test the decoder on random inputs.
# NOTE(review): forward() contains no active print calls, so the shape lines recorded
# under this cell will not reappear when the notebook is re-run.
lstm_decoder = LSTMDecoder()

# Random stand-in for the encoder output: (BATCH_SIZE, EMBED_DIM).
features = torch.randn(BATCH_SIZE, EMBED_DIM)  #(1)
# Random token ids drawn from [0, VOCAB_SIZE): (BATCH_SIZE, SEQ_LENGTH).
captions = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LENGTH))  #(2)

# From here on `captions` holds the decoder output, no longer the token ids.
captions = lstm_decoder(features, captions)

features original	: torch.Size([1, 512])
after unsqueeze		: torch.Size([1, 1, 512])
captions original	: torch.Size([1, 30])
after embedding		: torch.Size([1, 30, 512])
after concat		: torch.Size([1, 31, 512])
after lstm		: torch.Size([1, 31, 512])
after linear		: torch.Size([1, 31, 10000])


In [8]:
# Codeblock 8a
class ShowAndTell(nn.Module):
    """Captioning model that feeds the CNN encoder's output into the LSTM decoder.

    ``forward`` scores a whole caption in one pass (the path used with known
    captions); ``generate`` decodes greedily, one token at a time, from the image
    alone. Both methods print the intermediate tensor shapes.
    """

    def __init__(self):
        super().__init__()
        self.encoder = InceptionEncoder(fine_tune=True)  #(1)
        self.decoder = LSTMDecoder()  #(2)

    def forward(self, images, captions):
        """Encode ``images`` and score every step of ``captions``.

        Args:
            images: Image batch handed to the encoder.
            captions: Integer token ids handed to the decoder together with the
                encoder output.

        Returns:
            Whatever the decoder returns for (encoder output, captions); in the
            recorded run a tensor of size (1, 31, 10000).
        """
        features = self.encoder(images)  #(3)
        print(f"after encoder\t: {features.shape}")

        scores = self.decoder(features, captions)  #(4)
        print(f"after decoder\t: {scores.shape}")

        return scores

# Codeblock 8b
    def generate(self, images, max_length=None, eos_token=1):  #(1)
        """Greedily decode a token sequence for a single image.

        The encoder output is the first LSTM input; afterwards each step feeds the
        embedding of the previously chosen token. The LSTM ``(h, c)`` state is
        carried from one step to the next, so every prediction depends on the image
        and on all tokens produced so far, as in ``forward`` where the LSTM runs
        over the whole sequence in one call.

        Args:
            images: Image batch containing exactly one image (the chosen token is
                read with ``.item()``, which needs a one-element tensor).
            max_length: Upper bound on the number of generated tokens. ``None``
                (default) uses the module-level ``SEQ_LENGTH``.
            eos_token: Token id that ends generation once it is produced; it is
                included in the result. Defaults to 1.

        Returns:
            list[int]: The generated token ids, in order.
        """
        if max_length is None:
            max_length = SEQ_LENGTH

        step_input = self.encoder(images)  #(2)
        print(f"after encoder\t\t: {step_input.shape}\n")

        words = []  #(3)
        state = None  # None makes the LSTM start from a zero state on the first step
        for i in range(max_length):  #(4)
            print(f"iteration #{i}")
            step_input = step_input.unsqueeze(1)  # add the step axis expected by the LSTM
            print(f"after unsqueeze\t\t: {step_input.shape}")

            # Pass the previous state in and keep the new one; without this every
            # step would restart from a zero state.
            lstm_out, state = self.decoder.lstm(step_input, state)
            print(f"after lstm\t\t: {lstm_out.shape}")

            lstm_out = lstm_out.squeeze(1)  #(5)
            print(f"after squeeze\t\t: {lstm_out.shape}")

            scores = self.decoder.linear(lstm_out)  #(6)
            print(f"after linear\t\t: {scores.shape}")

            _, word = scores.max(dim=1)  #(7) greedy choice: highest-scoring token
            print(f"after max\t\t: {word.shape}")

            token = word.item()
            words.append(token)  #(8)

            if token == eos_token:  #(9)
                break

            step_input = self.decoder.embedding(word)  #(10) chosen token becomes the next input
            print(f"after embedding\t\t: {step_input.shape}\n")

        return words  #(11)

In [11]:
# Codeblock 9
# Run the full model once on random inputs; forward() prints the encoder and decoder
# output shapes.
show_and_tell = ShowAndTell()  #(1)

# Random stand-ins for an image batch and its token-id captions.
images = torch.randn(BATCH_SIZE, IN_CHANNELS, IMAGE_SIZE, IMAGE_SIZE)  #(2)
captions = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LENGTH))  #(3)

# From here on `captions` holds the model output, no longer the token ids.
captions = show_and_tell(images, captions)

after encoder	: torch.Size([1, 512])
after decoder	: torch.Size([1, 31, 10000])


In [12]:
# Codeblock 10
# Put the model in evaluation mode before generating.
show_and_tell.eval()  #(1)

# Fresh random stand-in for an image batch.
images = torch.randn(BATCH_SIZE, IN_CHANNELS, IMAGE_SIZE, IMAGE_SIZE)  #(2)

# Gradients are not needed for generation, so run it with autograd disabled.
with torch.no_grad():
    generated_tokens = show_and_tell.generate(images)  #(3)

after encoder		: torch.Size([1, 512])

iteration #0
after unsqueeze		: torch.Size([1, 1, 512])
after lstm		: torch.Size([1, 1, 512])
after squeeze		: torch.Size([1, 512])
after linear		: torch.Size([1, 10000])
after max		: torch.Size([1])
after embedding		: torch.Size([1, 512])

iteration #1
after unsqueeze		: torch.Size([1, 1, 512])
after lstm		: torch.Size([1, 1, 512])
after squeeze		: torch.Size([1, 512])
after linear		: torch.Size([1, 10000])
after max		: torch.Size([1])
after embedding		: torch.Size([1, 512])

iteration #2
after unsqueeze		: torch.Size([1, 1, 512])
after lstm		: torch.Size([1, 1, 512])
after squeeze		: torch.Size([1, 512])
after linear		: torch.Size([1, 10000])
after max		: torch.Size([1])
after embedding		: torch.Size([1, 512])

iteration #3
after unsqueeze		: torch.Size([1, 1, 512])
after lstm		: torch.Size([1, 1, 512])
after squeeze		: torch.Size([1, 512])
after linear		: torch.Size([1, 10000])
after max		: torch.Size([1])
after embedding		: torch.Size([1, 512])

In [13]:
# Codeblock 11
# Display the token ids returned by generate(). The recorded run lists 30 ids
# (= SEQ_LENGTH) and none of them is 1, so generation ran to the length cap.
generated_tokens

[2828,
 5801,
 5382,
 4926,
 3579,
 833,
 9661,
 6256,
 3419,
 276,
 6689,
 3184,
 4992,
 7979,
 1226,
 3854,
 6157,
 6445,
 4779,
 9700,
 5771,
 1300,
 9261,
 1996,
 1248,
 9741,
 8187,
 3596,
 4969,
 4950]