In [1]:
# !pip install spacy

In [3]:
import spacy

In [4]:
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_sm

In [5]:
spacy_english = spacy.load('en_core_web_sm')

In [6]:
text = "This is a good place to find a city"

In [7]:
[token.text.lower() for token in spacy_english.tokenizer(text)]

['this', 'is', 'a', 'good', 'place', 'to', 'find', 'a', 'city']

### Defining a Vocabulary class

In [8]:
from collections import Counter

In [9]:
class Vocab:
    """Word-level vocabulary mapping lower-cased tokens to integer ids.

    Ids 0-3 are reserved for the special tokens ``<pad>``, ``<start>``,
    ``<end>`` and ``<unk>``. Every other word receives the next free id once
    it has been seen often enough by :meth:`build_dictionary`.

    Args:
        freq_count: Minimum number of occurrences (counted within a single
            ``build_dictionary`` call) before a word is added.
        tokenizer: Callable that takes a string and returns an iterable of
            token objects exposing a ``.text`` attribute.
    """

    def __init__(self,freq_count,tokenizer):
        self.word2idx = {"<pad>":0,"<start>":1,"<end>":2,"<unk>":3}
        self.idx2word = {v:k  for k,v in self.word2idx.items()}
        self.tokenizer = tokenizer
        self.freq_threshold = freq_count

    def __len__(self):
        """Return the number of known words, special tokens included."""
        return len(self.word2idx)

    def tokenize_text(self,text):
        """Split ``text`` with the configured tokenizer and lower-case each token."""
        return [token.text.lower() for token in self.tokenizer(text)]

    def build_dictionary(self,sentences:list):
        """Add every sufficiently frequent word of ``sentences`` to the vocabulary.

        Words are counted afresh on every call. A word is added the moment its
        count reaches ``freq_threshold``. Words that are already known
        (including the special tokens) keep their existing id, so the method
        can safely be called more than once.

        Args:
            sentences: List of raw sentence strings.
        """
        word_count = Counter()

        for sent in sentences:
            for word in self.tokenize_text(sent):
                word_count[word]+=1

                # Never re-index a known word: doing so would orphan its old id
                # in idx2word and could overwrite a reserved special token.
                if word in self.word2idx:
                    continue
                if word_count[word] < self.freq_threshold:
                    continue

                # Ids are contiguous, so the current size is the next free id.
                index = len(self.word2idx)
                self.word2idx[word]=index
                self.idx2word[index]=word

    def encode_text(self,text):
        """Convert ``text`` to a list of ids, using ``<unk>`` for unknown words."""
        unk_id = self.word2idx['<unk>']
        return [self.word2idx.get(word, unk_id) for word in self.tokenize_text(text)]
    
                
        

### Reading Dataset

In [10]:
import pandas as pd
import numpy as np

In [11]:
data = pd.read_csv('captions.txt')

In [12]:
data.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [13]:
vocab =Vocab(1,spacy_english.tokenizer)

In [14]:
vocab.build_dictionary(list(data['caption']))

In [15]:
len(vocab)

8508

In [16]:
vocab.encode_text("Heyllo I am daniyal")

[3, 458, 3460, 3]

### Reading Image

In [17]:
from PIL import Image

In [18]:
image_path = './Images'

In [19]:
image = data['image'][0]

In [20]:
image

'1000268201_693b08cb0e.jpg'

In [21]:
img = Image.open(image_path+"/"+image).convert("RGB")

In [22]:
img.show()

### Splitting the Dataset

In [23]:
# Shuffle the caption rows once with a fixed seed, then cut at the 60% / 80%
# marks. Positional slicing yields the same three frames as np.split without
# routing a DataFrame through NumPy's array-splitting machinery.
# NOTE(review): rows are split per caption, and data.head() shows several
# captions for one image, so the same image can appear in more than one split.
shuffled_data = data.sample(frac=1, random_state=42)
train_end, validate_end = int(.6*len(data)), int(.8*len(data))

train = shuffled_data.iloc[:train_end]
validate = shuffled_data.iloc[train_end:validate_end]
test = shuffled_data.iloc[validate_end:]

In [24]:
len(train),len(validate),len(test),len(data)

(24273, 8091, 8091, 40455)

### Creating a PyTorch Dataset

In [25]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as T

In [26]:
def pad_sequence(sent,max_len,pad_token):
    """Return a copy of ``sent`` that is exactly ``max_len`` items long.

    Shorter sequences are right-padded with ``pad_token``; longer sequences
    are truncated so that every result can be stacked into one tensor.
    The input sequence is left unmodified.

    Args:
        sent: Sequence of token ids.
        max_len: Target length (negative values are treated as 0).
        pad_token: Value appended to fill the sequence up to ``max_len``.

    Returns:
        A new list of length ``max_len``.
    """
    target_len = max(max_len, 0)

    # Slice first so the caller's list is never extended in place.
    padded = list(sent[:target_len])
    padded.extend([pad_token] * (target_len - len(padded)))

    return padded
    

In [27]:
class CaptionDataset(Dataset):
    """Dataset yielding ``(image, target_caption, input_caption)`` triples.

    Args:
        image_path: Directory that contains the image files.
        df: DataFrame with an ``image`` (file name) and a ``caption`` column.
        transform: Callable applied to the loaded RGB PIL image.
        vocab: Vocabulary providing ``word2idx`` and ``encode_text``.
        max_len: Fixed length of both returned caption tensors.
    """

    def __init__(self,image_path,df,transform,vocab,max_len):
        super().__init__()

        self.image_path = image_path
        self.df = df
        self.transform = transform
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of (image, caption) rows."""
        return len(self.df)

    def __getitem__(self,idx):
        """Load row ``idx``.

        Returns:
            A tuple ``(image, text, train_caption)`` where ``image`` is the
            transformed picture, ``text`` is ``<start> tokens <end>`` and
            ``train_caption`` is ``<start> tokens``; both captions are padded
            with ``<pad>`` to ``max_len`` and returned as ``torch.long`` tensors.
        """
        data_item = self.df.iloc[idx]

        # The context manager releases the file handle as soon as the pixel
        # data has been converted, instead of waiting for garbage collection.
        with Image.open(self.image_path + "/" + data_item['image']) as raw_image:
            image = raw_image.convert("RGB")

        image = self.transform(image)

        start_id = self.vocab.word2idx['<start>']
        end_id = self.vocab.word2idx['<end>']
        pad_id = self.vocab.word2idx['<pad>']

        # Reserve two positions for <start>/<end> so an over-long caption can
        # never yield a tensor longer than max_len (which would break batching).
        tokens = self.vocab.encode_text(data_item['caption'])[:max(self.max_len - 2, 0)]

        train_caption = [start_id] + tokens
        text = train_caption + [end_id]

        text = pad_sequence(text,self.max_len,pad_id)
        train_caption = pad_sequence(train_caption,self.max_len,pad_id)

        return image,torch.tensor(text,dtype=torch.long),\
                torch.tensor(train_caption,dtype=torch.long)
        
        

In [28]:
# Image preprocessing: resize, take a random 224-pixel crop, convert to a
# tensor and normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics
# for pretrained torchvision models - confirm.
# NOTE(review): this pipeline, random crop included, is also passed to the
# validation and test datasets below.
transform = T.Compose(
    [
        T.Resize(226),
        T.RandomCrop(224),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)

In [29]:
train_dataset= CaptionDataset(image_path,
                              train,
                              transform,
                              vocab,
                              42
)

In [30]:
valid_dataset= CaptionDataset(image_path,
                              validate,
                              transform,
                              vocab,
                              42
)

In [31]:
test_dataset= CaptionDataset(image_path,
                              test,
                              transform,
                              vocab,
                              42
)

In [32]:
image,text,train_caption = train_dataset[0]

In [33]:
image.size(),text.size(),train_caption.size()

(torch.Size([3, 224, 224]), torch.Size([42]), torch.Size([42]))

### Creating DataLoaders for PyTorch Training

In [34]:
BATCH_SIZE = 32

In [35]:
# Reshuffle the training samples every epoch so batches are not presented in
# the same fixed order each time; evaluation loaders keep a stable order.
train_dataloader = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
valid_dataloader = DataLoader(valid_dataset,batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset,batch_size=BATCH_SIZE)


## Creating a Model

In [36]:
import os
import numpy as np
import torch
import torch.nn as nn
import torchvision

In [37]:
model_res = torchvision.models.resnet50(pretrained = True)



In [38]:
# list(model_res.children())[:-1]

In [39]:
class Encoder(nn.Module):
    """Image encoder: frozen pretrained ResNet-50 followed by a trainable projection.

    The classification head of the ResNet is dropped; its pooled features are
    mapped to ``embedding_size`` dimensions by a linear layer and batch-normalised.

    Args:
        embedding_size: Size of the feature vector returned for every image.
    """

    def __init__(self,embedding_size):
        super(Encoder,self).__init__()

        pretrained = torchvision.models.resnet50(pretrained = True)
        # Freeze the backbone: only the projection head below is trained.
        for param in pretrained.parameters():
            param.requires_grad_(False)

        # Keep every child except the final fully connected classifier.
        self.resnet = nn.Sequential(*(list(pretrained.children())[:-1]))
        self.features = nn.Linear(pretrained.fc.in_features,embedding_size)
        self.batch_norm = nn.BatchNorm1d(embedding_size)

    def train(self,mode=True):
        """Switch the projection head between train/eval; keep the backbone in eval.

        ``requires_grad_(False)`` does not stop BatchNorm layers from updating
        their running statistics in training mode, so the frozen ResNet is
        pinned to eval mode to keep its features fixed.
        """
        super().train(mode)
        self.resnet.eval()
        return self

    def forward(self,image):
        """Encode a batch of images.

        Args:
            image: Image batch; the first dimension is the batch dimension.

        Returns:
            Tensor of shape ``(batch, embedding_size)``.
        """
        feat = self.resnet(image)
        # Flatten the pooled feature map to (batch, channels).
        feat = feat.view(feat.size(0),-1)
        feat = self.features(feat)
        feat = self.batch_norm(feat)

        return feat
        
    

In [40]:
enc = Encoder(200)

In [41]:
sample = torch.rand((2,3,224,224))
out = enc(sample)
print(out.size())

torch.Size([2, 200])


In [42]:
class Decoder(nn.Module):
    """LSTM caption decoder conditioned on an image feature vector.

    Args:
        embed: Size of the word embeddings (must equal the image feature size,
            because the feature is fed to the LSTM as the first input step).
        hidden: Hidden size of the LSTM.
        vocab_size: Number of entries in the vocabulary / output classes.
        layers: Number of stacked LSTM layers.
    """

    def __init__(self,embed,hidden,vocab_size,layers= 1):
        super(Decoder,self).__init__()

        self.layers = layers
        self.embedding_size = embed
        self.hidden_size = hidden
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(self.vocab_size,self.embedding_size,\
                                      padding_idx =0)

        # forward() builds its input as (batch, seq, embed), so the LSTM must be
        # batch-first; with the default layout the recurrence would run across
        # the samples of a batch instead of across the time steps.
        self.lstm = nn.LSTM(self.embedding_size,self.hidden_size,self.layers,
                            batch_first=True)
        self.final_layer = nn.Linear(self.hidden_size,self.vocab_size)

    def forward(self,feats,train_captions):
        """Compute vocabulary logits for every caption position.

        Args:
            feats: Image features of shape ``(batch, embed)``.
            train_captions: Token ids of shape ``(batch, seq_len)``.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        # (batch, embed) -> (batch, 1, embed): the image acts as the first step.
        feats = feats.unsqueeze(1)
        embeddings = self.embedding(train_captions)
        # Drop the last token embedding so the sequence length stays seq_len
        # after the image feature has been prepended.
        embeddings = torch.cat((feats,embeddings[:,:-1,:]),dim=1)

        h,_ = self.lstm(embeddings)
        final_output = self.final_layer(h)

        return final_output
    

In [43]:
class EncoderDecoder(nn.Module):
    """Image-captioning model combining ``Encoder`` and ``Decoder``.

    Args:
        vocab: Vocabulary size (number of output classes), forwarded to the
            decoder as its ``vocab_size``.
        embedding_size: Size of the image feature and of the word embeddings.
        hidden_dim: Hidden size of the decoder LSTM.
        num_layers: Number of LSTM layers.
    """

    def __init__(self,vocab,embedding_size,hidden_dim,num_layers=1):
        super(EncoderDecoder,self).__init__()

        self.encoder = Encoder(embedding_size)
        self.decoder = Decoder(embedding_size,hidden_dim,vocab,num_layers)

    def forward(self,image,caption):
        """Return the decoder logits for a batch of images and input captions."""
        feat = self.encoder(image)
        out = self.decoder(feat,caption)

        return out

    def predict_caption(self,image,vocab,max_length =40):
        """Greedily decode a caption for a single image.

        Args:
            image: One image tensor without batch dimension.
            vocab: Vocabulary object whose ``idx2word`` maps ids to words.
            max_length: Maximum number of tokens to generate.

        Returns:
            List of predicted token ids; generation stops after ``<end>``.
        """
        caption = []

        # The encoder's BatchNorm1d cannot normalise a single sample in
        # training mode, so switch to eval and restore the previous mode after.
        encoder_was_training = self.encoder.training
        self.encoder.eval()
        model_device = next(self.parameters()).device

        try:
            with torch.no_grad():
                # (1, embed) -> (1, 1, embed): a sequence of one step for one sample.
                var = self.encoder(image.unsqueeze(0).to(model_device)).unsqueeze(0)
                h_c = None

                for _ in range(max_length):
                    hidden_out,h_c = self.decoder.lstm(var,h_c)
                    output = self.decoder.final_layer(hidden_out.squeeze(0))
                    pred = output.argmax(1)
                    token_id = pred.item()
                    caption.append(token_id)

                    if vocab.idx2word.get(token_id) == "<end>":
                        break

                    # Feed the predicted word back in as the next input step.
                    var =self.decoder.embedding(pred).unsqueeze(0)
        finally:
            self.encoder.train(encoder_was_training)

        return caption

In [44]:
# Size the output layer from the vocabulary itself rather than a copied number,
# so it stays correct if the vocabulary is rebuilt.
testing = EncoderDecoder(vocab = len(vocab),embedding_size = 128,
                        hidden_dim = 128,num_layers =1)

In [45]:
# torch.randint excludes the upper bound, so len(vocab) covers every valid id.
sample_image,sample_caption = torch.rand((5,3,224,224)),torch.randint(0,len(vocab),(5,42),dtype=torch.long)

In [46]:
output = testing(sample_image,sample_caption)

### Caption Generation with the Untrained Model (Sanity Check)

In [47]:
image,_,_ = train_dataset[0]

In [48]:
image.size()

torch.Size([3, 224, 224])

In [49]:
caption = testing.predict_caption(image,vocab)

In [50]:
caption = [vocab.idx2word[c] for c in caption]
print(caption)

['pedestrian', 'stringless', 'multistory', 'dandilions', 'hat', 'min', 'great', 'skips', 'confetti', 'intently', 'racquet', 'kneels', 'elder', 'lots', 'rollercoaster', 'elder', 'purchasing', 'board', 'emerging', 'promting', 'shirtness', 'iceskate', 'stood', 'wipes', 'soil', 'nipples', 'skiiers', 'skies', 'divided', 'casque', 'casque', 'teeth', 'sheets', 'waterskis', 'oar', 'midway', 'programs', 'rappeling', 'marketplace', 'waterspouts']


### Training

### Config

In [59]:
learning_rate = 2e-4
embedding_size = 256
hidden_dim = 256
num_layers = 1
epochs = 100
# Padding positions must not contribute to the loss. The padding token is
# stored in the vocabulary under the key '<pad>' (a plain 'pad' key does not exist).
loss_fn = nn.CrossEntropyLoss(ignore_index = vocab.word2idx['<pad>'])

### Model Initialization

In [51]:
# Size the output layer from the vocabulary itself rather than a copied number.
model = EncoderDecoder(vocab = len(vocab),embedding_size = embedding_size,
                        hidden_dim = hidden_dim,num_layers =num_layers)

#### Training Phase

In [68]:
from tqdm import tqdm

In [61]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [62]:
model = model.to(device)

In [60]:
optim = torch.optim.Adam(model.parameters(),lr= learning_rate)

In [71]:
def model_train_fn(model,epochs,dataset,loss_fn,optim,schedular = None):
    """
    Train the model and return the mean training loss of every epoch.

    model: The CNN and LSTM (encoder-decoder) network, called as
        ``model(image, train_text)`` and expected to return logits of shape
        ``(batch, seq_len, vocab_size)``.
    epochs: Number of passes over ``dataset``.
    dataset: Iterable of ``(image, output_text, train_text)`` batches.
    loss_fn: Loss taking ``(logits[N, vocab_size], targets[N])``.
    optim: Optimizer updating the model parameters.
    schedular: Optional learning-rate scheduler; ``step()`` is called once
        after every epoch.

    Returns a list with one float (mean batch loss) per epoch.
    """

    model.train()

    # Run on whatever device the model lives on instead of a notebook global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_values = []

    for i in tqdm(range(epochs)):
        loss_per_batch = []
        for (image,output_text,train_text) in dataset:
            image = image.to(run_device)
            output_text = output_text.to(run_device)
            train_text = train_text.to(run_device)

            output = model(image,train_text)

            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) to match the
            # flattened targets expected by the loss.
            loss = loss_fn(output.reshape(-1,output.shape[2]),output_text.reshape(-1))
            optim.zero_grad()
            loss.backward()
            optim.step()

            loss_per_batch.append(loss.item())

        epoch_loss = float(np.mean(loss_per_batch))
        print('Epoch {}: Train loss: {}'.format(i,epoch_loss))
        loss_values.append(epoch_loss)

        if schedular is not None:
            schedular.step()

    return loss_values
        
    

In [72]:
loss_values = model_train_fn(model,epochs,train_dataloader,loss_fn,optim)

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

tensor(9.0569, grad_fn=<NllLossBackward0>)
tensor(9.0575, grad_fn=<NllLossBackward0>)
tensor(9.0502, grad_fn=<NllLossBackward0>)
tensor(9.0455, grad_fn=<NllLossBackward0>)
tensor(9.0381, grad_fn=<NllLossBackward0>)
tensor(9.0332, grad_fn=<NllLossBackward0>)
tensor(9.0328, grad_fn=<NllLossBackward0>)
tensor(9.0271, grad_fn=<NllLossBackward0>)
tensor(9.0112, grad_fn=<NllLossBackward0>)


  0%|                                                                                          | 0/100 [00:19<?, ?it/s]


KeyboardInterrupt: 