In [1]:
import numpy as np
import pandas as pd
import torch
import time
import os

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.lm import Vocabulary
from PIL import Image

import re
import spacy
import string
from sklearn.model_selection import train_test_split

from torchvision import models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

  warn(


In [2]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Dataset Preprocessing

In [3]:
df = pd.read_csv('./flickr30k_images/results.csv', delimiter='|')
df.head()

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...
1,1000092795.jpg,1,"Two young , White males are outside near many..."
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .
4,1000092795.jpg,4,Two friends enjoy time spent together .


In [4]:
df.columns = ['image_name', 'comment_number', 'comment']

In [5]:
df.dropna(inplace=True)
print(df.isnull().sum())
print(df.duplicated().sum())
print(df.shape)

image_name        0
comment_number    0
comment           0
dtype: int64
0
(158914, 3)


In [6]:
## Preprocessing

df['image_name'] = df['image_name'].str.strip()
df['comment_number'] = df['comment_number'].str.strip()
df['comment'] = df['comment'].str.strip()

def remove_tags(text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', text)
    return cleaned_text

df['comment'] = df['comment'].apply(remove_tags)
df['comment'] = df['comment'].str.lower()

exclude = string.punctuation

def remove_punctuations(text):
    return text.translate(str.maketrans('', '', exclude))

df['comment'] = df['comment'].apply(remove_punctuations)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df['comment'] = df['comment'].apply(remove_numbers)

print(df.shape)
df.head()

(158914, 3)


Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,two young guys with shaggy hair look at their ...
1,1000092795.jpg,1,two young white males are outside near many b...
2,1000092795.jpg,2,two men in green shirts are standing in a yard
3,1000092795.jpg,3,a man in a blue shirt standing in a garden
4,1000092795.jpg,4,two friends enjoy time spent together


In [7]:
df['comment'][1]

'two young  white males are outside near many bushes '

### DataLoader

In [8]:
spacy_eng = spacy.load("en_core_web_sm")

In [9]:
class CustomVocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0:"<PAD>", 1:"<SOS>", 2:"<EOS>", 3:"<UNK>"}
        self.stoi = {"<PAD>":0, "<SOS>":1, "<EOS>":2, "<UNK>":3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        all_tokens = [token for sentence in sentence_list for token in self.tokenizer_eng(sentence)]

        # Create a Vocabulary object from nltk.lm with the tokens and frequency threshold
        vocab = Vocabulary(all_tokens, unk_cutoff=self.freq_threshold)

        # Create mappings from word to index and index to word
        idx = 4  # Starting index after predefined tokens
        for word in vocab:
            if word not in self.stoi:  # Avoid overwriting special tokens
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        return [self.stoi.get(token, self.stoi["<UNK>"]) for token in tokenized_text]

In [10]:
vocab = CustomVocabulary(freq_threshold=8)
all_comments = df['comment'].tolist()
vocab.build_vocabulary(all_comments)

In [11]:
class FlickrDataset(Dataset):
    def __init__(self, root_dir, df, vocab, transform=None):
        self.root_dir = root_dir
        self.df = df
        self.vocab = vocab
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.df['image_name'][idx])
        image = Image.open(img_path).convert('RGB')

        comment = self.df['comment'][idx]
        
        numericalized_comment = [self.vocab.stoi["<SOS>"]]
        numericalized_comment += self.vocab.numericalize(comment)
        numericalized_comment.append(self.vocab.stoi["<EOS>"])

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(numericalized_comment)

In [12]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        images = [item[0].unsqueeze(0) for item in batch]
        images = torch.cat(images, dim=0)
        comments = [item[1] for item in batch]
        comments = pad_sequence(comments, batch_first=False, padding_value=self.pad_idx)
        
        return images, comments

In [13]:
def get_loader(root_dir, df, vocab, transform=None, batch_size=32, shuffle=False, drop_last=True):
    dataset = FlickrDataset(root_dir, df, vocab, transform)
    pad_idx = vocab.stoi["<PAD>"]
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=MyCollate(pad_idx))
    return loader

In [14]:
df = df[['image_name', 'comment']]

train_ratio = 0.9

train_df, test_df = train_test_split(df, test_size=1-train_ratio, random_state=42)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

print(train_df.shape, test_df.shape)

(143022, 2) (15892, 2)


In [15]:
train_df.tail()

Unnamed: 0,image_name,comment
143017,4827151208.jpg,a young man wearing a light blue tank top and ...
143018,4548479186.jpg,a woman in white with a big white flower in he...
143019,519061891.jpg,a young child splashes in a green and yellow w...
143020,6907188365.jpg,a man makes a diving catch during a game of ul...
143021,4859164621.jpg,people use phone banks outdoors


In [16]:
transform = transforms.Compose(
    [
        transforms.Resize((356, 356)),
        transforms.RandomCrop((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]
)

In [17]:
train_loader = get_loader('./flickr30k_images/flickr30k_images/', train_df, vocab, transform=transform, batch_size=64, shuffle=True, drop_last=True)
test_loader = get_loader('./flickr30k_images/flickr30k_images/', test_df, vocab, transform=transform, batch_size=64, shuffle=False, drop_last=False)

print(len(train_loader))
print(len(test_loader))

2234
249


In [18]:
for images, comments in train_loader:
    print(images.shape)
    print(comments.shape)
    break

print('-----------------')

for images, comments in test_loader:
    print(images.shape)
    print(comments.shape)
    break

torch.Size([64, 3, 299, 299])
torch.Size([60, 64])
-----------------
torch.Size([64, 3, 299, 299])
torch.Size([29, 64])


### Model Architecture

In [25]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size, train_CNN=False):
        super(EncoderCNN, self).__init__()
        self.train_CNN = train_CNN
        self.inception = models.inception_v3(pretrained=True, aux_logits=True)
        self.inception.fc = nn.Linear(self.inception.fc.in_features, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, images):
        features = self.inception(images)
        
        for name, param in self.inception.named_parameters():
            if "fc.weight" in name or "fc.bias" in name:
                param.requires_grad = True
            else:
                param.requires_grad = self.train_CNN
                
        return self.dropout(self.relu(features[0]))

In [26]:
class DecoderRNN(nn.Module):
    
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, features, captions):
        embedings = self.dropout(self.embed(captions))
        embedings = torch.cat((features.unsqueeze(0), embedings), dim=0)
        hidden, _ = self.lstm(embedings)
        outputs = self.linear(hidden)
        return outputs

In [27]:

class CNNtoRNN(nn.Module):
    
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(CNNtoRNN, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
        
    def forward(self, images, captions):
        featured = self.encoderCNN(images)
        outputs = self.decoderRNN(featured, captions)
        return outputs
    
    def caption_image(self, image, vocabulary, max_length=50):
        result_caption = []
        with torch.no_grad():
            x = self.encoderCNN(image).unsqueeze(0)
            states = None
            
            for _ in range(max_length):
                hiddens, states = self.decoderRNN.lstm(x, states)
                output = self.decoderRNN.linear(hiddens).squeeze(0)
                predicted = output.argmax(1)
                
                result_caption.append(predicted.item())
                x = self.decoderRNN.embed(predicted).unsqueeze(0)
                
                if vocabulary.itos[predicted.item()] == "<EOS>":
                    break
                    
        return [vocabulary.itos[idx] for idx in result_caption]

### Training & Evaluation

In [28]:
## Hyperparameters

embed_size = 256
hidden_size = 256
vocab_size = len(vocab)
num_layers=1
learning_rate = 3e-4
NUM_EPOCHS = 100

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cpu


In [29]:
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [34]:
model.train()

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    step = 0
    for idx, (imgs, captions) in enumerate(train_loader):
        print(idx)
        imgs = imgs.to(DEVICE)
        captions = captions.to(DEVICE)
        
        outputs = model(imgs, captions[:-1])
        loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1))
        
        optimizer.zero_grad()
        loss.backward(loss)
        optimizer.step()
        
        total_loss += loss.item()
        step+=1
        
        print(f'Epoch: {epoch}, Step: {idx}, Loss: {total_loss/step}')
        if (idx+1)%10 == 0:
            break
        
    
    print(f'Epoch: {epoch}, Step: {idx}, Loss: {total_loss/step}')
    break
        

0
Epoch: 0, Step: 0, Loss: 8.50615119934082
1
Epoch: 0, Step: 1, Loss: 8.496849060058594
2
Epoch: 0, Step: 2, Loss: 8.495614051818848
3
Epoch: 0, Step: 3, Loss: 8.480931282043457
4
Epoch: 0, Step: 4, Loss: 8.471000671386719
5
Epoch: 0, Step: 5, Loss: 8.45645840962728
6
Epoch: 0, Step: 6, Loss: 8.43936402457101
7
Epoch: 0, Step: 7, Loss: 8.42539393901825
8
Epoch: 0, Step: 8, Loss: 8.409725189208984
9
Epoch: 0, Step: 9, Loss: 8.392852210998536
Epoch: 0, Step: 9, Loss: 8.392852210998536
