### Building Custom Dataset for Flickr

**Date:** 1/11/2021  
**Author:** Murad Popattia

![image.png](attachment:41a1b5ab-daba-4e5c-9399-5bff63a29831.png)

In [152]:
# import libraries
import torch
import torch.nn as nn # all the layers
import torch.optim as optim # all the optimization algos
import torch.nn.functional as F # all the activation functions
from torch.utils.data import DataLoader, Dataset # helps to create mini-batches etc.
import torchvision # for importing models
import torchvision.transforms as transforms # for transformations on the dataset
from torchvision import utils
import matplotlib.pyplot as plt
import numpy as np
import os
import spacy
import tensorflow as tf
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"

- We want to convert text -> numerical values
- We ned a Vocabulary mapping each word to a index
- We need to setup a PyTorch dataset to load the data
- Setup padding of every batch (all examples should be of some seq_len and setup dataloader

In [138]:
# Download with: python -m spacy download en
spacy_eng = spacy.load('en_core_web_sm') # only 'en' is depricated

In [230]:
class Vocabulary:
    # freq_threhsold is to drop for words occuring less than a threshold
    def __init__(self, freq_threshold):
        
        self.freq_threshold = freq_threshold
        
        # defining dicts for these
        self.itos = {0: "<p>", 1:"<s>", 2:"<e>", 3:"<unk>"}
        self.stoi = {"<p>": 0, "<s>":1 , "<e>":2, "<unk>":3}
        
    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def tokenizer_engine(text):
        
        """
        Sample input: I love computer science
        Sample output: ['i', 'love', 'computer', 'science']
        """
        
        # tokenize using the spacy engine, which does more complex stuff of catering to 's etc.
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    
    def build_vocabulary(self, sent_list):
        freq = {}
        idx = 4 # this is because we have used till 3 in the previous dict
        
        for sent in sent_list:
            for word in self.tokenizer_engine(sent):
                if word not in freq:
                    freq[word] = 1
                else:
                    freq[word] += 1
                
                # in case it matching the freq
                if freq[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    
                    idx += 1
    
    # convert text to numerical values
    def numericalize(self, text):
        tokenized_text = self.tokenizer_engine(text)
        
        return[
            self.stoi[token] if token in self.stoi else self.stoi['<unk>']
            for token in tokenized_text
        ]
    
    # convert tokens to words
    def seq_to_text(self, seq):
        if type(seq[0]) == type(1):
            return [
                self.itos[token] if token in self.itos else self.itos[3]
                for token in seq
            ]
        else:
            return [
                [self.itos[token] if token in self.itos else self.itos[3]
                 for token in s
                ]for s in seq
            ]

In [140]:
class FlickrDataset(Dataset): # inherit from Datset
    def __init__(self, root_dir, caps_file, transform=None, freq_threshold=5):
        self.root_dir = root_dir
        self.df = pd.read_csv(caps_file)
        self.transform = transform
        
        # get image, captions cols
        self.imgs = self.df["image"]
        self.captions = self.df["caption"]
        
        # initialize vocab and build it
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.to_list())
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        caption = self.captions[index]
        img_id = self.imgs[index]
        
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
        
        # transforming the image if possible
        if self.transform is not None:
            img = self.transform(img)
            
        numericalized_caption = [self.vocab.stoi["<s>"]]
        numericalized_caption = self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<e>"])
        
        return img, torch.tensor(numericalized_caption)

### Checking the Vocabulary class

In [225]:
sent = "My name is murad"
sent1 = "My name is mansoor"
sent2 = "My mother's name is nilofer"

sent_list = [sent, sent1, sent2]

vocab = Vocabulary(2)
vocab.build_vocabulary(sent_list)

print(vocab.stoi)

print(vocab.numericalize(sent))
print(vocab.numericalize(sent1))

sent_num = vocab.numericalize(sent)
print(vocab.seq_to_text(sent_num))

sent_num = vocab.numericalize(sent1)
print(vocab.seq_to_text(sent_num))

print(vocab.seq_to_text([vocab.numericalize(sent), vocab.numericalize(sent1)]))

{'<p>': 0, '<s>': 1, '<e>': 2, '<unk>': 3, 'my': 4, 'name': 5, 'is': 6}
[4, 5, 6, 3]
[4, 5, 6, 3]
['my', 'name', 'is', '<unk>']
['my', 'name', 'is', '<unk>']
[['my', 'name', 'is', '<unk>'], ['my', 'name', 'is', '<unk>']]


We can also use keras text preprocessing for the same task

### Making use of keras.preprocessing.text

In [142]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [143]:
train_data = [
  "I enjoy coffee.",
  "I enjoy tea.",
  "I dislike milk.",
  "I am going to the supermarket later this morning for some coffee."
]

test_data = [
  "Enjoy coffee this morning.",
  "I enjoy going to the supermarket.",
  "Want some milk for your coffee?"
]

tokenizer = Tokenizer(num_words = 1000, oov_token="<unk>")
tokenizer.fit_on_texts(train_data)

print(tokenizer.word_index)
print(tokenizer.word_counts)

{'<unk>': 1, 'i': 2, 'enjoy': 3, 'coffee': 4, 'tea': 5, 'dislike': 6, 'milk': 7, 'am': 8, 'going': 9, 'to': 10, 'the': 11, 'supermarket': 12, 'later': 13, 'this': 14, 'morning': 15, 'for': 16, 'some': 17}
OrderedDict([('i', 4), ('enjoy', 2), ('coffee', 2), ('tea', 1), ('dislike', 1), ('milk', 1), ('am', 1), ('going', 1), ('to', 1), ('the', 1), ('supermarket', 1), ('later', 1), ('this', 1), ('morning', 1), ('for', 1), ('some', 1)])


In [144]:
# Encode training data sentences into sequences
tokenizer.texts_to_sequences(train_data)

[[2, 3, 4], [2, 3, 5], [2, 6, 7], [2, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 4]]

In [145]:
# Pad the training sequences
train_padded = pad_sequences(train_sequences, padding="post", maxlen=15)
print(train_padded)

[[ 2  3  4  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  3  5  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  6  7  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  8  9 10 11 12 13 14 15 16 17  4  0  0  0]]


In [146]:
tokenizer.sequences_to_texts(train_padded)

['i enjoy coffee <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>',
 'i enjoy tea <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>',
 'i dislike milk <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>',
 'i am going to the supermarket later this morning for some coffee <unk> <unk> <unk>']

In [147]:
test_seq = tokenizer.texts_to_sequences(test_data)
tokenizer.sequences_to_texts(test_seq)

['enjoy coffee this morning',
 'i enjoy going to the supermarket',
 '<unk> some milk for <unk> coffee']

### Collating sequences (padding)

In [180]:
from torch.nn.utils.rnn import pad_sequence

class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch] # batch return img, caps. Moreover unsqueeze is done to include batch dim
        imgs = torch.cat(imgs, dim=0)
        
        # padding the text before sending
        targets = [item[1] for item in batch]
         
        # batch_first means that we want to set batch as first dimension or not
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad_idx)
        
        return imgs, targets

In [231]:
def get_loader(root_folder, annotation_file, transform, batch_size=32):
    dataset = FlickrDataset(root_folder, annotation_file, transform)
    pad_idx = dataset.vocab.stoi["<p>"]
    
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, collate_fn=MyCollate(pad_idx=pad_idx))
    
    return dataset, loader

In [232]:
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor()
])

dataset, dataloader = get_loader('../datasets/flickr8k/Images/', annotation_file='../datasets/flickr8k/captions.txt', transform=transform)

### Validating the approach

In [250]:
for idx, (img, captions) in enumerate(dataloader):
    if (idx == 1):
        break
    print(img.shape)
    print(captions.shape)
    
    for cap in dataset.vocab.seq_to_text(captions.numpy()):
        print(cap)

torch.Size([32, 3, 224, 224])
torch.Size([32, 22])
['a', 'man', 'helping', 'a', 'boy', 'ride', 'a', 'bicycle', '.', '<e>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>']
['two', 'men', 'handle', 'a', 'grey', 'dog', 'with', 'a', 'blue', '<unk>', 'rope', 'in', 'its', 'mouth', '.', '<e>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>']
['a', 'bride', 'throwing', 'her', 'bouquet', 'up', 'in', 'the', 'air', 'behind', 'her', ',', 'towards', 'a', 'group', 'of', 'ladies', '.', '<e>', '<p>', '<p>', '<p>']
['an', 'old', 'woman', 'is', 'sitting', 'next', 'to', 'a', 'young', 'girl', 'with', 'her', 'fingers', 'in', 'her', 'mouth', '.', '<e>', '<p>', '<p>', '<p>', '<p>']
['people', 'are', 'gathering', 'around', 'a', 'table', 'of', 'food', 'and', 'outside', 'a', '<unk>', '<unk>', 'by', '.', '<e>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>']
['two', 'dogs', 'play', 'with', 'each', 'other', 'in', 'the', 'grass', '.', '<e>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '<p>', '