# We want to convert text -> numerical values

The Goals are:
* Create a vocabulary that maps each word to an index
* Set up a PyTorch dataset to load the data
* Set up padding for every caption (all examples should have the same sequence length)

In [1]:
import re
import pathlib
import numpy as np
from pathlib import Path
from MSCOCO import COCO

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Locations of the COCO train2017 image folder and its captions JSON, relative to this notebook.
root = Path('../../Datasets') / 'MS COCO'
imgs_path = root.joinpath('train2017')
captions_path = root.joinpath('annotations_trainval2017', 'annotations', 'captions_train2017.json')

In [3]:
# Build the project's COCO helper from the image folder and the captions file.
# Later cells read its `imgs` and `imgs_caps_dict` attributes.
coco = COCO(imgs_path, captions_path)

In [4]:
# Number of entries in coco.imgs (presumably one per training image -- confirm against the COCO class).
len(coco.imgs)

118287

In [5]:
# Look up the list of captions stored for one image, keyed by its file name.
coco.imgs_caps_dict['000000000009.jpg']

['Closeup of bins of food that include broccoli and bread.',
 'A meal is presented in brightly colored plastic trays.',
 'there are containers filled with different kinds of foods',
 'Colorful dishes holding meat, vegetables, fruit, and bread.',
 'A bunch of trays that have different food.']

In [29]:
import sys
# NOTE: sys.getsizeof is shallow -- it reports the bytes used by the dict object itself,
# not by the caption lists and strings it references, so the real footprint is larger.
sys.getsizeof(coco.imgs_caps_dict)

5242976

In [21]:
# Take the first caption of one image as a sample to experiment with.
sentence = coco.imgs_caps_dict['000000000009.jpg'][0]
# NLTK's English stop-word list (requires the 'stopwords' corpus to be downloaded).
stop_words = stopwords.words('english')

In [18]:
# Lowercase the caption and replace every run of non-word characters (punctuation,
# whitespace, symbols) with a single space. The pattern is a raw string so that
# `\W` is not parsed as an (invalid) string escape.
sentence = re.sub(r'\W+', ' ', sentence.lower())
sentence = word_tokenize(sentence)  # split the cleaned text into word tokens
sentence = [w for w in sentence if w not in stop_words]  # drop English stop words

In [19]:
# Display the cleaned token list.
sentence

['closeup', 'bins', 'food', 'include', 'broccoli', 'bread']

In [135]:
class Vocabulary:
    """
    Word-level vocabulary that maps caption words to integer indices and back.

    Indices 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and <UNK>;
    words registered by build_vocabulary() receive consecutive indices from 4 upwards.
    Note that numericalize() only emits word, <UNK> and <PAD> indices -- it does not
    insert <SOS> or <EOS> itself.
    """

    # Matches runs of non-word characters; compiled once and reused for every sentence.
    _NON_WORD_RUN = re.compile(r'\W+')
    # English stop words, loaded from NLTK on first use and then shared by all instances.
    _stop_words = None

    # Constructor
    def __init__(self, freq_threshold, sequence_length=6):
        """
        Constructor to create vocabulary of words and to tokenize them.
        :param freq_threshold (int): how many times a word must occur before build_vocabulary() adds it
        :param sequence_length (int): length every numericalized sentence is truncated or padded to
        """
        self.idx_to_string = {
            0: '<PAD>',  # to pad all the captions to be the same size
            1: '<SOS>',  # Start of sentence
            2: '<EOS>',  # End of sentence
            3: '<UNK>',  # Unknown Token
        }
        self.string_to_index = {
            '<PAD>': 0,
            '<SOS>': 1,
            '<EOS>': 2,
            '<UNK>': 3,
        }
        self.freq_threshold = freq_threshold
        self.sequence_length = sequence_length

    # return the length of the vocabulary (special tokens included)
    def __len__(self):
        return len(self.idx_to_string)

    # static method: it can be used without creating an instance of the class
    # tokenize a piece of text
    @staticmethod
    def tokenizer_eng(sentence):
        """
        Lowercase a sentence, strip punctuation, tokenize it and drop English stop words.
        :param sentence (str): raw text
        :return (list): the remaining lowercase tokens, in their original order
        """
        if Vocabulary._stop_words is None:
            # Loaded lazily so the class does not depend on a notebook-level `stop_words`
            # global; stored as a frozenset so each membership test is O(1).
            Vocabulary._stop_words = frozenset(stopwords.words('english'))

        # Replace every run of special characters, punctuation and whitespace with one space
        sentence = Vocabulary._NON_WORD_RUN.sub(' ', sentence.lower())
        tokenized_sentence = word_tokenize(sentence)  # Tokenize the words
        # remove stop words
        return [w for w in tokenized_sentence if w not in Vocabulary._stop_words]

    # method to build the vocabulary for us
    def build_vocabulary(self, sentence_list):
        """
        Register every word that occurs at least freq_threshold times in sentence_list.
        :param sentence_list (iterable of str): raw sentences to scan
        Frequencies are counted per call (they are not accumulated across calls), and a
        word that already has an index keeps it, so repeated calls never create duplicates.
        """
        frequencies = {}
        idx = len(self.idx_to_string)  # next free index
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

                # Register the word the moment it reaches the threshold, but only once.
                if frequencies[word] >= self.freq_threshold and word not in self.string_to_index:
                    self.string_to_index[word] = idx
                    self.idx_to_string[idx] = word
                    idx += 1

    # method to convert words to numerical values
    def numericalize(self, sentence):
        """
        Convert a sentence into a list of exactly sequence_length vocabulary indices.
        :param sentence (str): raw text
        :return (list of int): word indices (<UNK> for unknown words), truncated or
                               right-padded with <PAD> to sequence_length
        """
        unk_idx = self.string_to_index['<UNK>']
        # First tokenize the sentence, then map each token to its index in our vocab
        idx_of_sentence = [self.string_to_index.get(word, unk_idx) for word in self.tokenizer_eng(sentence)]

        # Truncate (a no-op for short sentences), then pad whatever is still missing
        idx_of_sentence = idx_of_sentence[:self.sequence_length]
        missing = self.sequence_length - len(idx_of_sentence)
        idx_of_sentence.extend([self.string_to_index['<PAD>']] * missing)
        return idx_of_sentence

    # method to convert idxs to words
    def convert_to_text(self, idxs):
        """
        Convert indices back to words.
        :param idxs (iterable of int): vocabulary indices
        :return (list of str): the words, with <PAD> entries removed; an index that is
                               not in the vocabulary becomes an empty string
        """
        words = (self.idx_to_string.get(idx, '') for idx in idxs)
        # remove <PAD> if there is any
        return [word for word in words if word != '<PAD>']
     

In [136]:
# freq_threshold=1: a word is added the first time it is seen; sequence_length=10.
vocab_class = Vocabulary(1, 10)

In [137]:
# Number of keys in the captions dict.
len(coco.imgs_caps_dict)

118287

In [138]:
# Build the vocabulary from the captions of a single image only (a quick check, not the full dataset).
vocab_class.build_vocabulary(coco.imgs_caps_dict['000000000009.jpg'])

In [139]:
# Inspect the resulting index -> word mapping.
vocab_class.idx_to_string

{0: '<PAD>',
 1: '<SOS>',
 2: '<EOS>',
 3: '<UNK>',
 4: 'closeup',
 5: 'bins',
 6: 'food',
 7: 'include',
 8: 'broccoli',
 9: 'bread',
 10: 'meal',
 11: 'presented',
 12: 'brightly',
 13: 'colored',
 14: 'plastic',
 15: 'trays',
 16: 'containers',
 17: 'filled',
 18: 'different',
 19: 'kinds',
 20: 'foods',
 21: 'colorful',
 22: 'dishes',
 23: 'holding',
 24: 'meat',
 25: 'vegetables',
 26: 'fruit',
 27: 'bunch'}

In [140]:
# Raw text of the image's first caption, before any cleaning.
coco.imgs_caps_dict['000000000009.jpg'][0]

'Closeup of bins of food that include broccoli and bread.'

In [141]:
# Encode the second caption: stop words are dropped, the remaining words become
# vocabulary indices, and the list is padded with <PAD> (0) up to sequence_length.
vocab_class.numericalize(coco.imgs_caps_dict['000000000009.jpg'][1])

[10, 11, 12, 13, 14, 15, 0, 0, 0, 0]

In [142]:
# Raw text of the second caption, for comparison with its encoded form.
coco.imgs_caps_dict['000000000009.jpg'][1]

'A meal is presented in brightly colored plastic trays.'

In [143]:
# Round trip: encode the caption to indices, then decode back to words.
# convert_to_text drops the <PAD> entries, so only the kept words are printed.
idxs = vocab_class.numericalize(coco.imgs_caps_dict['000000000009.jpg'][1])
print(vocab_class.convert_to_text(idxs))

['meal', 'presented', 'brightly', 'colored', 'plastic', 'trays']
