# We want to convert text -> numerical values

The Goals are:
* Create Vocabulary mapping each word to an index
* We need to set up a PyTorch dataset to load the data
* Set up padding for every batch (all examples should have the same sequence length)

In [1]:
import re
import pathlib
import numpy as np
from pathlib import Path
from MSCOCO import COCO

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Locations of the MS COCO files, given relative to the current working directory.
root = Path('../../Datasets/MS COCO')
# Folder holding the train2017 images.
imgs_path = root.joinpath('train2017')
# JSON file with the train2017 captions.
captions_path = root.joinpath(
    'annotations_trainval2017',
    'annotations',
    'captions_train2017.json',
)

In [3]:
# Build the project's COCO helper from the image folder and the captions JSON path.
# NOTE(review): COCO comes from the local MSCOCO module; what it loads at
# construction time is not visible here -- confirm against that module.
coco = COCO(imgs_path, captions_path)

In [4]:
# Number of entries in coco.imgs (displayed as the cell output).
len(coco.imgs)

118287

In [6]:
# Look up the list of captions stored under one image file name.
coco.imgs_caps_dict['000000000009.jpg']

['Closeup of bins of food that include broccoli and bread.',
 'A meal is presented in brightly colored plastic trays.',
 'there are containers filled with different kinds of foods',
 'Colorful dishes holding meat, vegetables, fruit, and bread.',
 'A bunch of trays that have different food.']

In [None]:
# Display the whole file-name -> captions mapping.
# NOTE(review): presumably one entry per image, so this output is very large -- confirm.
coco.imgs_caps_dict

In [None]:
class Vocabulary:
    
    # Constructor
    def __init__(self, freq_threshold, stop_words):
        """
        Constructor to create vocabulary of words and to tokenize them.
        :param freq_threshold (int): if a word is not repeated enough don't add it to the dictionary
        :param stop_words (list): to remove stopping words
        """
        self.idx_to_string = {
            0: '<PAD>', # to pad all the captions to be the same size
            1: '<SOS>', # Start of sentence
            2: '<EOS>', # End of sentence
            3: '<UNK>', # Unknown Token
        }
        self.string_to_index = {
            '<PAD>': 0,
            '<SOS>': 1,
            '<EOS>': 2,
            '<UNK>': 3,
        }
        self.freq_threshold = freq_threshold
        
        
    # return the length of the vocabulary
    def __len__(self):
        return len(self.idx_to_string)
    
    # Static method: it can be called without creating an instance of the class.
    # Tokenizes a piece of text.
    @staticmethod
    def tokenizer_eng(text):
        return [tok.text]