In [31]:
import numpy as np
import pandas as pd
import json
import re
from collections import defaultdict

In [32]:
# Absolute path to the annotations JSON file that the following cells read.
# NOTE(review): this path is specific to one Windows machine; edit it to point
# at your local copy before running the notebook.
file=r"C:\Users\PIYUSH\Desktop\v2_mscoco_train2014_annotations.json"

In [33]:
# Read the whole annotations file into memory as text.
# JSON text is UTF-8 encoded, so the encoding is given explicitly instead of
# relying on the platform's locale default (which is not UTF-8 on many
# Windows setups).
with open(file, 'r', encoding='utf-8') as myfile:
    data = myfile.read()

In [34]:
# Parse the JSON text and keep only the value stored under the 'annotations' key.
# NOTE: this rebinds `data` from the raw text to the parsed value, so the cell
# cannot be re-run on its own -- re-run the file-reading cell first.
data=json.loads(data)['annotations']

In [22]:
def make_vocab_answers(annotations, n_answers, out_path='vocab_answers.txt'):
    """Build a vocabulary of the most frequent answers and save it to a text file.

    Every ``answer['answer']`` string found under ``annotation['answers']`` is
    counted, except strings containing a character that is neither a word
    character nor whitespace (i.e. punctuation), which are skipped. The
    vocabulary is ``'<unk>'`` followed by the ``n_answers - 1`` most frequent
    answers; answers with equal counts keep their first-seen order.

    Args:
        annotations: iterable of dicts, each with an ``'answers'`` list whose
            items are dicts holding the answer text under ``'answer'``.
        n_answers: total vocabulary size including the ``'<unk>'`` entry;
            must be at least 1.
        out_path: file the vocabulary is written to, one entry per line.
            Defaults to ``'vocab_answers.txt'`` in the current working
            directory. The file is written with the platform default
            encoding, the same way ``load_str_list`` reads it.

    Returns:
        The vocabulary as a list of strings, ``'<unk>'`` first.

    Raises:
        ValueError: if ``n_answers`` is smaller than 1.
    """
    # A non-positive size would turn the slice bound below negative and
    # silently keep almost every answer instead of none.
    if n_answers < 1:
        raise ValueError(
            "n_answers must be at least 1 (one slot is reserved for '<unk>'), got %r"
            % (n_answers,)
        )

    answer_counts = defaultdict(int)
    for annotation in annotations:
        for answer in annotation['answers']:
            word = answer['answer']
            # Skip answers that contain punctuation or other symbols.
            if re.search(r"[^\w\s]", word):
                continue
            answer_counts[word] += 1

    # Most frequent first; the sort is stable, so ties keep first-seen order.
    ranked = sorted(answer_counts, key=answer_counts.get, reverse=True)
    # '<unk>' contains '<' and '>', so the punctuation filter already excludes it.
    assert '<unk>' not in ranked
    top_answers = ['<unk>'] + ranked[:n_answers - 1]  # '-1' is due to '<unk>'

    with open(out_path, 'w') as f:
        f.writelines(w + '\n' for w in top_answers)

    print('Make vocabulary for answers')
    print('The number of total words of answers: %d' % len(ranked))
    # Report what was actually kept, which is smaller than n_answers when
    # fewer distinct answers are available.
    print('Keep top %d answers into vocab' % len(top_answers))
    return top_answers

In [23]:
# Build a 50-entry answer vocabulary; this writes 'vocab_answers.txt' to the
# current working directory.
make_vocab_answers(data, 50)

Make vocabulary for answers
The number of total words of answers: 135203
Keep top 50 answers into vocab


In [29]:
# Splits on runs of non-word characters; the capture group makes re.split
# return those runs as items too, so punctuation survives as its own token.
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')


def tokenize(sentence):
    """Lower-case ``sentence`` and break it into word and punctuation tokens.

    The text is split with ``SENTENCE_SPLIT_REGEX``; every resulting piece has
    surrounding whitespace stripped, and pieces that end up empty (pure
    whitespace separators, leading/trailing empties) are dropped.

    Returns:
        A list of non-empty token strings in their original order.
    """
    stripped_pieces = (
        piece.strip() for piece in SENTENCE_SPLIT_REGEX.split(sentence.lower())
    )
    return [piece for piece in stripped_pieces if piece]
def load_str_list(fname):
    """Read a text file and return its lines as a list of stripped strings.

    Each line has leading and trailing whitespace (including the newline)
    removed; blank lines are kept as empty strings. The file is opened with
    the platform default encoding.
    """
    with open(fname) as handle:
        return [raw_line.strip() for raw_line in handle]


class VocabDict:
    """Two-way mapping between vocabulary words and integer indices.

    The vocabulary is loaded from a text file with one word per line; a word's
    index is its zero-based line position. If the file contains ``'<unk>'``,
    its index is used as the fallback for words that are not in the vocabulary.
    """

    def __init__(self, vocab_file):
        """Load the word list from ``vocab_file`` and build the lookup table."""
        self.word_list = load_str_list(vocab_file)
        # If a word appears more than once, the last occurrence's index wins.
        self.word2idx_dict = dict(zip(self.word_list, range(len(self.word_list))))
        self.vocab_size = len(self.word_list)
        # None when the vocabulary has no '<unk>' entry.
        self.unk2idx = self.word2idx_dict.get('<unk>')

    def idx2word(self, n_w):
        """Return the word stored at index ``n_w``."""
        return self.word_list[n_w]

    def word2idx(self, w):
        """Return the index of ``w``, falling back to the ``'<unk>'`` index.

        Raises:
            ValueError: if ``w`` is unknown and the vocabulary has no ``'<unk>'``.
        """
        try:
            return self.word2idx_dict[w]
        except KeyError:
            pass
        if self.unk2idx is None:
            raise ValueError('word %s not in dictionary (while dictionary does not contain <unk>)' % w)
        return self.unk2idx

    def tokenize_and_index(self, sentence):
        """Tokenize ``sentence`` and return the list of its token indices."""
        return list(map(self.word2idx, tokenize(sentence)))
        

In [30]:
# Load the answer vocabulary from 'vocab_answers.txt' (the file written by
# make_vocab_answers) and show its word -> index mapping as the cell output.
ans_vocab = VocabDict('vocab_answers.txt')
ans_vocab.word2idx_dict

{'<unk>': 0,
 'no': 1,
 'yes': 2,
 '2': 3,
 '1': 4,
 'white': 5,
 '3': 6,
 'red': 7,
 'black': 8,
 'blue': 9,
 '0': 10,
 '4': 11,
 'green': 12,
 'brown': 13,
 'yellow': 14,
 '5': 15,
 'gray': 16,
 '6': 17,
 'baseball': 18,
 'nothing': 19,
 'frisbee': 20,
 'tennis': 21,
 'right': 22,
 'left': 23,
 'orange': 24,
 'wood': 25,
 'bathroom': 26,
 'pizza': 27,
 'none': 28,
 'pink': 29,
 'kitchen': 30,
 '7': 31,
 '8': 32,
 'cat': 33,
 'dog': 34,
 'skiing': 35,
 'grass': 36,
 'water': 37,
 'man': 38,
 'skateboarding': 39,
 'silver': 40,
 '10': 41,
 'kite': 42,
 'horse': 43,
 'black and white': 44,
 'skateboard': 45,
 'surfing': 46,
 'snow': 47,
 'giraffe': 48,
 'tan': 49}

<__main__.VocabDict at 0x29109c19128>