In [1]:
import re
import os
import torch


# Reserved vocabulary entries; each of them is given its own vocab id.

# Pads the encoder input, the decoder input and the target sequence.
PAD_TOKEN = "[PAD]"
# Stands in for out-of-vocabulary words.
UNKNOWN_TOKEN = "[UNK]"
# Placed at the start of every decoder input sequence.
START_DECODING = "[START]"
# Placed at the end of untruncated target sequences.
STOP_DECODING = "[STOP]"

# A token's position in this list is the id that make_vocab assigns to it.
special_tokens = [
    UNKNOWN_TOKEN,
    PAD_TOKEN,
    START_DECODING,
    STOP_DECODING,
]

class Vocab:
    """Word <-> id lookup tables loaded from a vocabulary file."""

    def __init__(self, vocab_file, vocab_size):
        """Build both lookup tables by delegating to ``make_vocab``.

        Args:
            vocab_file: Path of the vocabulary file, passed through to ``make_vocab``.
            vocab_size: Size limit, passed through to ``make_vocab``.
        """
        tables = make_vocab(vocab_file, vocab_size)
        self._word2id, self._id2word = tables

    def word2id(self, word):
        """Return the id of ``word``, or the id of ``UNKNOWN_TOKEN`` if it is not in the table."""
        if word in self._word2id:
            return self._word2id[word]
        return self._word2id[UNKNOWN_TOKEN]

    def id2word(self, idx):
        """Return the word stored under ``idx``; ids at or beyond ``size`` give ``UNKNOWN_TOKEN``."""
        if idx >= self.size:
            return UNKNOWN_TOKEN
        return self._id2word[idx]

    @property
    def size(self):
        """Number of entries in the word -> id table (special tokens included)."""
        table = self._word2id
        return len(table)

def make_vocab(vocab_file, vocab_size):
    """Build the word -> id and id -> word tables from a vocabulary file.

    The entries of ``special_tokens`` take the first ids, in list order. Each
    accepted word from the file then receives the next unused id, so the ids
    always form the contiguous range ``0 .. len(word2id) - 1`` even when some
    lines of the file are skipped.

    Args:
        vocab_file: Path of a text file with one entry per line. A line is used
            only when it splits into exactly two whitespace-separated fields;
            the first field is the word. Any other line is skipped.
        vocab_size: Maximum number of entries in the result, special tokens
            included. ``None``, zero or a negative value means no limit. A
            limit smaller than the number of special tokens still yields all
            special tokens.

    Returns:
        A ``(word2id, id2word)`` tuple of dicts that are inverse to each other.
    """
    word2id, id2word = {}, {}
    for idx, token in enumerate(special_tokens):
        word2id[token], id2word[idx] = idx, token

    unlimited = vocab_size is None or vocab_size <= 0
    with open(vocab_file, 'r', encoding='utf-8') as vocab_f:
        for line in vocab_f:
            # Check the cap before adding so the table can never overshoot it.
            if not unlimited and len(word2id) >= vocab_size:
                break
            pieces = line.split()
            if len(pieces) != 2:
                # Incorrectly formatted line: ignore it without consuming an id.
                continue
            word = pieces[0]
            if word in word2id:
                # Keep the first id of a repeated word (or of a special token
                # that also shows up in the file) instead of overwriting it.
                continue
            # Ids are derived from the table size, not from the line number, so
            # skipped lines cannot leave holes in the id range.
            next_id = len(word2id)
            word2id[word], id2word[next_id] = next_id, word
    return word2id, id2word

def article2ids(words, vocab):
    """Convert article words to ids, giving out-of-vocabulary words temporary ids.

    A word that ``vocab`` maps to the unknown-token id is treated as an article
    OOV. Each distinct OOV gets the id ``vocab.size + k``, where ``k`` is its
    position in the returned ``oovs`` list (order of first appearance).

    Args:
        words: Iterable of word strings from the article.
        vocab: Object providing ``word2id(word)`` and ``size``.

    Returns:
        ``(ids, oovs)``: the list of ids (one per input word) and the list of
        distinct OOV words.
    """
    # Look the unknown id up through the vocab rather than relying on a
    # module-level UNK constant, which this file never defines.
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    ids, oovs = [], []
    oov_position = {}  # OOV word -> index in `oovs`; avoids rescanning the list
    for w in words:
        i = vocab.word2id(w)
        if i == unk_id:
            if w not in oov_position:
                oov_position[w] = len(oovs)
                oovs.append(w)
            ids.append(vocab.size + oov_position[w])
        else:
            ids.append(i)
    return ids, oovs

def abstract2ids(words, vocab, article_oovs):
    """Convert abstract words to ids, reusing the article's temporary OOV ids.

    A word that ``vocab`` maps to the unknown-token id becomes
    ``vocab.size + k`` when it is found at position ``k`` of ``article_oovs``;
    otherwise it keeps the unknown-token id. In-vocabulary words keep their
    regular id.

    Args:
        words: Iterable of word strings from the abstract.
        vocab: Object providing ``word2id(word)`` and ``size``.
        article_oovs: Sequence of the article's OOV words; ``None`` is treated
            as an empty sequence.

    Returns:
        List of ids, one per input word.
    """
    # Look the unknown id up through the vocab rather than relying on a
    # module-level UNK constant, which this file never defines.
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    oov_position = {}
    for pos, oov in enumerate(article_oovs or []):
        # setdefault keeps the first position, matching list.index semantics.
        oov_position.setdefault(oov, pos)

    ids = []
    for w in words:
        i = vocab.word2id(w)
        if i == unk_id and w in oov_position:
            ids.append(vocab.size + oov_position[w])
        else:
            # Either a regular vocab id, or the unknown id for a word that is
            # OOV in the abstract but absent from the article.
            ids.append(i)
    return ids

def output2words(ids, vocab, art_oovs):
    """Translate a sequence of ids back into words.

    Ids below ``vocab.size`` are resolved with ``vocab.id2word``; any other id
    is read from ``art_oovs`` at position ``id - vocab.size``.

    Args:
        ids: Iterable of ids to decode.
        vocab: Object providing ``id2word(idx)`` and ``size``.
        art_oovs: Sequence of article OOV words indexed by ``id - vocab.size``.

    Returns:
        List of words, one per input id.
    """
    def _decode(token_id):
        if token_id < vocab.size:
            return vocab.id2word(token_id)
        return art_oovs[token_id - vocab.size]

    return [_decode(token_id) for token_id in ids]

def show_art_oovs(article, vocab):
    """Return ``article`` with every out-of-vocabulary word wrapped in double underscores.

    The article is split on single spaces; a word that ``vocab`` maps to the
    unknown-token id is rewritten as ``__word__``, and the pieces are joined
    back with single spaces.

    Args:
        article: Article text as one string.
        vocab: Object providing ``word2id(word)``.

    Returns:
        The marked-up article string.
    """
    # Look the unknown id up through the vocab rather than relying on a
    # module-level UNK constant, which this file never defines.
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    marked = [
        ("__%s__" % w) if vocab.word2id(w) == unk_id else w
        for w in article.split(' ')
    ]
    return ' '.join(marked)

In [2]:
# Load the word2vec vocabulary file, capping the table at 60000 entries.
vocab = Vocab(
    vocab_file='/home/eagleuser/Desktop/leyan/Train-Data/Cameras_new8/Embedding/word2Vec/word.vocab',
    vocab_size=60000,
)

In [6]:
# Display the word -> id table that Vocab built from the vocabulary file.
vocab._word2id

{'[UNK]': 0,
 '[PAD]': 1,
 '[START]': 2,
 '[STOP]': 3,
 'the': 4,
 '.': 5,
 'and': 6,
 'camera': 7,
 'for': 8,
 'this': 9,
 'with': 10,
 'that': 11,
 'you': 12,
 'have': 13,
 'but': 14,
 'not': 15,
 'use': 16,
 'was': 17,
 'take': 18,
 'are': 19,
 'picture': 20,
 'get': 21,
 'great': 22,
 'can': 23,
 'very': 24,
 'one': 25,
 'good': 26,
 'all': 27,
 'out': 28,
 'when': 29,
 'quality': 30,
 'like': 31,
 'will': 32,
 'from': 33,
 'has': 34,
 'work': 35,
 'just': 36,
 'would': 37,
 'video': 38,
 'more': 39,
 'battery': 40,
 'buy': 41,
 'lens': 42,
 'your': 43,
 'about': 44,
 'time': 45,
 'than': 46,
 'make': 47,
 'photo': 48,
 'had': 49,
 'they': 50,
 'only': 51,
 'want': 52,
 'look': 53,
 'other': 54,
 'there': 55,
 'also': 56,
 'some': 57,
 'image': 58,
 'it': 59,
 'shoot': 60,
 'well': 61,
 'canon': 62,
 'which': 63,
 'need': 64,
 'what': 65,
 'zoom': 66,
 'even': 67,
 'really': 68,
 'feature': 69,
 'come': 70,
 'better': 71,
 'much': 72,
 'light': 73,
 'easy': 74,
 'bag': 75,
 'after'