In [1]:
import numpy as np
from collections import Counter, OrderedDict

In [2]:
class LangVocab:
    """Vocabulary of a single language: token/index maps and token counts.

    Indices 0-3 are reserved for the special tokens ``_sos``, ``_eos``,
    ``_unk`` and ``_pad``; every other token receives the next free index
    in the order it is registered.
    """

    # Reserved tokens in index order (position in the tuple == index).
    _SPECIAL_TOKENS = ('_sos', '_eos', '_unk', '_pad')

    def __init__(self, lang_name):
        """Create an empty vocabulary named *lang_name*."""
        self.name = lang_name
        # token -> number of times it was passed to word_To_vocab
        # (reserved tokens are never counted).
        self.token_count = {}
        # Longest sentence seen so far, counting one extra slot for '_eos'.
        self.max_length = 0
        # Creates token_To_index, index_To_token and num_token.
        self._reset_index()

    def _reset_index(self):
        """Reset both lookup tables so they hold only the reserved tokens."""
        self.token_To_index = {
            token: index for index, token in enumerate(self._SPECIAL_TOKENS)
        }
        self.index_To_token = {
            index: token for index, token in enumerate(self._SPECIAL_TOKENS)
        }
        self.num_token = len(self._SPECIAL_TOKENS)

    def _register(self, word):
        """Assign the next free index to *word* in both lookup tables."""
        self.token_To_index[word] = self.num_token
        self.index_To_token[self.num_token] = word
        self.num_token += 1

    # Add word into language vocabulary
    def word_To_vocab(self, word):
        """Count one occurrence of *word*, registering it on first sight.

        Reserved tokens are ignored: they already own fixed indices, and
        counting them would let ``most_common_words`` / ``sort_dict``
        re-register them under a new index.
        """
        if word in self._SPECIAL_TOKENS:
            return
        if word not in self.token_To_index:
            self._register(word)
        self.token_count[word] = self.token_count.get(word, 0) + 1

    # Divide sentence into word
    def sentence_To_word(self, sentence):
        """Add every space-separated word of *sentence* to the vocabulary.

        Splitting is on single spaces, so repeated spaces yield empty-string
        tokens (kept as-is to preserve existing index assignments).
        """
        words = sentence.split(' ')
        for word in words:
            self.word_To_vocab(word)
        self.sentence_length(len(words))

    # Find the Max length sentence of a language
    def sentence_length(self, length):
        """Grow ``max_length`` so a sentence of *length* words plus '_eos' fits."""
        # Compare against length + 1 (not length): the stored value already
        # includes the '_eos' slot, so the comparison must include it too.
        self.max_length = max(self.max_length, length + 1)

    # Convert a given sentence into it's corresponding indices vector.
    # If padding TRUE then convert small sentence length into max_length
    # If word is not present in dictionary the replace it with unknown word token
    def sentence_To_vector(self, sentence, padding = False, *max_length):
        """Return the index vector of *sentence*, terminated by '_eos'.

        Args:
            sentence: space-separated words; unknown words map to '_unk'.
            padding: when truthy, right-pad the result with '_pad'.
            *max_length: optional single target length used for padding;
                defaults to ``self.max_length`` when omitted.

        Returns:
            A list of int indices. Sentences longer than the target length
            are returned unpadded and untruncated.

        Raises:
            TypeError: if more than one ``max_length`` value is supplied.
        """
        unknown = self.token_To_index['_unk']
        word_indices = [
            self.token_To_index.get(word, unknown) for word in sentence.split(' ')
        ]
        word_indices.append(self.token_To_index['_eos'])
        if not padding:
            return word_indices
        if len(max_length) > 1:
            raise TypeError(
                'sentence_To_vector() accepts at most one max_length value'
            )
        target = max_length[0] if max_length else self.max_length
        return self.padding_sentence(word_indices, target)

    # Add pad to convert sentence into max_length size
    def padding_sentence(self, word_indices, max_length):
        """Return *word_indices* right-padded with '_pad' up to *max_length*."""
        missing = max_length - len(word_indices)
        if missing > 0:
            word_indices = word_indices + [self.token_To_index['_pad']] * missing
        return word_indices

    # Choose the most common words
    def most_common_words(self, num_words):
        """Keep only the *num_words* most frequent tokens and re-index them.

        Tokens are re-indexed in descending frequency order starting at 4.
        ``token_count`` stays a dict so that ``word_To_vocab`` and
        ``sort_dict`` keep working afterwards.
        """
        self.token_count = dict(Counter(self.token_count).most_common(num_words))
        self._reset_index()
        for word in self.token_count:
            self._register(word)

    # sort the token_To_index and index_To_token dict
    def sort_dict(self):
        """Re-index all counted tokens in ascending key order, starting at 4."""
        self._reset_index()
        for word in sorted(self.token_count):
            self._register(word)