In [10]:
import json

In [3]:
# Location of the monolingual corpus file, relative to this notebook's working directory.
corpus_path: str = "../../Dataset/monolingual.hi"

In [4]:
# Load only the first 10,000,000 characters of the corpus.
# Passing the limit to read() stops reading at that point, instead of loading
# the whole file into memory and slicing afterwards; the resulting string is
# the same.
# The encoding is pinned so decoding does not depend on the platform default.
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open(corpus_path, 'r', encoding='utf-8') as file:
    data = file.read(10000000)

In [19]:
class Data:
    """Build a frequency-thresholded vocabulary and (center, context) id pairs.

    Intended use: construct with a context window size, call
    ``extract_unique_words`` with the corpus lines, then call
    ``generate_trg_data`` to write the training pairs to disk.
    """

    def __init__(self, context_size):
        """Create an empty container.

        Args:
            context_size: number of positions on each side of a center word
                that are treated as its context.
        """
        self.context_size = context_size

        self.word2idx = {}  # word -> id, filled by extract_unique_words
        self.idx2word = {}  # id -> word, inverse of word2idx
        self.corpus_idx = []  # corpus as a sequence of ids (kept words only)
        self.voc_size = 1  # placeholder; set to the number of kept words by extract_unique_words

    def extract_unique_words(self, lines, threshold=10):
        """Index the corpus, drop infrequent words and save the vocabulary.

        Every whitespace-separated token gets an integer id in order of first
        appearance, starting at 1 (0 is never assigned). Words that occur
        fewer than ``threshold`` times are removed from the vocabulary and
        from ``corpus_idx``. Kept words retain their originally assigned ids,
        so ids are not contiguous and may exceed ``voc_size``.

        Each call rebuilds ``word2idx``, ``idx2word``, ``corpus_idx`` and
        ``voc_size`` from scratch, so the method can safely be called again
        on the same instance.

        Side effects: prints a one-line summary and writes
        ``words_in_vocab.txt``, ``word2idx.json`` and ``idx2word.json`` to the
        current working directory.

        Args:
            lines: iterable of strings; each is split on whitespace.
            threshold: minimum number of occurrences for a word to be kept.
        """
        word_ids = {}  # word -> id for every distinct token seen
        word_freq = {}  # word -> number of occurrences
        all_ids = []  # whole corpus as ids, before filtering
        next_id = 1

        for line in lines:
            for word in line.split():
                if word not in word_ids:
                    word_ids[word] = next_id
                    next_id += 1
                word_freq[word] = word_freq.get(word, 0) + 1
                all_ids.append(word_ids[word])

        # Dict comprehensions keep first-appearance order, so the saved files
        # list words in the order they were first seen in the corpus.
        self.word2idx = {
            word: idx for word, idx in word_ids.items()
            if word_freq[word] >= threshold
        }
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

        # Keep only the positions whose word survived the frequency filter.
        self.corpus_idx = [idx for idx in all_ids if idx in self.idx2word]
        self.voc_size = len(self.word2idx)
        print("vocabulary size : {}, truncated corpus size : {}".format(
            self.voc_size,
            len(self.corpus_idx),
        ))

        self._write_vocab_files()

    def _write_vocab_files(self):
        """Write the vocabulary word list and both id mappings to the working directory."""
        # UTF-8 is requested explicitly so writing non-ASCII words does not
        # depend on the platform's default encoding.
        with open("words_in_vocab.txt", 'w', encoding='utf-8') as file:
            # The size goes on its own line; without the newline it would be
            # fused with the first word.
            file.write(str(self.voc_size))
            file.write('\n')
            for word in self.word2idx:
                file.write(word)
                file.write('\n')

        # json.dump stores the integer keys of idx2word as strings.
        with open('word2idx.json', 'w', encoding='utf-8') as file1, \
                open('idx2word.json', 'w', encoding='utf-8') as file2:
            json.dump(self.word2idx, file1)
            json.dump(self.idx2word, file2)

    def generate_trg_data(self):
        """Write every (center id, context id) pair to ``training_data.txt``.

        For each position in ``corpus_idx``, one line ``center<TAB>context`` is
        written for every other position at most ``context_size`` away that
        lies inside the corpus. Pairs are ordered by center position, then by
        context position.
        """
        corpus_size = len(self.corpus_idx)
        with open('training_data.txt', 'w', encoding='utf-8') as file:
            for center_word_pos, center_id in enumerate(self.corpus_idx):
                # Clamp the window to the corpus bounds up front rather than
                # testing each offset individually.
                first = max(0, center_word_pos - self.context_size)
                last = min(corpus_size - 1, center_word_pos + self.context_size)
                for context_word_pos in range(first, last + 1):
                    if context_word_pos == center_word_pos:
                        continue  # a word is not its own context
                    file.write('{}\t{}\n'.format(
                        center_id, self.corpus_idx[context_word_pos]))

In [7]:
# Break the loaded corpus text into one string per newline-delimited line.
lines = data.split(sep='\n')

In [17]:
# Window radius: how many neighbours on each side of a center word count as context.
context_size: int = 2

In [20]:
# Build the vocabulary from the corpus lines, then write the training pairs.
# NOTE(review): this rebinds `data` from the raw corpus string to a Data
# instance, so the cell that computes `lines` from `data` cannot be re-run
# after this one without reloading the corpus first.
data = Data(context_size=context_size)
data.extract_unique_words(lines=lines)
data.generate_trg_data()

vocabulary size : 12557, truncated corpus size : 1575620
