In [1]:
class BuildPadmaIndex:

    '''Word -> {text id -> fragment positions} index over tokenized text files.

    After construction the instance exposes:

    final_index | dict | word -> {text_id: [fragment positions]}
    id_to_file | dict | text_id -> file path
    file_to_id | dict | file path -> text_id (only when built from tokens)
    word_set | set | all unique tokens (only when built from tokens)
    '''

    def __init__(self,
                 read_from_file=None,
                 tokens_path=None,
                 max_files=False):

        '''Build Padma index either from previously created index
        stored in a file or by creating a new index from texts.

        read_from_file | str or None | prefix of the filenames used when the
                                       index was saved; None builds a new index
        tokens_path | str or None | directory that holds the token files
        max_files | int or False | limit the number of files read,
                                   to make debugging faster

        Raises ValueError when both `read_from_file` and `tokens_path` are
        None, or when `read_from_file` is neither None nor a string.
        '''

        if read_from_file is None and tokens_path is None:
            raise ValueError('read_from_file and tokens_path can not both be None')

        self._tokens_path = tokens_path

        # for debugging cases
        self._max_files = max_files

        # filename prefix is provided as string
        if isinstance(read_from_file, str):
            self._read_from_file(read_from_file)

        # default input value None is provided
        elif read_from_file is None:
            self._generate_text_index()

        # something else is provided
        else:
            raise ValueError('`read_from_file` must be either None or a string that points to a file name.')

    def _read_from_file(self, name):

        '''For the case where index is already created previously.

        name | str | prefix of the stored files

        Reads `<name>-main_index.pkl` when it exists. Otherwise falls back to
        `<name>-main_index.sqlite`, which is what `save_to_file` writes.
        `<name>-id_to_file.pkl` is always read.
        '''

        import os
        import pickle

        pickle_path = name + '-main_index.pkl'
        sqlite_path = name + '-main_index.sqlite'

        # NOTE: pickle executes arbitrary code on load; only read index
        # files that were produced by a trusted run of this class.
        if not os.path.exists(pickle_path) and os.path.exists(sqlite_path):
            from sqlitedict import SqliteDict

            with SqliteDict(sqlite_path, flag='r') as stored:
                self.final_index = dict(stored.items())
        else:
            # a missing pickle still raises FileNotFoundError as before
            with open(pickle_path, 'rb') as f:
                self.final_index = pickle.load(f)

        with open(name + '-id_to_file.pkl', 'rb') as f:
            self.id_to_file = pickle.load(f)

    def _generate_text_index(self):

        '''For the case where new index is to be created.

        Finds every `*.*` file below `tokens_path`, decompresses `.gz` files
        in place (the `.gz` file is replaced by the plain file), reads the
        whitespace separated tokens and fills `final_index`, `word_set`,
        `file_to_id` and `id_to_file`.
        '''

        import glob
        import gzip
        import os
        import shutil

        # the progress bar is cosmetic, so do not fail when tqdm is missing
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(iterable):
                return iterable

        # get names of files for tokens; os.path.join makes this work with
        # or without a trailing separator, and sorting keeps ids reproducible
        pattern = os.path.join(self._tokens_path, '**', '*.*')
        files = sorted(path for path in glob.iglob(pattern, recursive=True)
                       if os.path.isfile(path))

        # limit based on max files
        if self._max_files is not False and self._max_files is not None:
            files = files[:self._max_files]

        # decompress and rename if gz (no shell involved, so unusual
        # characters in paths are safe)
        for i, filename in enumerate(files):
            if filename.endswith('.gz'):
                target = filename[:-len('.gz')]
                if not os.path.exists(target):
                    with gzip.open(filename, 'rb') as src, open(target, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
                    os.remove(filename)
                files[i] = target

        # a plain file and its .gz sibling resolve to the same name
        files = list(dict.fromkeys(files))

        # read tokens into memory
        tokens = {}
        for file in files:
            with open(file, 'r', encoding='utf-8') as f:
                tokens[file] = f.read().split()

        # create file-to-id indexes
        self.file_to_id = {file: i for i, file in enumerate(files)}
        self.id_to_file = {i: file for i, file in enumerate(files)}

        # create set of all unique tokens
        self.word_set = set()
        for file in files:
            self.word_set.update(tokens[file])

        # create key values
        self.final_index = {word: {} for word in self.word_set}

        # create values
        for file in tqdm(files):
            file_id = self.file_to_id[file]

            ## experimental ##
            # positions are indexes of the '_' separated fragments of the
            # joined tokens that contain the word as a substring; the
            # fragments are the same for every word, so build them once
            fragments = ''.join(tokens[file]).split('_')

            for word in set(tokens[file]):
                self.final_index[word][file_id] = [
                    i for i, fragment in enumerate(fragments) if word in fragment
                ]
            ## experimental ends ##

    def word_to_text(self, word):

        '''Return `[file, text]` for every text that contains `word`.

        Raises KeyError when `word` is not in the index.
        '''

        out = []

        for text_id in self.final_index[word].keys():

            file = self.id_to_file[text_id]
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()

            out.append([file, text])

        return out

    def word_to_location(self, word):

        '''Return one list per text, holding `[text_id, position]` pairs.

        Raises KeyError when `word` is not in the index.
        '''

        out = []

        for text_id in self.final_index[word].keys():

            location = [[text_id, i] for i in self.final_index[word][text_id]]

            out.append(location)

        return out

    def save_to_file(self, name):

        '''Store the index of this instance.

        name | str | prefix for the stored files

        Writes `<name>-main_index.sqlite` and `<name>-id_to_file.pkl`.
        '''

        import pickle
        from sqlitedict import SqliteDict

        # one commit at the end instead of one per word
        with SqliteDict(name + '-main_index.sqlite', autocommit=False) as index:
            for key, value in self.final_index.items():
                index[key] = value
            index.commit()

        with open(name + '-id_to_file.pkl', 'wb') as f:
            pickle.dump(self.id_to_file, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Build a fresh index from the token files below the given directory.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
index = BuildPadmaIndex(tokens_path='/Users/upstairs/dev/tokens/')
#index.save_to_file('Padma-Index')
#temp_index = BuildPadmaIndex(read_from_file='Padma-Index')

 50%|███████████████████████████████████████████████████████████                                                           | 7/14 [01:16<01:03,  9.10s/it]

In [None]:
# Show at most the first ten [file, text] pairs for texts containing the word.
index.word_to_text('སླེབ')[:10]

In [4]:
import pickle

# Load the pickled title metadata into `titles`.
# NOTE(review): pickle.load executes arbitrary code; only use on trusted files.
# NOTE(review): absolute, machine-specific path.
with open('/Users/upstairs/dev/Padma-Backend/app/data/title_info.pkl', 'rb') as f:
    titles = pickle.load(f)

In [9]:
# Inspect the metadata stored for one text.
titles['Terdzo-KA-007']

{'title': 'ངོ་མཚར་སྤྲུལ་པའི་སྐུ་མཆོག་རིས་མེད་གཏེར་སྟོན་རིམ་པར་བྱོན་པ་རྣམས་ཀྱི་གསོལ་འདེབས་རྒྱས་པར་བཀོད་པ་མོས་གུས་རྒྱ་མཚོའི་རླབས་ཕྲེང་',
 'Author': 'འཇམ་མགོན་ཀོང་སྤྲུལ་',
 'Terdzö Category': 'Lineage Histories',
 'Tibetan Colophon(s)': 'ཅེས་རྒྱལ་དབང་པདྨཱ་ཀ་རའི་ཕྲིན་ལས་ཀྱི་གཙོ་བོ་རྡོ་རྗེའི་གསུང་གི་གསང་བ་ཟབ་མོ་རིན་ཆེན་གཏེར་གྱི་མཛོད་ཆེན་པོ་རྩོལ་བས་སྒྲུབ་པའི་སྐབས་སུ། གཏེར་འབྱུང་གི་རིགས་དང་སོ་སོའི་རྣམ་ཐར་ལས་བྱུང་བ་དང་། གཙོ་བོ་ཁྱབ་བདག་མཚོ་སྐྱེས་རྡོ་རྗེ་དང་ཞལ་མི་གཉིས་པ་ཀུན་མཁྱེན་བླ་མ་རྡོ་རྗེ་གཟི་བརྗིད་ཀྱི་ཞལ་ལུང་ལ་གཞིར་བྱས། །བྱང་བདག་གིས་མཛད་པའི་མཚན་ཡོངས་བསྡམས། བདུད་འདུལ་དང་གནམ་ཆོས་སོགས་ཀྱིས་མཛད་པའི་གཏེར་བརྒྱའི་གསོལ་འདེབས་ལས་མཚན་ཐོ་བྱུང་བས་ཀྱང་ཁ་བསྐངས་ཏེ། པདྨའི་སྔགས་རིག་འཛིན་པ་པདྨ་གར་དབང་ཕྲིན་ལས་འགྲོ་འདུལ་རྩལ་གྱིས་དཔལ་དེ་བཱི་ཀོ་ཊཱིའི་དབེན་ཁྲོད་ཙ་འདྲ་རིན་ཆེན་བྲག་གི་སྒྲུབ་གནས་ཀུན་བཟང་བདེ་ཆེན་འོད་གསལ་གླིང་དུ་སྦྱར་བ་དགེ་ལེགས་འཕེལ།།  །།',
 'Links': 'No TBRC link // No THL link'}

In [31]:
import pandas as pd

# Catalog CSV of the OpenPecha catalog repository.
path = 'https://raw.githubusercontent.com/OpenPecha/catalog/master/data/catalog.csv'

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are skipped
# and reported. Only the first 4325 rows are kept.
openpecha_titles = pd.read_csv(path, on_bad_lines='warn')[:4325]

def get_id(s):

    '''Return the text before the first ']' with every '[' removed.

    s | str | raw value of the catalog ID column
              (presumably of the form '[ID](link)' - verify against the CSV)
    '''

    head, _, _ = s.partition(']')

    return head.replace('[', '')
    
# Replace the raw ID column with the cleaned ids (get_id applied element-wise).
openpecha_titles['ID'] = openpecha_titles['ID'].map(get_id)

b'Skipping line 1736: expected 5 fields, saw 7\n'


In [None]:
        # NOTE(review): orphaned fragment - it is indented like a method body and
        # uses `self` and `name`, neither of which is defined in this cell.
        # It writes `self.id_to_file` to a '-title-info.pkl' file; presumably
        # title metadata was intended instead - TODO confirm or delete the cell.
        with open(name + '-title-info.pkl', 'wb') as f:
            pickle.dump(self.id_to_file, f, pickle.HIGHEST_PROTOCOL)
    

In [29]:
# Display the catalog frame (pandas truncates the rendering).
openpecha_titles

Unnamed: 0,ID,Title,Volume,Author,Source ID
0,P000001,,,,
1,P000002,,,,
2,P000003,,,,bdr:W22083
3,P000004,བཀའ་འགྱུར། ༼ཧེ་མིས་བྲིས་མ།༽,,,bdr:W2KG210298
4,P000005,བཀའ་འགྱུར། ༼ཕུག་བྲག་བྲིས་མ༽,,,bdr:W2KG210295
...,...,...,...,...,...
4320,P010800,སྟག་འབུམ་རྒྱལ་གྱི་སྒྲུང་གཏམ་ལ་དཔྱད་པ།,,,bdr:W1KG25503
4321,P010801,ཤེར་ཕྱིན་བརྒྱད་སྟོང་པའི་བཤད་པ་མངོན་རྟོགས་རྒྱན་...,,,bdr:W3CN4869
4322,P010802,དྲིན་ལན་བསབ་པའི་མདོ།,,,bdr:W3CN8173
4323,P010803,དཔལ་ཀྱེ་རྡོ་རྗེའི་རྒྱུད་ཀྱི་བཤད་པ་བཀའི་བརྒྱ་པ་...,,,bdr:W29902


In [24]:
import glob

# Collect, as a list, every path below the tokens directory that matches `*.*`.
files = glob.glob('/Users/upstairs/dev/Padma-Tokens/tokens/' + '**/*.*', recursive=True)

In [27]:
# Spot-check one of the collected paths.
files[1000]

'/Users/upstairs/dev/Padma-Tokens/tokens/P008105/v036.txt.gz'

In [4]:
from sqlitedict import SqliteDict
# Copy every word entry of `temp_index.final_index` into an on-disk SqliteDict.
# NOTE(review): `temp_index` is only assigned in a commented-out line of an
# earlier cell, and `index` rebinds the name used for the BuildPadmaIndex
# instance above; the SqliteDict is also never closed.
index = SqliteDict('index.sqlite', autocommit=True)
for key in temp_index.final_index:
    index[key] = temp_index.final_index[key]

In [65]:
# Load a previously stored index; 'Padma' is the filename prefix.
loaded_index = BuildPadmaIndex(read_from_file='Padma')

In [None]:
# Removed the incomplete expression `index.` (leftover from tab completion); it was a SyntaxError.

In [71]:
# Size of the stored main index as reported by `_get_obj_size`.
# NOTE(review): `_read_from_file` is called here as a module-level function with a
# full filename, but no such function is defined in the visible cells, and
# `_get_obj_size` is defined in a later cell - this fails on Restart & Run All.
_get_obj_size(_read_from_file('Padma-main_index.pkl'))

3369140

In [61]:
# Inspect the stored id -> file mapping.
# NOTE(review): relies on a module-level `_read_from_file` that is not defined in the visible cells.
_read_from_file('Padma-id_to_file.pkl')

{0: 'Terdzo-TI-046-1.txt',
 1: 'Terdzo-PHI-063.txt',
 2: 'Terdzo-BI-033.txt',
 3: 'Terdzo-ZHI-038.txt',
 4: 'Terdzo-TSA-013.txt',
 5: 'Terdzo-CI-027.txt',
 6: 'Terdzo-BI-027.txt',
 7: 'Terdzo-TSA-007.txt',
 8: 'Terdzo-PHI-077.txt',
 9: 'Terdzo-ZHI-004.txt'}

In [47]:
def _get_obj_size(obj):

    import gc
    import sys

    marked = {id(obj)}
    obj_q = [obj]
    sz = 0

    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))

        # Lookup all the object referred to by the object in obj_q.
        # See: https://docs.python.org/3.7/library/gc.html#gc.get_referents
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))

        # Filter object that are already marked.
        # Using dict notation will prevent repeated objects.
        new_refr = {o_id: o for o_id, o in all_refr if o_id not in marked and not isinstance(o, type)}

        # The new obj_q will be the ones that were not marked,
        # and we will update marked with their ids so we will
        # not traverse them again.
        obj_q = new_refr.values()
        marked.update(new_refr.keys())

    return sz