In [2]:
class BuildPadmaIndex:

    '''
    Inverted index over the whitespace-separated token files in a directory.

    After construction the instance exposes:

    final_index | dict | {token: {file_id: [positions]}}
    id_to_file | dict | {file_id: file name}
    file_to_id | dict | {file name: file_id}
    word_set | set | every token that has an entry in `final_index`
    '''

    def __init__(self, read_from_file=None, max_files=False, tokens_dir='/tmp/tokens/'):

        '''
        read_from_file | str | either None (build the index from the token files)
                               or the prefix of the filenames used when the index
                               was stored with `save_to_file`
        max_files | int | index only the first `max_files` token files; False
                          (the default) indexes all of them
        tokens_dir | str | directory that holds the token files

        Raises ValueError when `read_from_file` is neither None nor a string.
        '''

        self._max_files = max_files
        self._tokens_dir = tokens_dir

        if isinstance(read_from_file, str):
            self._read_from_file(read_from_file)
        elif read_from_file is None:
            self._generate_text_index()
        else:
            raise ValueError('`read_from_file` must be either None or a string that points to a file name.')

    def _read_from_file(self, name):

        '''
        Restores the index from the two pickle files written by `save_to_file`.

        name | str | filename prefix; '-main_index.pkl' and '-id_to_file.pkl'
                     are appended to it
        '''

        import pickle

        # NOTE: pickle.load can execute arbitrary code, so only load index
        # files that come from a trusted source.
        with open(name + '-main_index.pkl', 'rb') as f:
            self.final_index = pickle.load(f)

        with open(name + '-id_to_file.pkl', 'rb') as f:
            self.id_to_file = pickle.load(f)

        # rebuild the derived lookups so that a loaded index exposes the same
        # attributes as a freshly generated one
        self.file_to_id = {file: i for i, file in self.id_to_file.items()}
        self.word_set = set(self.final_index)

    def _generate_text_index(self):

        '''
        Builds `final_index`, `file_to_id`, `id_to_file` and `word_set` from
        the files found in the tokens directory.
        '''

        import os

        # the progress bar is cosmetic, so fall back to a plain loop without it
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(iterable):
                return iterable

        # get names of files for tokens; sorted because os.listdir returns
        # names in arbitrary order and the file ids (and the `max_files`
        # subset) should be the same on every run
        files = sorted(os.listdir(self._tokens_dir))

        if self._max_files is not False:
            files = files[:self._max_files]

        # read tokens into memory
        tokens = {}
        for file in files:
            with open(os.path.join(self._tokens_dir, file), 'r') as f:
                tokens[file] = f.read().split()

        # create file-to-id indexes
        self.file_to_id = {file: i for i, file in enumerate(files)}
        self.id_to_file = {i: file for i, file in enumerate(files)}

        # create set of all unique tokens
        self.word_set = set()
        for file in files:
            self.word_set.update(tokens[file])

        # create key values
        self.final_index = {word: {} for word in self.word_set}

        # create values
        for file in tqdm(files):
            file_id = self.file_to_id[file]

            # Positions are indexes into the '_'-delimited fragments of the
            # concatenated tokens (not token offsets), and a fragment matches
            # when it contains the word as a substring. The fragments depend
            # only on the file, so they are computed once per file.
            fragments = ''.join(tokens[file]).split('_')

            for word in set(tokens[file]):
                self.final_index[word][file_id] = [
                    i for i, fragment in enumerate(fragments) if word in fragment
                ]

    def word_to_text(self, word):

        '''
        Returns a list of [file name, full text] pairs, one for every file
        that has an entry for `word`.

        Raises KeyError when `word` is not in the index.
        '''

        import os

        out = []

        for text_id in self.final_index[word]:

            file = self.id_to_file[text_id]
            with open(os.path.join(self._tokens_dir, file), 'r') as f:
                out.append([file, f.read()])

        return out

    def word_to_location(self, word):

        '''
        Returns one list per file that has an entry for `word`; each of them
        holds a [file_id, position] pair for every recorded position.

        Raises KeyError when `word` is not in the index.
        '''

        return [
            [[text_id, i] for i in positions]
            for text_id, positions in self.final_index[word].items()
        ]

    def save_to_file(self, name):

        '''
        Pickles `final_index` to name + '-main_index.pkl' and `id_to_file` to
        name + '-id_to_file.pkl'.

        name | str | filename prefix, as accepted by `read_from_file`
        '''

        import pickle

        with open(name + '-main_index.pkl', 'wb') as f:
            pickle.dump(self.final_index, f, pickle.HIGHEST_PROTOCOL)

        with open(name + '-id_to_file.pkl', 'wb') as f:
            pickle.dump(self.id_to_file, f, pickle.HIGHEST_PROTOCOL)
            
# One-off rebuild of the index from the token files, kept for reference:
#index = BuildPadmaIndex(read_from_file=None)
#index.save_to_file('Padma-Index')

# Load the pickled index instead of rebuilding it; this reads
# 'Padma-Index-main_index.pkl' and 'Padma-Index-id_to_file.pkl'.
temp_index = BuildPadmaIndex(read_from_file='Padma-Index')

In [4]:
from sqlitedict import SqliteDict

# Mirror the in-memory index into an SQLite-backed dict on disk.
# A single bulk `update` writes every entry and commits once, instead of
# triggering one autocommit per key as a per-item assignment loop would.
index = SqliteDict('index.sqlite', autocommit=True)
index.update(temp_index.final_index)

In [5]:
# List the working directory (long format, human-readable sizes, oldest first),
# e.g. to compare the size of the pickled index with the SQLite file.
ls -lhtr

total 526368
-rw-r--r--   1 upstairs  staff   604B Sep 11 20:46 Dockerfile
-rw-r--r--   1 upstairs  staff   1.0K Sep 11 20:46 LICENSE
drwxr-xr-x  14 upstairs  staff   448B Sep 11 20:46 [34mdocs[m[m/
-rw-r--r--   1 upstairs  staff    86B Sep 11 20:46 server.py
drwxr-xr-x   5 upstairs  staff   160B Sep 11 20:46 [34mtests[m[m/
drwxr-xr-x   8 upstairs  staff   256B Sep 12 19:41 [34mapp[m[m/
-rw-r--r--   1 upstairs  staff    44B Sep 24 19:30 requirements.txt
-rw-r--r--   1 upstairs  staff   1.8K Sep 24 20:00 setup.py
-rw-r--r--   1 upstairs  staff   706B Sep 25 16:23 README.md
-rw-r--r--   1 upstairs  staff   116M Sep 28 23:13 Padma-Index-main_index.pkl
-rw-r--r--   1 upstairs  staff    71K Sep 28 23:13 Padma-Index-id_to_file.pkl
-rw-r--r--   1 upstairs  staff   297K Sep 28 23:24 Padma-Backend dev.ipynb
-rw-r--r--   1 upstairs  staff   123M Sep 28 23:26 index.sqlite


In [None]:
# Scratch notes on the SqliteDict API, converted to Python 3 syntax.
# `mydict` and `any_picklable_object` are placeholders, not names defined
# in this notebook.
mydict['some_key'] = any_picklable_object
print(mydict['some_key'])  # prints the new value
for key, value in mydict.items():
    print(key, value)
print(len(mydict))  # etc... all dict functions work
mydict.close()

In [65]:
# Load the index stored under the 'Padma' prefix, i.e. from
# 'Padma-main_index.pkl' and 'Padma-id_to_file.pkl'.
loaded_index = BuildPadmaIndex(read_from_file='Padma')

In [66]:
# NOTE(review): this dumps the whole {token: {file_id: [positions]}} mapping.
# `index` is bound to different objects in different cells (a SqliteDict in
# one, a BuildPadmaIndex in others), so what this shows depends on which
# cell ran last - confirm after a fresh Restart & Run All.
index.final_index

{'ལྗང་གུ་': {3: [761], 4: [5814]},
 'རྒྱ་མཚལ་': {3: [416]},
 'རྔམ་པ་': {8: [2412]},
 'ཐབས་': {4: [474, 2698, 7515],
  6: [196, 1120, 2293, 2657],
  8: [195, 4354],
  9: [703, 942]},
 'བདག་བསྐྱེད་': {3: [1256], 4: [2035, 2367, 3446], 5: [1762], 9: [155]},
 'སྨན་': {3: [737],
  4: [972, 1285, 6044, 6326],
  5: [59],
  8: [3590],
  9: [103, 1255, 4989]},
 'རྣམས': {2: [757],
  3: [3940],
  4: [3006],
  5: [1490, 1558, 1840, 2091, 2171],
  8: [736, 3210, 4491, 4754, 5523],
  9: [726, 2340, 2468, 4823]},
 'ཕག་': {4: [5630]},
 'བུམ་པ་': {0: [235, 484, 495], 3: [1617], 4: [6495], 8: [1185], 9: [4104]},
 'ཕྱག་རྡོར་': {3: [507, 1252], 4: [2580, 5151]},
 'ལམ་': {0: [101],
  2: [531],
  3: [80, 917, 2714],
  4: [1764,
   2732,
   3172,
   3274,
   3362,
   3406,
   3488,
   3496,
   3837,
   3960,
   4022,
   6638,
   6812,
   7692,
   7728,
   8966,
   9056,
   9871,
   9934,
   10719],
  6: [783],
  8: [31],
  9: [2166, 2465]},
 'ཞབས་བསིལ': {4: [1215]},
 'འབུམ': {9: [2746]},
 'འབུམ་': {3: [2374]

In [68]:
# Display the mapping restored from the 'Padma' pickle files, presumably to
# compare it by eye with the `index.final_index` output - TODO confirm and
# replace with an equality assertion.
loaded_index.final_index

{'ལྗང་གུ་': {3: [761], 4: [5814]},
 'རྒྱ་མཚལ་': {3: [416]},
 'རྔམ་པ་': {8: [2412]},
 'ཐབས་': {4: [474, 2698, 7515],
  6: [196, 1120, 2293, 2657],
  8: [195, 4354],
  9: [703, 942]},
 'བདག་བསྐྱེད་': {3: [1256], 4: [2035, 2367, 3446], 5: [1762], 9: [155]},
 'སྨན་': {3: [737],
  4: [972, 1285, 6044, 6326],
  5: [59],
  8: [3590],
  9: [103, 1255, 4989]},
 'རྣམས': {2: [757],
  3: [3940],
  4: [3006],
  5: [1490, 1558, 1840, 2091, 2171],
  8: [736, 3210, 4491, 4754, 5523],
  9: [726, 2340, 2468, 4823]},
 'ཕག་': {4: [5630]},
 'བུམ་པ་': {0: [235, 484, 495], 3: [1617], 4: [6495], 8: [1185], 9: [4104]},
 'ཕྱག་རྡོར་': {3: [507, 1252], 4: [2580, 5151]},
 'ལམ་': {0: [101],
  2: [531],
  3: [80, 917, 2714],
  4: [1764,
   2732,
   3172,
   3274,
   3362,
   3406,
   3488,
   3496,
   3837,
   3960,
   4022,
   6638,
   6812,
   7692,
   7728,
   8966,
   9056,
   9871,
   9934,
   10719],
  6: [783],
  8: [31],
  9: [2166, 2465]},
 'ཞབས་བསིལ': {4: [1215]},
 'འབུམ': {9: [2746]},
 'འབུམ་': {3: [2374]

In [50]:
def _read_from_file(name):

    import pickle

    with open(name, 'rb') as f:
        return pickle.load(f)

In [71]:
# Size reported by `_get_obj_size` for the unpickled main index.
# NOTE(review): `_get_obj_size` is defined in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run.
_get_obj_size(_read_from_file('Padma-main_index.pkl'))

3369140

In [61]:
# Inspect the pickled {file_id: file name} mapping that belongs to the
# 'Padma' index.
_read_from_file('Padma-id_to_file.pkl')

{0: 'Terdzo-TI-046-1.txt',
 1: 'Terdzo-PHI-063.txt',
 2: 'Terdzo-BI-033.txt',
 3: 'Terdzo-ZHI-038.txt',
 4: 'Terdzo-TSA-013.txt',
 5: 'Terdzo-CI-027.txt',
 6: 'Terdzo-BI-027.txt',
 7: 'Terdzo-TSA-007.txt',
 8: 'Terdzo-PHI-077.txt',
 9: 'Terdzo-ZHI-004.txt'}

In [47]:
def _get_obj_size(obj):

    import gc
    import sys

    marked = {id(obj)}
    obj_q = [obj]
    sz = 0

    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))

        # Lookup all the object referred to by the object in obj_q.
        # See: https://docs.python.org/3.7/library/gc.html#gc.get_referents
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))

        # Filter object that are already marked.
        # Using dict notation will prevent repeated objects.
        new_refr = {o_id: o for o_id, o in all_refr if o_id not in marked and not isinstance(o, type)}

        # The new obj_q will be the ones that were not marked,
        # and we will update marked with their ids so we will
        # not traverse them again.
        obj_q = new_refr.values()
        marked.update(new_refr.keys())

    return sz

In [40]:
# NOTE(review): `BuildPadmaIndex` treats `read_from_file` as a filename
# prefix and appends '-main_index.pkl' / '-id_to_file.pkl' to it, so a value
# that already ends in '.pkl' presumably does not resolve to the saved
# files - confirm and pass the bare prefix. The absolute home-directory
# path also ties this cell to one machine.
index = BuildPadmaIndex(read_from_file='/Users/upstairs/dev/Padma-Index/Padma-Index.pkl')

UnsupportedOperation: read

In [10]:
# The cell was saved as an incomplete expression (`index.`). The recorded
# {file_id: [positions]} output has the same ids and positions as
# `index.word_to_location('ཟུ་')`, so this is presumably the lookup that
# produced it - TODO confirm.
index.final_index['ཟུ་']

{260: [6282],
 400: [6535],
 457: [22705],
 475: [2979],
 492: [2246, 2465],
 539: [419],
 555: [7429],
 609: [339, 387, 1899, 1952, 3061],
 690: [3262],
 831: [194],
 845: [2770],
 906: [3071],
 1038: [926],
 1055: [6073],
 1163: [860],
 1192: [1153],
 1311: [10592],
 1349: [110],
 1398: [4139],
 1416: [7561],
 1515: [711, 4147, 5554],
 1517: [8082, 15035, 18565],
 1541: [645],
 1606: [11073],
 1621: [1416,
  1478,
  5105,
  5217,
  5233,
  5332,
  6055,
  10285,
  12397,
  18910,
  27802,
  32302,
  32322,
  32388,
  32394,
  32534,
  32557,
  32596,
  34280,
  34572],
 1706: [370],
 1783: [767, 819, 883, 942, 987, 1856, 1903],
 1805: [5329],
 1836: [11063],
 1863: [5716, 6266],
 1870: [688],
 1949: [99, 451],
 1956: [973, 975],
 2002: [1715],
 2034: [8174],
 2081: [1137, 2773, 2849, 2927],
 2164: [475],
 2191: [3904, 4018, 6241],
 2271: [10881],
 2306: [4538, 4737, 4854],
 2312: [4465],
 2315: [1123, 1499, 1542, 3246, 4198, 4347, 4644, 4792],
 2459: [2302],
 2697: [2762, 2763],
 281

In [None]:
# [file_id, position] pairs for the token, grouped into one list per file.
# NOTE(review): same call as another cell in this notebook; this copy has
# no execution count.
index.word_to_location('ཟུ་')

In [13]:
# `word_to_text` looks up file names in `index.id_to_file` and re-reads each
# matching token file from disk.
# NOTE(review): the recorded AttributeError means this `index` object had no
# `id_to_file`; presumably it was created by an earlier version of the
# class - re-create `index` and re-run.
index.word_to_text('ཟུ་')

AttributeError: 'BuildPadmaIndex' object has no attribute 'id_to_file'

In [12]:
# [file_id, position] pairs for the token, grouped into one list per file
# that has an entry for it.
index.word_to_location('ཟུ་')

[[[260, 6282]],
 [[400, 6535]],
 [[457, 22705]],
 [[475, 2979]],
 [[492, 2246], [492, 2465]],
 [[539, 419]],
 [[555, 7429]],
 [[609, 339], [609, 387], [609, 1899], [609, 1952], [609, 3061]],
 [[690, 3262]],
 [[831, 194]],
 [[845, 2770]],
 [[906, 3071]],
 [[1038, 926]],
 [[1055, 6073]],
 [[1163, 860]],
 [[1192, 1153]],
 [[1311, 10592]],
 [[1349, 110]],
 [[1398, 4139]],
 [[1416, 7561]],
 [[1515, 711], [1515, 4147], [1515, 5554]],
 [[1517, 8082], [1517, 15035], [1517, 18565]],
 [[1541, 645]],
 [[1606, 11073]],
 [[1621, 1416],
  [1621, 1478],
  [1621, 5105],
  [1621, 5217],
  [1621, 5233],
  [1621, 5332],
  [1621, 6055],
  [1621, 10285],
  [1621, 12397],
  [1621, 18910],
  [1621, 27802],
  [1621, 32302],
  [1621, 32322],
  [1621, 32388],
  [1621, 32394],
  [1621, 32534],
  [1621, 32557],
  [1621, 32596],
  [1621, 34280],
  [1621, 34572]],
 [[1706, 370]],
 [[1783, 767],
  [1783, 819],
  [1783, 883],
  [1783, 942],
  [1783, 987],
  [1783, 1856],
  [1783, 1903]],
 [[1805, 5329]],
 [[1836, 110