In [1]:
import os

class EmptyDirectoryError(Exception):
    """Raised when the directory to be indexed contains no regular files.

    Derives from ``Exception`` rather than ``BaseException`` so that ordinary
    ``except Exception`` handlers see it; ``BaseException`` is reserved for
    interpreter-level exits such as ``KeyboardInterrupt`` and ``SystemExit``.
    """

class DirectoryIndex:
    """Inverted index over the words of the text files in one directory.

    Words are whitespace-separated, lower-cased tokens. Every distinct word
    gets an integer id, and the index maps that id to the names of the files
    that contain the word. The index is empty until ``inverted_index`` is
    accessed or ``update`` is called.
    """

    def __init__(self, dirpath: str, encoding: str='utf-8'):
        """Remember where the files live; no file is read at this point.

        Args:
            dirpath: Directory whose regular files are indexed.
            encoding: Text encoding used when the directory is scanned.
        """
        self.dirpath = dirpath
        self.encoding = encoding
        self.__unique_words_dict = None  # word -> integer word id
        self.__word_in_text = None       # set of (word id, file name) pairs
        self.__index = None              # word id -> list of file names

    def _read_file(self, file_name, encoding=None):
        """Return the set of lower-cased words found in one file.

        Args:
            file_name: File name, resolved relative to ``self.dirpath``.
            encoding: Text encoding; falls back to ``self.encoding``.
        """
        # os.path.join works whether or not dirpath ends with a separator.
        path = os.path.join(self.dirpath, file_name)
        unique_words = set()
        with open(path, "r", encoding=encoding or self.encoding) as file:
            for line in file:
                unique_words.update(line.lower().split())
        return unique_words

    def find_documents(self, word: str) -> list:
        """Return the names of the indexed files that contain ``word``.

        The lookup is case-insensitive because indexed words are lower-cased.
        An unknown word, a non-string argument, or an index that has not been
        built yet all yield an empty list.
        """
        if self.__index is None or not isinstance(word, str):
            return []
        word_id = self.__unique_words_dict.get(word.lower())
        if word_id is None:
            return []
        # Hand out a copy so callers cannot corrupt the index by mutating it.
        return list(self.__index.get(word_id, ()))

    def update(self, filepath: str, encoding: str='utf-8'):
        """Add the words of one more file to the index.

        Args:
            filepath: File name, resolved relative to ``self.dirpath``; this
                is also the value stored in the index.
            encoding: Text encoding used to read the file.

        If the index has not been built yet, it starts out empty instead of
        failing. Indexing the same file twice does not create duplicates.
        """
        # Read first so a failing read leaves the index untouched.
        words = self._read_file(filepath, encoding)
        if self.__index is None:
            self.__unique_words_dict = {}
            self.__word_in_text = set()
            self.__index = {}
        word_ids = self.__unique_words_dict
        for word in words:
            # Ids are contiguous, so the current size is the next free id.
            word_id = word_ids.setdefault(word, len(word_ids))
            pair = (word_id, filepath)
            if pair not in self.__word_in_text:
                self.__word_in_text.add(pair)
                self.__index.setdefault(word_id, []).append(filepath)

    @property
    def inverted_index(self):
        """Rebuild the index from every regular file in ``self.dirpath``.

        Accessing this property is what triggers the build; it returns
        ``None`` and the result is queried through ``find_documents``.
        Sub-directories are skipped. Files are processed in sorted name
        order so that posting lists are deterministic.

        Raises:
            EmptyDirectoryError: If the directory contains no regular files.
        """
        file_names = sorted(
            name for name in os.listdir(self.dirpath)
            if os.path.isfile(os.path.join(self.dirpath, name))
        )
        if not file_names:
            raise EmptyDirectoryError(
                f"no files to index in {self.dirpath!r}"
            )

        # Build into locals and publish at the end, so a read error halfway
        # through does not leave a half-built index behind.
        word_ids = {}
        word_in_text = set()
        index = {}
        for name in file_names:
            # Each file is read once; _read_file already de-duplicates words.
            for word in self._read_file(name):
                word_id = word_ids.setdefault(word, len(word_ids))
                word_in_text.add((word_id, name))
                index.setdefault(word_id, []).append(name)

        self.__unique_words_dict = word_ids
        self.__word_in_text = word_in_text
        self.__index = index
            

In [8]:
# Create an index object for the ../dataset/ directory; nothing is read yet.
dindex = DirectoryIndex("../dataset/")

In [9]:
# The property is accessed only for its side effect: it scans the directory
# and builds the index that find_documents() queries.
dindex.inverted_index

In [10]:
# Names of the files that contain the word 'shakespeare'.
arr = dindex.find_documents('shakespeare')
# int() is applied to every hit, so the file names must be integer-like.
sorted(map(int, arr))

[0, 1, 2, 3, 7, 12, 15, 19, 20, 30, 33, 34, 39, 44]

In [None]:
# Index one more file; the name "bom" is resolved relative to the index directory.
dindex.update("bom")

In [None]:
# Look up which indexed files contain the word 'bom'.
dindex.find_documents('bom')