In [1]:
import re
import os

class InvertedIndex:
    """In-memory inverted index mapping terms to per-document occurrence counts.

    Text is lower-cased, split into ``\\w+`` word tokens and filtered against
    ``self.stopwords`` before it is indexed. Documents are addressed by an
    opaque ``doc_id``; search results are reported by document *title*.

    Note:
        ``search`` and ``print_index`` build dicts keyed by title, so two
        documents that share a title collapse into a single entry there.
    """

    def __init__(self):
        # term -> {doc_id: number of occurrences of the term in that document}
        self.index = {}
        # doc_id -> raw document text (re-tokenized when the document is removed)
        self.documents = {}
        # doc_id -> title used as the key in search results
        self.document_titles = {}
        # Lower-case words that are dropped during tokenization.
        self.stopwords = {
            "the", "is", "in", "and", "to", "a", "of", "on", "with", "it",
            "for", "as", "by", "an", "can", "from",
        }

    def create_indexer(self, doc_id, title, text):
        """Add a document to the index, replacing any document with the same id.

        Args:
            doc_id: Hashable identifier of the document.
            title: Title reported by ``search`` / ``print_index``.
            text: Raw document text.
        """
        # Tokenize before touching any state so that a bad ``text`` value
        # raises without leaving the index half-updated.
        tokens = self._tokenize(text)
        if doc_id in self.documents:
            self.remove_document(doc_id)

        self.documents[doc_id] = text
        self.document_titles[doc_id] = title

        for token in tokens:
            postings = self.index.setdefault(token, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1

    @staticmethod
    def _document_id(file_path):
        """Return a stable id for a file: its normalized absolute path.

        The builtin ``hash()`` of the raw path string is unsuitable as an id:
        different spellings of the same file hash differently (so a re-add
        would duplicate the document instead of updating it), string hashes
        are salted per interpreter process, and distinct paths may collide.
        """
        return os.path.normcase(os.path.abspath(file_path))

    def add_single_document(self, file_path):
        """Read a UTF-8 text file and index it.

        The title is the file name without its extension. Adding the same
        file again (under any spelling of its path) replaces the previous
        entry.

        Args:
            file_path: Path of the file to index.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        title = os.path.splitext(os.path.basename(file_path))[0]
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        self.create_indexer(self._document_id(file_path), title, content)

    def add_documents_from_directory(self, directory_path):
        """Index every regular ``.txt`` file directly inside ``directory_path``.

        Files are processed in sorted name order so the resulting index
        layout does not depend on the operating system's listing order.
        Sub-directories are not descended into.
        """
        for file_name in sorted(os.listdir(directory_path)):
            if not file_name.endswith(".txt"):
                continue
            file_path = os.path.join(directory_path, file_name)
            # Skip directories (or other non-files) that merely end in ".txt".
            if os.path.isfile(file_path):
                self.add_single_document(file_path)

    def _case_folding(self, text):
        """Convert text to lowercase."""
        return text.lower()

    def _tokenization(self, text):
        """Split text into a list of ``\\w+`` word tokens."""
        return re.findall(r'\b\w+\b', text)

    def _remove_stopwords(self, words):
        """Return ``words`` without the entries found in ``self.stopwords``."""
        return [word for word in words if word not in self.stopwords]

    def _tokenize(self, text):
        """Run the full pipeline: case folding, tokenization, stopword removal."""
        lower_text = self._case_folding(text)
        words = self._tokenization(lower_text)
        return self._remove_stopwords(words)

    def search(self, word):
        """Look up a single word (case-insensitive).

        Args:
            word: The term to look up. It is lower-cased but not otherwise
                tokenized, so multi-word queries and stopwords find nothing.

        Returns:
            dict: ``{title: occurrence count}`` for every document containing
            the word; empty when the word is not indexed.
        """
        postings = self.index.get(self._case_folding(word))
        if not postings:
            return {}
        return {self.document_titles[doc_id]: count for doc_id, count in postings.items()}

    def remove_document(self, doc_id):
        """Remove a document and all of its postings; unknown ids are ignored."""
        if doc_id not in self.documents:
            return
        # Each distinct term only needs to be visited once.
        for token in set(self._tokenize(self.documents[doc_id])):
            postings = self.index.get(token)
            if postings is None or doc_id not in postings:
                continue
            del postings[doc_id]
            if not postings:
                # Drop terms that no longer occur in any document.
                del self.index[token]
        del self.documents[doc_id]
        del self.document_titles[doc_id]

    def print_index(self):
        """Print a ``Final Index:`` header followed by one ``term: {title: count}`` line per term."""
        print("Final Index:")
        for word, doc_data in self.index.items():
            doc_info = {self.document_titles[doc_id]: count for doc_id, count in doc_data.items()}
            print(f"{word}: {doc_info}")

    def search_document_by_title(self, title):
        """Return the text of the first document whose title matches, ignoring case.

        Returns:
            str: The document text, or the message
            ``"Document with title '<title>' not found."`` when nothing matches.
        """
        wanted = title.lower()
        for doc_id, doc_title in self.document_titles.items():
            if doc_title.lower() == wanted:
                return self.documents[doc_id]
        return f"Document with title '{title}' not found."
    

# if __name__ == "__main__":

In [2]:
# Create an empty index; the following cells add documents to it and query it.
index = InvertedIndex()

In [3]:
# Index every .txt file found in the documents directory
directory_path = "./Documents_01"
index.add_documents_from_directory(directory_path)

# Look up a handful of sample terms and show which titles contain them
for term in ("fox", "python", "exercise", "adventure"):
    print(f"Search results for '{term}':", index.search(term))

Search results for 'fox': {'Nature and Wildlife': 1, 'Space Exploration': 1}
Search results for 'python': {'Technology and Programming': 1}
Search results for 'exercise': {'Health and Fitness': 2}
Search results for 'adventure': {'Travel and Adventure': 1}


In [4]:
# Index one file on its own. create_indexer replaces any entry that is already
# stored under the same document id, so the counts below are not doubled.
index.add_single_document("./Documents_01/Space Exploration.txt")
for term in ("fox", "space"):
    print(f"Search results for '{term}':", index.search(term))

Search results for 'fox': {'Nature and Wildlife': 1, 'Space Exploration': 1}
Search results for 'space': {'Space Exploration': 1}


In [5]:
# Print the entire index. print_index() emits its own "Final Index:" header,
# so no extra heading is printed here (it used to appear twice in the output).
index.print_index()


Final Index:
Final Index:
healthy: {'Health and Fitness': 1}
lifestyle: {'Health and Fitness': 1}
includes: {'Health and Fitness': 1}
regular: {'Health and Fitness': 1}
exercise: {'Health and Fitness': 2}
balanced: {'Health and Fitness': 1}
diet: {'Health and Fitness': 1}
sufficient: {'Health and Fitness': 1}
sleep: {'Health and Fitness': 1}
help: {'Health and Fitness': 1}
reduce: {'Health and Fitness': 1}
stress: {'Health and Fitness': 1}
improve: {'Health and Fitness': 1}
overall: {'Health and Fitness': 1}
well: {'Health and Fitness': 1}
being: {'Health and Fitness': 1}
quick: {'Nature and Wildlife': 1}
brown: {'Nature and Wildlife': 1}
fox: {'Nature and Wildlife': 1, 'Space Exploration': 1}
jumps: {'Nature and Wildlife': 1}
over: {'Nature and Wildlife': 1}
lazy: {'Nature and Wildlife': 1}
dog: {'Nature and Wildlife': 1}
forest: {'Nature and Wildlife': 1}
full: {'Nature and Wildlife': 1}
wonderful: {'Nature and Wildlife': 1}
creatures: {'Nature and Wildlife': 1}
including: {'Nature 

In [6]:
# Fetch the stored text of one document by its title (matching ignores case)
doc_title = "Nature and Wildlife"  # change this to the title you want to look up
document_content = index.search_document_by_title(doc_title)
print(f"Content of '{doc_title}':", document_content, sep="\n")

Content of 'Nature and Wildlife':
The quick brown fox jumps over the lazy dog. The forest is full of wonderful creatures, including foxes and deer.

