In [None]:
import os
from collections import defaultdict

class IndexSegment:
    """One positional inverted-index segment.

    ``index`` maps each lower-cased, whitespace-delimited term to a list of
    ``(doc_id, position)`` postings, where ``position`` is the zero-based
    offset of the term within its document.
    """

    def __init__(self):
        """Create an empty segment."""
        self.index = defaultdict(list)

    def add_document(self, doc_id, content):
        """Record a posting for every term occurrence in ``content``.

        Args:
            doc_id: Identifier stored alongside each position.
            content: Document text; it is lower-cased and split on whitespace.
        """
        postings_by_term = self.index
        for offset, token in enumerate(content.lower().split()):
            postings_by_term[token].append((doc_id, offset))

    def merge(self, other):
        """Append all postings of ``other`` to this segment, in place.

        Args:
            other: Segment whose postings are copied; it is not modified.

        Returns:
            This segment (``self``), so calls can be chained.
        """
        for term in other.index:
            self.index[term] += other.index[term]
        return self


class DynamicIndex:
    """Inverted index made of an in-memory buffer plus flushed segments.

    New documents are indexed into ``buffer`` (an ``IndexSegment``). Once
    ``buffer_size`` documents have been added, the buffer is appended to
    ``segments`` and a fresh buffer is started. After each flush, runs of
    ``merge_threshold`` adjacent segments are merged into one.
    """

    def __init__(self, merge_threshold=2, buffer_size=5):
        """Create an empty index.

        Args:
            merge_threshold: Number of adjacent segments combined per merge.
            buffer_size: Number of buffered documents that triggers a flush.
        """
        self.segments = []
        self.merge_threshold = merge_threshold
        self.buffer_size = buffer_size
        self.buffer = IndexSegment()
        self.buffer_count = 0

    def add_document(self, doc_id, content):
        """Index a document into the buffer, flushing when the buffer is full.

        Args:
            doc_id: Identifier stored in every posting of the document.
            content: Document text.
        """
        self.buffer.add_document(doc_id, content)
        self.buffer_count += 1

        if self.buffer_count >= self.buffer_size:
            self._flush_buffer()

    def _flush_buffer(self):
        """Move a non-empty buffer into ``segments`` and merge segments."""
        if self.buffer_count > 0:
            self.segments.append(self.buffer)
            self.buffer = IndexSegment()
            self.buffer_count = 0
            self._merge_segments()

    def _merge_segments(self):
        """Merge runs of ``merge_threshold`` adjacent segments in place.

        Walking left to right, whenever at least ``merge_threshold`` segments
        remain from position ``i`` onward, those segments are folded into the
        one at ``i`` and the absorbed segments are removed from the list.
        """
        i = 0
        while i < len(self.segments) - 1:
            if len(self.segments) - i >= self.merge_threshold:
                merged = self.segments[i]
                for j in range(1, self.merge_threshold):
                    merged = merged.merge(self.segments[i + j])
                self.segments[i] = merged
                del self.segments[i + 1 : i + self.merge_threshold]
            i += 1

    def search(self, query):
        """Return all postings for ``query`` across segments and the buffer.

        Args:
            query: A single term; it is lower-cased before lookup.

        Returns:
            A new list of ``(doc_id, position)`` tuples, segments first (in
            list order) followed by the buffer.
        """
        term = query.lower()
        results = []
        for segment in self.segments:
            results.extend(segment.index.get(term, []))

        results.extend(self.buffer.index.get(term, []))
        return results

    @staticmethod
    def _purge_document(segment, doc_id):
        """Remove every posting of ``doc_id`` from ``segment``.

        Terms whose posting list becomes empty are dropped so stale keys do
        not accumulate in the index.

        Returns:
            True if at least one posting was removed, otherwise False.
        """
        removed_any = False
        kept = defaultdict(list)
        for term, postings in segment.index.items():
            remaining = [(d, p) for d, p in postings if d != doc_id]
            if len(remaining) != len(postings):
                removed_any = True
            if remaining:
                kept[term] = remaining
        segment.index = kept
        return removed_any

    def update_document(self, doc_id, new_content):
        """Replace the indexed content of ``doc_id`` with ``new_content``.

        All existing postings of the document are removed from every segment
        and from the buffer, then the new content is added as a fresh
        document.

        Args:
            doc_id: Identifier of the document to replace.
            new_content: New document text.
        """
        for segment in self.segments:
            self._purge_document(segment, doc_id)

        # If the old version was still sitting in the buffer, it no longer
        # occupies a buffer slot; without this the re-add below would count
        # the same document twice toward buffer_size.
        if self._purge_document(self.buffer, doc_id) and self.buffer_count > 0:
            self.buffer_count -= 1

        self.add_document(doc_id, new_content)
def main():
    """Index four sample sentences and print the postings for "indexing"."""
    # Merge threshold of 10 and buffer size of 3, passed positionally.
    dynamic_index = DynamicIndex(10, 3)

    # Stand-in corpus; document ids are assigned 1..4 in this order.
    sample_texts = (
        "Dynamic indexing algorithms help optimize search.",
        "Search engines use indexing to retrieve documents efficiently.",
        "Indexing techniques vary based on data and usage patterns.",
        "Some indexing methods are dynamic, merging data over time.",
    )
    for doc_id, text in enumerate(sample_texts, start=1):
        dynamic_index.add_document(doc_id, text)

    # Look up a single term and show its (doc_id, position) postings.
    search_term = "indexing"
    result = dynamic_index.search(search_term)
    print(f"Documents containing '{search_term}': {result}")

# Run the demo only when this code is executed as a script, not on import.
if __name__ == "__main__":
    main()

Documents containing 'indexing': [(1, 1), (2, 3), (3, 0), (4, 1)]


**Explanation of every Method**



1.   __init__ (Constructor):
     
     Initializes the dynamic index with a merge threshold and buffer size. The buffer holds documents temporarily, while the index stores tokens and their associated document IDs.

2.   add_document(doc_id, content):
     
     Adds a document to the buffer. Each document is tokenized, and when the buffer reaches the buffer_size, it triggers the merge_into_index method.

3.   merge_into_index():
     
     Merges the contents of the buffer into the main index. Each token from the buffer is added to the index, and document IDs are stored in the index under the corresponding token. The buffer is cleared after merging.

4.   search(term):
     
     Searches for a term in the index and returns the list of document IDs containing that term.





**Explanation of merge_threshold and buffer_size**

1.   merge_threshold:
     
     The number of documents or updates after which the index is meant to be merged or optimized on a larger scale. Note that in the DynamicIndexing class below this value is stored but not yet used by any method; only buffer_size controls when the buffer is merged into the index.

2. buffer_size:
     
     The number of documents that can be held in memory (buffer) before they are written to disk (or, in this case, merged into the index). When the buffer reaches its size, we flush its contents into the main index.



In [None]:
import os

class DynamicIndexing:
    """Buffered inverted index mapping each token to the documents using it.

    Documents are tokenized on whitespace (case-sensitive) and held in
    ``buffer`` until ``buffer_size`` documents have accumulated, at which
    point they are merged into ``index``.
    """

    def __init__(self, merge_threshold, buffer_size):
        """Create an empty index.

        Args:
            merge_threshold: Stored on the instance; no method of this class
                currently consults it.
            buffer_size: Number of buffered documents that triggers a merge
                into the main index.
        """
        self.merge_threshold = merge_threshold
        self.buffer_size = buffer_size
        self.buffer = []  # Pending (doc_id, tokens) pairs not yet in the index
        self.index = {}  # token -> list of doc_ids, one entry per occurrence

    def add_document(self, doc_id, content):
        """
        Add a document to the buffer. When the buffer holds ``buffer_size``
        documents, merge the buffer into the index.
        """
        tokens = content.split()  # Simple whitespace tokenization
        self.buffer.append((doc_id, tokens))

        if len(self.buffer) >= self.buffer_size:
            self.merge_into_index()

    def merge_into_index(self):
        """
        Merge every buffered document into the main index, then empty the
        buffer. Each token occurrence appends the document ID once.
        """
        for doc_id, tokens in self.buffer:
            for token in tokens:
                self.index.setdefault(token, []).append(doc_id)
        self.buffer.clear()

    def search(self, term):
        """
        Search for a term and return the IDs of documents containing it.

        Both the main index and the documents still waiting in the buffer
        are consulted, so a document is searchable as soon as it is added.
        A document ID appears once per occurrence of the term. The returned
        list is a copy; mutating it does not affect the index.
        """
        matches = list(self.index.get(term, []))
        # Buffered documents have not reached self.index yet; scan them so
        # the most recently added documents are not silently missed.
        for doc_id, tokens in self.buffer:
            matches.extend([doc_id] * tokens.count(term))
        return matches

def read_files_from_directory(directory_path):
    """
    Walk ``directory_path`` and all of its subdirectories, loading every
    file whose name ends in ".txt" as UTF-8 text.

    Returns a dictionary mapping each file's path (the walked directory
    joined with the filename) to that file's full content.
    """
    contents_by_path = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        for name in names:
            # Anything that is not a .txt file is ignored.
            if not name.endswith(".txt"):
                continue
            full_path = os.path.join(current_dir, name)
            with open(full_path, 'r', encoding='utf-8') as handle:
                # The path doubles as the document identifier.
                contents_by_path[full_path] = handle.read()
    return contents_by_path

def main():
    """Index every .txt file under a fixed folder and answer term queries.

    Reads the documents, adds them to a ``DynamicIndexing`` instance, then
    prompts for how many terms to look up and prints the matching document
    IDs for each term entered.

    Raises:
        ValueError: If the number of searches entered is not an integer.
    """
    merge_threshold = 1000  # Adjust based on the total number of documents
    buffer_size = 10  # Adjust based on memory limits
    dynamic_index = DynamicIndexing(merge_threshold, buffer_size)

    # Parent directory containing the subfolders with text files
    parent_directory_path = "/content/drive/MyDrive/IR WEEK-3"  # Replace with actual parent folder path

    # Read all text files from the subfolders; file paths serve as doc IDs
    documents = read_files_from_directory(parent_directory_path)

    for doc_id, content in documents.items():
        dynamic_index.add_document(doc_id, content)

    # Documents are only merged in batches of buffer_size, so flush the
    # remainder explicitly; otherwise the final partial batch is unsearchable.
    dynamic_index.merge_into_index()

    n = int(input("Enter number of words you want to search:"))
    # range() runs zero times for n <= 0, whereas counting a negative n down
    # with `while n:` never reaches zero and loops forever.
    for _ in range(n):
        search_term = input("Enter a search term: ")
        result = dynamic_index.search(search_term)
        print(f"Documents containing '{search_term}': {result}")

# Start the interactive search session only when run as a script.
if __name__ == "__main__":
    main()


Enter number of words you want to search:3
Enter a search term: hi
Documents containing 'hi': []
Enter a search term: money
Documents containing 'money': ['/content/drive/MyDrive/IR WEEK-3/merged_folder/historical_86_7651.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/historical_86_7651.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/historical_94_6374.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/historical_94_6374.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/historical_94_6374.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/technologie_67_4950.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/technologie_67_4950.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/technologie_77_5548.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/technologie_27_4553.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/technologie_27_4553.txt', '/content/drive/MyDrive/IR WEEK-3/merged_folder/technologie_12_1506.txt', '/content/drive/MyDrive/IR WEEK-3/me