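Block-based indexing (known in the information-retrieval literature as blocked sort-based indexing, or BSBI) is a technique for indexing datasets that are too large to fit into memory all at once. The idea is to divide the data into smaller, manageable blocks, process each block individually, and write each block's result to disk before moving on to the next one. Because only one block has to be held in memory at a time, you can work with large datasets without running into memory limitations.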

Here's a simple breakdown:
- Divide the Data: Split the data into smaller blocks that can fit into memory.
- Process Each Block: Process each block individually (e.g., create an inverted index).
- Write to Disk: Write each processed block to disk so that its memory can be reused for the next block.
- Merge: After processing all blocks, merge the results to create the final index.

In [4]:
import os
from util.load_lotr import lotr_chapters

# Load the corpus as a list of strings (presumably one string per chapter;
# confirm against util.load_lotr).
chapters = lotr_chapters()

# Split off the next block of documents
def parse_next_block(documents, block_size):
    """Split ``documents`` into the next block and the remainder.

    Args:
        documents: Sliceable sequence of documents still to be processed.
        block_size: Maximum number of documents to put into the block.

    Returns:
        A ``(block, remaining)`` pair where ``block`` is
        ``documents[:block_size]`` and ``remaining`` is
        ``documents[block_size:]``.
    """
    head, tail = documents[:block_size], documents[block_size:]
    return head, tail

# Function to invert a block (create an inverted index)
def bsbi_invert(block):
    """Build an in-memory inverted index for one block of documents.

    Terms are the whitespace-separated tokens returned by ``str.split()``;
    no case folding or punctuation stripping is applied.

    Args:
        block: Iterable of document strings.

    Returns:
        A dict mapping each term to the list of documents (the full document
        strings) that contain it. Terms appear in order of first occurrence
        and postings follow block order. Each document in the block
        contributes exactly one posting per distinct term it contains, no
        matter how often the term is repeated inside that document.
    """
    inverted_index = {}
    for doc in block:
        # dict.fromkeys() removes repeated terms within this document while
        # keeping first-occurrence order, so a term that occurs several times
        # yields a single posting instead of one per occurrence.
        for word in dict.fromkeys(doc.split()):
            inverted_index.setdefault(word, []).append(doc)
    return inverted_index

# Persist one block of documents as a text file
def write_block_to_disk(block, filename):
    """Write every document of ``block`` to ``filename``, newline-terminated.

    The file is created (or truncated) and opened in text mode with the
    platform's default encoding.

    Args:
        block: Iterable of document strings.
        filename: Path of the file to write.

    NOTE(review): a document that itself contains newline characters will
    occupy several lines in the resulting file.
    """
    with open(filename, 'w') as out:
        out.writelines(doc + '\n' for doc in block)

# Function to merge blocks
def merge_blocks(filenames, merged_filename):
    """Merge block files into a single inverted index and write it to disk.

    Every line of every block file is treated as one document: surrounding
    whitespace is stripped and the line is tokenised with ``str.split()``.
    The output file holds one line per term in the form
    ``term: doc1, doc2, ...``, with terms in order of first appearance and
    documents in the order they were read.

    Args:
        filenames: Iterable of block file paths, read in the given order.
        merged_filename: Path of the index file to create or overwrite.

    NOTE: the complete merged index is accumulated in a dict before it is
    written, so it has to fit in memory.
    """
    merged_index = {}
    for filename in filenames:
        with open(filename, 'r') as file:
            for line in file:
                doc = line.strip()
                # One posting per distinct term of the line: a term repeated
                # within the same line must not list that line repeatedly.
                for word in dict.fromkeys(doc.split()):
                    merged_index.setdefault(word, []).append(doc)
    with open(merged_filename, 'w') as file:
        file.writelines(
            f"{word}: {', '.join(docs)}\n" for word, docs in merged_index.items()
        )

# Main function to perform block-based indexing
def bsbindex_construction(documents, block_size):
    """Index ``documents`` block by block and write a merged index file.

    The documents are consumed ``block_size`` at a time. Each block is
    written to ``blocks/block_<n>.txt`` (``n`` starting at 1), and once all
    blocks are on disk they are merged into ``merged_index.txt``. Both paths
    are relative to the current working directory. Block files store the raw
    documents; the term-to-document inversion is done by ``merge_blocks``.

    Args:
        documents: Sliceable sequence of document strings.
        block_size: Number of documents per block; must be at least 1.

    Raises:
        ValueError: If ``block_size`` is smaller than 1. Such a value would
            never shrink the remaining documents, so the loop below would
            not terminate.
    """
    if block_size < 1:
        raise ValueError(
            f"block_size must be a positive integer, got {block_size!r}"
        )
    n = 0
    filenames = []
    os.makedirs("blocks", exist_ok=True)  # Create the "blocks" folder if it doesn't exist
    while documents:
        n += 1
        block, documents = parse_next_block(documents, block_size)
        filename = f"blocks/block_{n}.txt"
        write_block_to_disk(block, filename)
        filenames.append(filename)
    merged_filename = "merged_index.txt"
    merge_blocks(filenames, merged_filename)
    print(f"Merged index written to {merged_filename}")

# Example usage: index the loaded chapters in blocks of 20 documents each.
# Running this creates blocks/block_<n>.txt files and merged_index.txt
# relative to the current working directory.
documents = chapters
block_size = 20
bsbindex_construction(documents, block_size)

Merged index written to merged_index.txt
