In [None]:
import os
import pickle

# Define the directory containing your preprocessed text files
preprocessed_directory = "preprocessed_files"

# Positional index: word -> {filename -> [positions of the word in that file]}
positional_index = {}

# Populate the positional index. Entries are visited in sorted order so the
# index is built identically on every run (os.listdir order is arbitrary).
for filename in sorted(os.listdir(preprocessed_directory)):
    filepath = os.path.join(preprocessed_directory, filename)
    # Skip anything that is not a regular file (for example a
    # .ipynb_checkpoints folder); open() would raise on a directory.
    if not os.path.isfile(filepath):
        continue
    with open(filepath, 'r', encoding='utf-8') as file:
        words = file.read().split()
    # Positions are 0-based offsets into the whitespace-split token list.
    for position, word in enumerate(words):
        positional_index.setdefault(word, {}).setdefault(filename, []).append(position)

# Save the positional index using pickle
with open('positional_index.pkl', 'wb') as f:
    pickle.dump(positional_index, f)


In [None]:
# Restore the positional index that was pickled to disk.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('positional_index.pkl', 'rb') as index_file:
    loaded_positional_index = pickle.load(index_file)


In [None]:
def find_phrase_in_index(phrase, positional_index):
    """Return the documents in which ``phrase`` occurs as consecutive words.

    Args:
        phrase: Query text. It is lowercased and split on whitespace; no other
            normalisation is applied here.
        positional_index: Mapping ``word -> {document name -> positions}``,
            where positions are the word's offsets within that document.

    Returns:
        A sorted list of document names that contain every query word at
        consecutive positions. Empty when the phrase has no words or when any
        query word is missing from the index.

    Note:
        NOTE(review): if the indexed files were preprocessed beyond lowercasing
        (stop-word or punctuation removal, stemming, ...), the query presumably
        needs the same treatment to match - confirm against the preprocessing
        step, which is not visible here.
    """
    words = phrase.lower().split()
    if not words:
        return []

    # Every query word must be indexed, otherwise no document can match.
    postings = []
    for word in words:
        if word not in positional_index:
            return []
        postings.append(positional_index[word])

    # Candidate documents are those containing all of the query words.
    common_docs = set(postings[0])
    for word_postings in postings[1:]:
        common_docs.intersection_update(word_postings)

    valid_docs = []
    for doc in common_docs:
        # Sets give constant-time membership checks for the follow-up words.
        following = [set(word_postings[doc]) for word_postings in postings[1:]]
        # The phrase matches if some occurrence of the first word is followed
        # by the i-th query word exactly i positions later.
        if any(
            all((start + offset) in positions
                for offset, positions in enumerate(following, start=1))
            for start in postings[0][doc]
        ):
            valid_docs.append(doc)

    # Set iteration order is arbitrary; sort so results are reproducible.
    return sorted(valid_docs)

# Demo input: two sample phrase queries.
# N is not referenced in this cell; presumably the number of queries.
N = 2
queries = [
    "Car bag in a canister",
    "Coffee brewing techniques in cookbook"
]

# Run every query against the loaded index and report the matches
for i, query in enumerate(queries, start=1):
    result_docs = find_phrase_in_index(query, loaded_positional_index)
    print(f"Number of documents retrieved for query {i} using positional index: {len(result_docs)}")
    if not result_docs:
        print("No documents retrieved for this query using positional index.")
    else:
        print(f"Names of documents retrieved for query {i} using positional index: {', '.join(result_docs)}")
    print()  # blank line separating the reports of consecutive queries


Number of documents retrieved for query 1 using positional index: 0
No documents retrieved for this query using positional index.

Number of documents retrieved for query 2 using positional index: 0
No documents retrieved for this query using positional index.

