# Step 1: Create a Unigram Inverted Index

In [3]:
import os
import pickle

# Assuming 'directory' contains the preprocessed files
preprocessed_directory = "preprocessed_files"

# Initialize an empty inverted index
inverted_index = {}

# Populate the inverted index
for filename in os.listdir(preprocessed_directory):
    filepath = os.path.join(preprocessed_directory, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        for word in file.read().split():
            if word in inverted_index:
                inverted_index[word].add(filename)
            else:
                inverted_index[word] = {filename}

# Save the inverted index using pickle
with open('inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_index, f)


# Step 2: Load the Inverted Index

In [4]:
# Load the inverted index
with open('inverted_index.pkl', 'rb') as f:
    loaded_inverted_index = pickle.load(f)



# Step 3: Define Boolean Query Operations


In [5]:
def perform_and(set1, set2):
    return set1.intersection(set2)

def perform_or(set1, set2):
    return set1.union(set2)

def perform_and_not(set1, set2):
    return set1 - set2

def perform_or_not(set1, set2, all_docs):
    return set1.union(all_docs - set2)

# Helper function to get all document names
def get_all_document_names(directory):
    return set(os.listdir(directory))


# Step 4: Process Queries

In [16]:
def process_query(query, operations, all_docs, inverted_index):
    # Preprocess and tokenize the query
    preprocessed_query = query.lower()  # Assuming simple lowercase conversion for demo
    tokens = preprocessed_query.split()  # This would ideally use the same preprocessing as your documents
    
    # Create a formatted query for output
    formatted_query = ""
    for i, token in enumerate(tokens):
        formatted_query += token
        if i < len(operations):
            formatted_query += " " + operations[i] + " "
    
    # Convert tokens to their document sets
    query_sets = [inverted_index.get(token, set()) for token in tokens]
    
    # Initialize the result set based on the first token's document set
    result_set = query_sets[0] if query_sets else set()
    
    # Apply operations with subsequent tokens
    for op, next_set in zip(operations, query_sets[1:]):
        if op.strip() == "AND":
            result_set = perform_and(result_set, next_set)
        elif op.strip() == "OR":
            result_set = perform_or(result_set, next_set)
        elif op.strip() == "AND NOT":
            result_set = perform_and_not(result_set, next_set)
        elif op.strip() == "OR NOT":
            result_set = perform_or_not(result_set, next_set, all_docs)
            
    return formatted_query, result_set

# Example static input structure for demonstration
N = int(input("Enter the number of queries: "))
queries = []
operations_list = []

for i in range(N):
    query = input(f"Enter query {i+1}: ")
    operations = input("Enter operations separated by commas (e.g., AND,OR NOT): ").split(', ')
    queries.append(query)
    operations_list.append(operations)

all_docs = set(get_all_document_names(preprocessed_directory))

for i, (query, operations) in enumerate(zip(queries, operations_list), start=1):
    formatted_query, result_docs = process_query(query, operations, all_docs, loaded_inverted_index)
    print(f"Query {i}: {formatted_query}")
    print(f"Number of documents retrieved for query {i}: {len(set(result_docs))}")
    if result_docs:
        print(f"Names of the documents retrieved for query {i}: {', '.join(result_docs)}\n")
    else:
        print("No documents retrieved for this query.\n")





Query 1: acoustic OR NOT guitar
Number of documents retrieved for query 1: 784
Names of the documents retrieved for query 1: preprocessed_file407.txt, preprocessed_file949.txt, preprocessed_file115.txt, preprocessed_file78.txt, preprocessed_file200.txt, preprocessed_file389.txt, preprocessed_file212.txt, preprocessed_file134.txt, preprocessed_file673.txt, preprocessed_file206.txt, preprocessed_file348.txt, preprocessed_file761.txt, preprocessed_file133.txt, preprocessed_file197.txt, preprocessed_file419.txt, preprocessed_file270.txt, preprocessed_file672.txt, preprocessed_file953.txt, preprocessed_file606.txt, preprocessed_file937.txt, preprocessed_file747.txt, preprocessed_file443.txt, preprocessed_file293.txt, preprocessed_file80.txt, preprocessed_file466.txt, preprocessed_file275.txt, preprocessed_file84.txt, preprocessed_file997.txt, preprocessed_file33.txt, preprocessed_file751.txt, preprocessed_file952.txt, preprocessed_file859.txt, preprocessed_file850.txt, preprocessed_file533.