In [1]:
import os
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string


In [2]:
def preprocess_text(text):
    """Lowercase, tokenize, and drop stopwords and non-alphabetic tokens.

    Args:
        text: Raw input string.

    Returns:
        A list of lowercase tokens in their original order. A token is kept
        only if it is not an English stopword and ``str.isalpha()`` is true
        for it (this is what removes punctuation and numeric tokens).
    """
    # Build the stopword set once per call. Evaluating
    # ``stopwords.words('english')`` inside the comprehension would rebuild
    # the word list for every token and test membership against a list.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    # Single pass: both filters are independent, so order does not matter.
    return [
        token
        for token in tokens
        if token.isalpha() and token not in english_stopwords
    ]


In [4]:
def create_positional_index(directory):
    """Build a positional inverted index over the files in ``directory``.

    Each regular file directly inside ``directory`` is read as UTF-8 text and
    passed through ``preprocess_text``. Positions are indices into the
    resulting token list, i.e. they are counted after preprocessing.

    Args:
        directory: Path of the directory holding the documents.

    Returns:
        A nested dict ``{token: {filename: [position, ...]}}`` where
        ``filename`` is the bare entry name (not the full path) and each
        position list is in increasing order.
    """
    positional_index = {}
    # Sorted so the index is built in the same order on every run;
    # os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # Skip subdirectories (e.g. .ipynb_checkpoints); open() on them
            # would raise and abort the whole build.
            continue
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()
        tokens = preprocess_text(text)
        for position, token in enumerate(tokens):
            postings = positional_index.setdefault(token, {})
            postings.setdefault(filename, []).append(position)
    return positional_index

# Build the index from every file in the 'preprocessed_files' directory
# (path is relative to the current working directory).
positional_index = create_positional_index('preprocessed_files')

# Persist the index to 'positional_index.pkl' so it can be reloaded
# without re-reading and re-tokenizing the documents.
with open('positional_index.pkl', 'wb') as f:
    pickle.dump(positional_index, f)


In [7]:
# Reload the index written to 'positional_index.pkl'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a file that this notebook itself produced.
with open('positional_index.pkl', 'rb') as f:
    loaded_positional_index = pickle.load(f)


In [8]:
def process_phrase_query(query, positional_index, max_words=5):
    """Find the documents that contain the query as an exact phrase.

    The query is normalised with ``preprocess_text``; the remaining tokens
    must occur at consecutive positions in a document's position lists for
    that document to match.

    Args:
        query: Raw query string.
        positional_index: Mapping ``{token: {document: [position, ...]}}``.
        max_words: Maximum number of tokens allowed after preprocessing.

    Returns:
        A ``(count, documents)`` tuple where ``documents`` is the sorted list
        of matching document names and ``count`` is its length. ``(0, [])``
        is returned when the query has no tokens left after preprocessing or
        when any token is absent from the index. If the query has more than
        ``max_words`` tokens, the first element is instead an error message
        string and the second is an empty list.
    """
    words = preprocess_text(query)
    if len(words) > max_words:
        return f"Query length exceeds limit of {max_words} words", []
    if not words:
        return 0, []

    # Collect the postings of every query token; one unknown token means
    # the phrase cannot occur anywhere.
    postings = []
    for word in words:
        if word not in positional_index:
            return 0, []
        postings.append(positional_index[word])

    # Documents containing ALL tokens. Seed from the first token and only
    # ever narrow: re-seeding when the set becomes empty would let documents
    # that contain just the later tokens slip through.
    candidate_docs = set(postings[0])
    for posting in postings[1:]:
        candidate_docs.intersection_update(posting)

    matching_docs = []
    for doc in candidate_docs:
        first_positions = postings[0][doc]
        # Sets give O(1) membership checks for the follow-up tokens.
        later_positions = [set(posting[doc]) for posting in postings[1:]]
        # Token i of the phrase must sit exactly i positions after token 0.
        if any(
            all(
                start + offset in positions
                for offset, positions in enumerate(later_positions, start=1)
            )
            for start in first_positions
        ):
            matching_docs.append(doc)

    # Sort so the result does not depend on set iteration order.
    matching_docs.sort()
    return len(matching_docs), matching_docs


In [15]:
# Interactive driver: read N queries and report the matching documents.
N = int(input("Enter the number of queries: "))
for i in range(N):
    query = input(f"Enter query {i+1}: ")
    num_docs, docs = process_phrase_query(query, loaded_positional_index)
    if not isinstance(num_docs, int):
        # For an over-long query the first element is an error message
        # string, not a count; comparing it with 0 would raise TypeError.
        print(f"Query {i+1} rejected: {num_docs}")
        print()
        continue
    print(f"Number of documents retrieved for query {i+1} using positional index: {num_docs}")
    if num_docs > 0:
        print(f"Names of documents retrieved for query {i+1} using positional index: {', '.join(docs)}")
    else:
        print("No documents retrieved for this query using positional index.")
    print()


Number of documents retrieved for query 1 using positional index: 2
Names of documents retrieved for query 1 using positional index: preprocessed_file484.txt, preprocessed_file277.txt

