In [1]:
import os
import re
from collections import defaultdict

# Path to the folder containing the documents
folder_path = '/content/drive/MyDrive/lab3'

# Function to read all documents from the folder
def load_documents(folder_path):
    documents = []
    doc_names = os.listdir(folder_path)
    for doc_name in doc_names:
        doc_path = os.path.join(folder_path, doc_name)
        with open(doc_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents, doc_names

# Function to tokenize the document
def tokenize(text):
    # Convert to lowercase and remove punctuation
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

# Load and tokenize all documents
documents, doc_names = load_documents(folder_path)
tokenized_documents = [tokenize(doc) for doc in documents]


In [2]:
def build_inverted_index(tokenized_docs):
    index = defaultdict(lambda: defaultdict(list))

    # Build the index
    for doc_id, tokens in enumerate(tokenized_docs):
        for position, token in enumerate(tokens):
            index[token][doc_id].append(position)

    return index

# Building the index
inverted_index = build_inverted_index(tokenized_documents)

# Optional: Save the index to a file if needed
# import pickle
# with pickle.dump(open('inverted_index.pkl', 'wb'), inverted_index)

# Output a summary of the index (for debugging)
print(f"Inverted index contains {len(inverted_index)} terms.")


Inverted index contains 8380 terms.


In [3]:
def execute_phrasal_query(query, index):
    query_terms = tokenize(query)
    if not query_terms:
        return []

    # Get the initial set of documents containing the first term
    possible_docs = index[query_terms[0]]

    for i, term in enumerate(query_terms[1:], start=1):
        next_possible_docs = index[term]

        # Check for sequential positions
        valid_docs = defaultdict(list)
        for doc_id, positions in possible_docs.items():
            if doc_id in next_possible_docs:
                for pos in positions:
                    if pos + 1 in next_possible_docs[doc_id]:
                        valid_docs[doc_id].append(pos + 1)

        possible_docs = valid_docs

    return [doc_names[doc_id] for doc_id in possible_docs.keys()]

# Example usage
query = "quick brown fox"
matching_docs = execute_phrasal_query(query, inverted_index)
print(f"Documents matching the query '{query}': {matching_docs}")


Documents matching the query 'quick brown fox': []


In [4]:
query = "Business"
matching_docs = execute_phrasal_query(query, inverted_index)
print(f"Documents matching the query '{query}': {matching_docs}")

Documents matching the query 'Business': ['business_93.txt', 'business_99.txt', 'entertainment_89.txt', 'entertainment_93.txt', 'graphics_82.txt', 'historical_8.txt', 'historical_80.txt', 'medical_586.txt', 'politics_271.txt', 'politics_34.txt', 'technologie_93.txt']


In [5]:
query = "food"
matching_docs = execute_phrasal_query(query, inverted_index)
print(f"Documents matching the query '{query}': {matching_docs}")

Documents matching the query 'food': ['business_93.txt', 'business_91.txt', 'food_84.txt', 'food_90.txt', 'food_86.txt', 'food_88.txt', 'food_87.txt', 'food_83.txt', 'food_85.txt', 'food_89.txt', 'historical_78.txt', 'space_96.txt', 'technologie_88.txt']


In [6]:
query = " and/or electronics to"
matching_docs = execute_phrasal_query(query, inverted_index)
print(f"Documents matching the query '{query}': {matching_docs}")

Documents matching the query ' and/or electronics to': ['technologie_9.txt']


In [7]:
query = "whether the fixtures"
matching_docs = execute_phrasal_query(query, inverted_index)
print(f"Documents matching the query '{query}': {matching_docs}")

Documents matching the query ' whether the fixtures': ['sport_99.txt']
