In [1]:
import nltk
nltk.download ("stopwords")
nltk.download ("punkt")
nltk.download ("wordnet")

[nltk_data] Downloading package stopwords to C:\Users\L E N O V
[nltk_data]     O\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to C:\Users\L E N O V
[nltk_data]     O\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to C:\Users\L E N O V
[nltk_data]     O\AppData\Roaming\nltk_data...


True

In [2]:
import os
import re
import logging
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
Lemmatizer = WordNetLemmatizer()

In [3]:
class DocumentProcessor:

    def __init__(self, directory_path):
        self.directory_path = directory_path
        self.documents = {}
        self.doc_mapping = {}

    def load_documents(self):
        print(f"Loading documents from: {self.directory_path}")
        doc_counter = 0

        for file in os.listdir(self.directory_path):
            print(f"Processing: {file}")
            if file.endswith(".txt"):
                file_path = os.path.join(self.directory_path, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text_content = f.read()
                    self.documents[doc_counter] = text_content
                    self.doc_mapping[doc_counter] = file
                    print(f"Document ID {doc_counter} mapped to {file}")
                    doc_counter += 1

        print(f"Successfully loaded {len(self.documents)} documents")
        return self.documents, self.doc_mapping

    def preprocess_text(self, text):
        text = text.lower()
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        tokens = word_tokenize(text)
        processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if len(token) > 1]
        return processed_tokens


In [4]:
class InvertedIndexBuilder:

    def __init__(self):
        self.index = defaultdict(set)
        self.term_stats = Counter()

    def create_index(self, documents, processor):
        for doc_id, text in documents.items():
            tokens = processor.preprocess_text(text)
            for token in tokens:
                self.index[token].add(doc_id)
                self.term_stats[token] += 1

        return self.index, self.term_stats


In [5]:
def execute_query(self, query_string):
    query_string = query_string.lower()
    query_tokens = query_string.split()
    
    matching_docs = set()
    
    operators = ['and', 'or', 'not']
    search_terms = [token for token in query_tokens if token not in operators]
    
    if 'and' in query_tokens:
        if all(term in self.inverted_index for term in search_terms):
            matching_docs = self.inverted_index[search_terms[0]].copy()
            for term in search_terms[1:]:
                matching_docs &= self.inverted_index[term]
    
    elif 'or' in query_tokens:
        for term in search_terms:
            if term in self.inverted_index:
                matching_docs |= self.inverted_index[term]
    
    elif 'not' in query_tokens:
        excluded_term = query_tokens[1]
        all_document_ids = set(self.doc_mapping.keys())
        if excluded_term in self.inverted_index:
            matching_docs = all_document_ids - self.inverted_index[excluded_term]
        else:
            matching_docs = all_document_ids

In [6]:
def create_query_file(term_statistics, output_path="queries.txt", num_queries=5):
    sample_queries = [
        "update AND feature",
        "android OR window",
        "NOT support"
    ]
    
    with open(output_path, "w", encoding="utf-8") as query_file:
        for q in sample_queries:
            query_file.write(q + "\n")
    
    print(f"Query file created: {output_path}")

In [7]:
def run_search_system():
    docs_folder = r"C:\Users\L E N O V O\anaconda3\Lib\site-packages\nltk"
    doc_processor = DocumentProcessor(docs_folder)
    documents, doc_mapping = doc_processor.load_documents()
    
    for doc_id, content in documents.items():
        tokens = doc_processor.preprocess_text(content)
        print(f"Document {doc_id} preview:", tokens[:20])
    
    index_builder = InvertedIndexBuilder()
    inverted_index, term_stats = index_builder.create_index(documents, doc_processor)
    print("Index preview:", list(inverted_index.keys())[:20])
   
    create_query_file(term_stats)
    
  
    search_engine = BooleanSearchEngine(inverted_index, doc_mapping)
    test_queries = [
        "update AND feature",
        "android OR window",
        "NOT support"
    ]
    

    with open("search_results.txt", 'w', encoding='utf-8') as output:
        for query in test_queries:
            results = search_engine.execute_query(query)
            output_line = f"Query: '{query}' => Results: {results}\n"
            print(output_line)
            output.write(output_line)


if __name__ == "__main__":
    run_search_system()

Loading documents from: C:\Users\L E N O V O\anaconda3\Lib\site-packages\nltk
Processing: app
Processing: book.py
Processing: ccg
Processing: chat
Processing: chunk
Processing: classify
Processing: cli.py
Processing: cluster
Processing: collections.py
Processing: collocations.py
Processing: compat.py
Processing: corpus
Processing: data.py
Processing: decorators.py
Processing: downloader.py
Processing: draw
Processing: featstruct.py
Processing: grammar.py
Processing: help.py
Processing: inference
Processing: internals.py
Processing: jsontags.py
Processing: langnames.py
Processing: lazyimport.py
Processing: lm
Processing: metrics
Processing: misc
Processing: parse
Processing: probability.py
Processing: sem
Processing: sentiment
Processing: stem
Processing: tag
Processing: tbl
Processing: test
Processing: text.py
Processing: tgrep.py
Processing: tokenize
Processing: toolbox.py
Processing: translate
Processing: tree
Processing: treeprettyprinter.py
Processing: treetransforms.py
Processing:

NameError: name 'BooleanSearchEngine' is not defined