In [10]:
import re
import nltk
from pathlib import Path

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import string
from collections import defaultdict
import spacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Choosing a document unit

In [11]:
# Step 1: Load the lotr.txt file
file_path = Path("../lotr.txt")
with open(file_path, "r") as file:
    text = file.read()
    
print(text[:468])

chapters = text.split("\n\n")

Three Rings for the Elven-kings under the sky,
               Seven for the Dwarf-lords in their halls of stone,
            Nine for Mortal Men doomed to die,
              One for the Dark Lord on his dark throne
           In the Land of Mordor where the Shadows lie.
               One Ring to rule them all, One Ring to find them,
               One Ring to bring them all and in the darkness bind them
           In the Land of Mordor where the Shadows lie.
    


### Normalization

In [12]:
# Step 2: Define functions for tokenization and normalization
def normalize_text(tokens):
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word not in string.punctuation]
    return tokens

### Extended biword index

In [20]:
# Step 3: Create an extended biword index
extended_biword_index = defaultdict(list)

for doc_id, chapter in enumerate(chapters):
    doc = nlp(chapter)
    tokens = [token.text for token in doc]
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    normalized_tokens = normalize_text(filtered_tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in normalized_tokens]

    # Perform part-of-speech tagging
    pos_tags = [token.pos_ for token in doc]

    # Group terms into extended biwords
    extended_biwords = []
    i = 0
    while i < len(lemmatized_tokens):
        if pos_tags[i] == 'NOUN' or pos_tags[i] == 'PROPN':
            biword = lemmatized_tokens[i]
            i += 1
            extended_biwords.append(biword)
        else:
            i += 1

    # Add extended biwords to the index
    for biword in extended_biwords:
        extended_biword_index[biword].append(doc_id)

In [21]:
extended_biword_index

defaultdict(list,
            {'three': [0,
              5,
              6,
              10,
              11,
              14,
              33,
              49,
              60,
              175,
              188,
              200,
              225,
              225,
              230,
              234,
              276,
              287,
              297,
              318,
              340,
              355,
              361,
              363,
              392,
              398,
              476,
              484,
              507,
              528,
              596,
              610,
              653,
              662,
              662,
              662,
              713,
              757,
              771,
              780,
              811,
              814,
              833],
             'ring': [0,
              0,
              1,
              1,
              1,
              1,
              1,
              1,
              11,
     

### Usage

In [22]:
# Step 4: Define a function to process phrase queries
def process_phrase_query(query, extended_biword_index):
    doc = nlp(query)
    tokens = [token.text for token in doc]
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    normalized_tokens = normalize_text(filtered_tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in normalized_tokens]

    # Perform part-of-speech tagging
    pos_tags = [token.pos_ for token in doc]

    # Group terms into extended biwords
    extended_biwords = []
    i = 0
    while i < len(lemmatized_tokens):
        if pos_tags[i] == 'NOUN' or pos_tags[i] == 'PROPN':
            biword = lemmatized_tokens[i]
            i += 1
            extended_biwords.append(biword)
        else:
            i += 1

    # Find documents that contain all the extended biwords in the query
    result_docs = None
    for biword in extended_biwords:
        if biword in extended_biword_index:
            if result_docs is None:
                result_docs = set(extended_biword_index[biword])
            else:
                result_docs.intersection_update(extended_biword_index[biword])

    return list(result_docs) if result_docs else []

In [None]:
# Example usage: Search for a phrase
phrase_query = "fellowship of the ring"
result_docs = process_phrase_query(phrase_query, extended_biword_index)
print(f"Documents containing the phrase '{phrase_query}': {result_docs}")

for r in result_docs: 
    print("")

Documents containing the phrase 'fellowship of the ring': [780, 14, 575]
