In [45]:
import re
import nltk
from pathlib import Path

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import string
from collections import defaultdict

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Step 1: Load the lotr.txt file
file_path = Path("../../lotr.txt")
with open(file_path, "r") as file:
    text = file.read()
    
print(text[:468])

Three Rings for the Elven-kings under the sky,
               Seven for the Dwarf-lords in their halls of stone,
            Nine for Mortal Men doomed to die,
              One for the Dark Lord on his dark throne
           In the Land of Mordor where the Shadows lie.
               One Ring to rule them all, One Ring to find them,
               One Ring to bring them all and in the darkness bind them
           In the Land of Mordor where the Shadows lie.
    


In [27]:
# Step 2: Tokenize the text
tokens = word_tokenize(text)
print(tokens[:10])

['Three', 'Rings', 'for', 'the', 'Elven-kings', 'under', 'the', 'sky', ',', 'Seven']


In [28]:
# Step 3: Linguistic preprocessing
lemmatizer = WordNetLemmatizer()
tokens = [token.lower() for token in tokens if token not in string.punctuation]
tokens = [lemmatizer.lemmatize(token) for token in tokens]
print(tokens[:10])

['three', 'ring', 'for', 'the', 'elven-kings', 'under', 'the', 'sky', 'seven', 'for']


In [None]:
# Step 4: Create the inverted index
inverted_index = defaultdict(list)

# Assuming chapters are separated by double newlines
chapters = text.split('\n\n')

for chapter_id, chapter in enumerate(chapters):
    chapter_tokens = word_tokenize(chapter.lower())
    chapter_tokens = [token for token in chapter_tokens if token not in string.punctuation]
    #chapter_tokens = [lemmatizer.lemmatize(token) for token in chapter_tokens]

    for token in set(chapter_tokens):
        inverted_index[token].append(chapter_id)

inverted_index = dict(inverted_index)

# Print the inverted index
sorted_inverted_index = {k: inverted_index[k] for k in sorted(inverted_index)}
for term, postings in sorted_inverted_index.items():
    print(f"{term}: {postings}")

In [None]:
# Function to retrieve sentences
def retrieve_sentences(text, term, inverted_index):
    chapter_ids = inverted_index.get(term, [])
    print(f"The term {term} is present in these chapters: {chapter_ids}\n\n")
    
    chapters = text.split('\n\n')
    sentences = []
    for chapter_id in chapter_ids:
        if chapter_id < len(chapters):
            chapter = chapters[chapter_id]
            chapter_sentences = sent_tokenize(chapter)
            for sentence in chapter_sentences:
                if term in sentence.lower():
                    sentences.append(sentence)
    return sentences

# Example usage
term = "fellowship"
sentences = retrieve_sentences(text, term, sorted_inverted_index)
for sentence in sentences:
    print(sentence)

The term eagle is present in these chapters: [101, 332, 339, 349, 359, 383, 420, 508, 548, 777, 784]


A mighty eagle swept down and bore him away.
Then she lifted from her lap a great stone of a clear green, set in a silver brooch that was wrought in the likeness of an eagle with outspread wings; and as she held it up the gem flashed like the sun shining through the leaves of spring.
an eagle? '
`It is an eagle, a hunting eagle.
He turned from the North back again to North, and saw nothing save the distant hills, unless it were that far away he could see again a great bird like an eagle high in the air, descending slowly in wide circles down towards the earth.
'There is the eagle again!
Only an eagle could overtake them now.'
          Eagle in eyrie, ox in pasture,
           Hart horn-crowned; hawk is swiftest
           Swan the whitest, serpent coldest...
'There was a darkness over the valleys of the Emyn Muil, and I did not know of their captivity, until the eagle told me.'
'The 

In [50]:
# Functions for boolean operations
def intersect(p1, p2):
    answer = []
    i, j = 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            answer.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            i += 1
        else:
            j += 1
    return answer

def union(p1, p2):
    answer = []
    i, j = 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            answer.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            answer.append(p1[i])
            i += 1
        else:
            answer.append(p2[j])
            j += 1
    # Add remaining elements from p1
    while i < len(p1):
        answer.append(p1[i])
        i += 1
    # Add remaining elements from p2
    while j < len(p2):
        answer.append(p2[j])
        j += 1
    return answer

def difference(p1, p2):
    answer = []
    i, j = 0, 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            answer.append(p1[i])
            i += 1
        else:
            j += 1
    # Add remaining elements from p1
    while i < len(p1):
        answer.append(p1[i])
        i += 1
    return answer

# Tokenize the query
def tokenize(query):
    tokens = re.findall(r'\(|\)|AND|OR|NOT|\w+', query.upper())
    return tokens

# Parse the query
def parse(tokens):
    def parse_expression(index):
        node = []
        while index < len(tokens):
            token = tokens[index]
            if token == '(':
                sub_node, index = parse_expression(index + 1)
                node.append(sub_node)
            elif token == ')':
                return node, index + 1
            elif token in ['AND', 'OR', 'NOT']:
                node.append(token)
                index += 1
            else:
                node.append(token)
                index += 1
        return node, index

    ast, _ = parse_expression(0)
    return ast

# Evaluate the AST
def evaluate(ast, inverted_index):
    def evaluate_node(node):
        if isinstance(node, list):
            if len(node) == 1:
                return evaluate_node(node[0])
            elif len(node) == 2:
                operator = node[0]
                operand = evaluate_node(node[1])
                if operator == 'NOT':
                    return difference(all_docs, operand)
            else:
                left = evaluate_node(node[0])
                operator = node[1]
                right = evaluate_node(node[2])
                if operator == 'AND':
                    return intersect(left, right)
                elif operator == 'OR':
                    return union(left, right)
        else:
            return inverted_index.get(node.lower(), [])

    all_docs = list(set().union(*inverted_index.values()))
    return evaluate_node(ast)

# Function to search with boolean operations
def search(query, inverted_index, text):
    tokens = tokenize(query)
    ast = parse(tokens)
    result = evaluate(ast, inverted_index)

    # Retrieve the actual text from the chapters
    chapter_ids = result
    print(f"The terms are present in these chapters: {chapter_ids}\n\n")

    chapters = text.split('\n\n')
    sentences = []
    for chapter_id in chapter_ids:
        if chapter_id < len(chapters):
            chapter = chapters[chapter_id]
            chapter_sentences = sent_tokenize(chapter)
            for sentence in chapter_sentences:
                # Check if any of the query terms are in the sentence
                if any(term.lower() in sentence.lower() for term in tokens if term not in ['AND', 'OR', 'NOT', '(', ')']):
                    sentences.append(sentence)

    return sentences

In [None]:
# Example usage: Gandalf and Frodo but not Elrond
query = "(gandalf AND frodo) AND NOT elrond"
sentences = search(query, inverted_index, text)
for sentence in sentences:
    print(sentence)

The terms are present in these chapters: [1, 12, 21, 25, 27, 28, 30, 31, 33, 34, 35, 37, 42, 44, 45, 46, 48, 49, 51, 61, 63, 72, 77, 79, 101, 105, 107, 123, 143, 145, 153, 154, 155, 168, 175, 176, 186, 188, 198, 199, 203, 204, 205, 206, 225, 227, 230, 240, 241, 242, 243, 244, 249, 251, 258, 259, 261, 264, 265, 266, 267, 268, 269, 270, 271, 273, 274, 276, 283, 287, 290, 291, 292, 293, 297, 319, 328, 330, 340, 342, 349, 363, 366, 372, 420, 487, 547, 552, 575, 581, 583, 587, 597, 602, 627, 645, 651, 671, 672, 680, 681, 751, 753, 773, 775, 777, 778, 780, 792, 794, 796, 799, 802, 806, 808, 817, 833]


The process had begun in the writing of _The Hobbit,_ in which there were already some references to the older matter: Elrond, Gondolin, the High-elves, and the orcs, as well as glimpses that had arisen unbidden of things higher or deeper or darker than its surface: Durin, Moria, Gandalf, the Necromancer, the Ring.
It was during 1944 that, leaving the loose ends and perplexities of a war which

In [54]:
# Example usage: Gandalf and Frodo but not Elrond
query = "(gandalf AND rivendell) AND NOT eagle"
sentences = search(query, inverted_index, text)
for sentence in sentences:
    print(sentence)

The terms are present in these chapters: [48, 61, 153, 154, 175, 186, 188, 199, 203, 205, 206, 225, 240, 242, 249, 251, 258, 259, 269, 273, 276, 283, 297, 319, 328, 342, 363, 372, 393, 420, 427, 484, 502, 581, 627, 665, 666, 777, 796, 801, 802]


     'You ought to go quietly, and you ought to go soon,' said Gandalf.
said Gandalf.
But he did not tell all his thoughts to Gandalf.
said Gandalf.
'But you cannot see very far,' said Gandalf.
'If you want my advice, make for Rivendell.
'Rivendell!'
'Very good: I will go east, and I will make for Rivendell.
'It is,' said Frodo; 'but I thought my going was a secret known only to Gandalf and my faithful Sam.'
'Has Gandalf told you nothing?'
My plan was to leave the Shire secretly, and make my way to Rivendell; but now my footsteps are dogged, before ever I get to Buckland.'
But if you desire clearer counsel, you should ask Gandalf.
These things Gandalf must know.
I have been expecting Gandalf for many days.
'That Gandalf should be late, does no