In [1]:
import re
import nltk
from pathlib import Path

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import string
from collections import defaultdict

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Choosing a document unit

In [2]:
# Step 1: Load the lotr.txt file
file_path = Path("../../lotr.txt")
with open(file_path, "r") as file:
    text = file.read()
    
print(text[:468])

chapters = text.split("\n\n")

Three Rings for the Elven-kings under the sky,
               Seven for the Dwarf-lords in their halls of stone,
            Nine for Mortal Men doomed to die,
              One for the Dark Lord on his dark throne
           In the Land of Mordor where the Shadows lie.
               One Ring to rule them all, One Ring to find them,
               One Ring to bring them all and in the darkness bind them
           In the Land of Mordor where the Shadows lie.
    


### Tokenization

In [3]:
# Step 2: Tokenize the text
tokens = word_tokenize(text)
print(tokens[:10])

['Three', 'Rings', 'for', 'the', 'Elven-kings', 'under', 'the', 'sky', ',', 'Seven']


### Stop words

In [4]:
# Step 3: Stop Word Removal
stop_words = set(stopwords.words('english'))
print(list(stop_words)[:10])

filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens[:10])

['d', "she'd", 'hers', 'did', 'wasn', 'then', 'after', 'hasn', 'up', "they're"]
['Three', 'Rings', 'Elven-kings', 'sky', ',', 'Seven', 'Dwarf-lords', 'halls', 'stone', ',']


### Normalization

In [5]:
# Step 4: Normalization
def normalize_text(tokens):
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    return tokens

normalized_tokens = normalize_text(filtered_tokens)
print(normalized_tokens[:10])

['three', 'rings', 'elven-kings', 'sky', 'seven', 'dwarf-lords', 'halls', 'stone', 'nine', 'mortal']


### Stemming and lemmatization

In [6]:
# Step 5: Stemming and Lemmatization
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

lemmatized_tokens = [lemmatizer.lemmatize(word) for word in normalized_tokens]
stemmed_tokens = [stemmer.stem(word) for word in normalized_tokens]

print("Lemmatized Tokens:", lemmatized_tokens[:10])
print("Stemmed Tokens:", stemmed_tokens[:10])

Lemmatized Tokens: ['three', 'ring', 'elven-kings', 'sky', 'seven', 'dwarf-lords', 'hall', 'stone', 'nine', 'mortal']
Stemmed Tokens: ['three', 'ring', 'elven-k', 'sky', 'seven', 'dwarf-lord', 'hall', 'stone', 'nine', 'mortal']


### Creating an inverted index

In [7]:
# Step 6: Create a Positional Index
positional_index = defaultdict(lambda: defaultdict(list))

for doc_id, chapter in enumerate(chapters):
    tokens = word_tokenize(chapter)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    normalized_tokens = normalize_text(filtered_tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in normalized_tokens]

    for position, token in enumerate(lemmatized_tokens):
        positional_index[token][doc_id].append(position)

The positional_index is a nested dictionary where the first key is the term, the second key is the document ID, and the value is a list of positions where the term appears in the document.

In [16]:
positional_index["fellowship"]

defaultdict(list,
            {14: [136],
             16: [0],
             341: [3],
             342: [266, 280],
             345: [32],
             484: [337],
             574: [343],
             575: [471],
             780: [515],
             794: [52],
             801: [1070],
             833: [480]})

### Posting List Node and Class

In [8]:
# Step 7: Define Positional Posting List Node and Positional Posting List classes
class PositionalPostingListNode:
    def __init__(self, doc_id, positions):
        self.doc_id = doc_id
        self.positions = positions
        self.next = None
        self.skip = None

class PositionalPostingList:
    def __init__(self):
        self.head = None

    def add_document(self, doc_id, positions):
        new_node = PositionalPostingListNode(doc_id, positions)
        if not self.head:
            self.head = new_node
        else:
            current = self.head
            while current.next:
                current = current.next
            current.next = new_node

    def add_skip_pointers(self, skip_length):
        current = self.head
        while current:
            skip_node = current
            for _ in range(skip_length):
                if skip_node:
                    skip_node = skip_node.next
                else:
                    break
            current.skip = skip_node
            current = current.next

    def search(self, doc_id):
        current = self.head
        while current:
            if current.doc_id == doc_id:
                return True
            if current.skip and current.skip.doc_id <= doc_id:
                current = current.skip
            else:
                current = current.next
        return False


### Skip pointers

In [9]:
# Step 8: Create Positional Posting Lists with Skip Pointers
positional_posting_lists = {}

for term, postings in positional_index.items():
    positional_posting_list = PositionalPostingList()
    for doc_id, positions in postings.items():
        positional_posting_list.add_document(doc_id, positions)
    positional_posting_list.add_skip_pointers(3)
    positional_posting_lists[term] = positional_posting_list



From the previous notebook, we can see that the term "fellowship" is present in these documents: [14, 16, 341, 342, 345, 484, 574, 575, 780, 794, 801, 833]

If you search for any of these documents, it should return "True" if you search for "fellowship".

In [10]:
# Example usage: Search for a term in a document
term_to_search = "fellowship"
doc_id_to_search = 14

if term_to_search in positional_posting_lists:
    result = positional_posting_lists[term_to_search].search(doc_id_to_search)
    print(f"Term '{term_to_search}' found in document {doc_id_to_search}: {result}")
else:
    print(f"Term '{term_to_search}' not found in any document.")

Term 'fellowship' found in document 14: True
