In [17]:
import re
import nltk
from pathlib import Path

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import string
from collections import defaultdict

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leopu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Choosing a document unit

In [20]:
# Step 1: Load the lotr.txt file
file_path = Path("../../lotr.txt")
with open(file_path, "r") as file:
    text = file.read()
    
print(text[:468])

chapters = text.split("\n\n")

Three Rings for the Elven-kings under the sky,
               Seven for the Dwarf-lords in their halls of stone,
            Nine for Mortal Men doomed to die,
              One for the Dark Lord on his dark throne
           In the Land of Mordor where the Shadows lie.
               One Ring to rule them all, One Ring to find them,
               One Ring to bring them all and in the darkness bind them
           In the Land of Mordor where the Shadows lie.
    


### Tokenization

In [5]:
# Step 2: Tokenize the text
tokens = word_tokenize(text)
print(tokens[:10])

['Three', 'Rings', 'for', 'the', 'Elven-kings', 'under', 'the', 'sky', ',', 'Seven']


### Stop words

In [14]:
# Step 3: Stop Word Removal
stop_words = set(stopwords.words('english'))
print(list(stop_words)[:10])

filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens[:10])

['theirs', "she'll", 'below', 'so', 'down', "we'll", 'ain', "i'd", 'have', "wasn't"]
['Three', 'Rings', 'Elven-kings', 'sky', ',', 'Seven', 'Dwarf-lords', 'halls', 'stone', ',']


### Normalization

In [15]:
# Step 4: Normalization
def normalize_text(tokens):
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    return tokens

normalized_tokens = normalize_text(filtered_tokens)
print(normalized_tokens[:10])

['three', 'rings', 'elven-kings', 'sky', 'seven', 'dwarf-lords', 'halls', 'stone', 'nine', 'mortal']


### Stemming and lemmatization

In [18]:
# Step 5: Stemming and Lemmatization
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

lemmatized_tokens = [lemmatizer.lemmatize(word) for word in normalized_tokens]
stemmed_tokens = [stemmer.stem(word) for word in normalized_tokens]

print("Lemmatized Tokens:", lemmatized_tokens[:10])
print("Stemmed Tokens:", stemmed_tokens[:10])

Lemmatized Tokens: ['three', 'ring', 'elven-kings', 'sky', 'seven', 'dwarf-lords', 'hall', 'stone', 'nine', 'mortal']
Stemmed Tokens: ['three', 'ring', 'elven-k', 'sky', 'seven', 'dwarf-lord', 'hall', 'stone', 'nine', 'mortal']


### Creating an inverted index

In [21]:
# Step 6: Create an Inverted Index
inverted_index = defaultdict(list)

for doc_id, chapter in enumerate(chapters):
    tokens = word_tokenize(chapter)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    normalized_tokens = normalize_text(filtered_tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in normalized_tokens]

    for position, token in enumerate(lemmatized_tokens):
        inverted_index[token].append((doc_id, position))

### Posting List Node and Class

In [22]:
# Step 7: Define Posting List Node and Posting List classes
class PostingListNode:
    def __init__(self, doc_id, position):
        self.doc_id = doc_id
        self.position = position
        self.next = None
        self.skip = None

class PostingList:
    def __init__(self):
        self.head = None

    def add_document(self, doc_id, position):
        new_node = PostingListNode(doc_id, position)
        if not self.head:
            self.head = new_node
        else:
            current = self.head
            while current.next:
                current = current.next
            current.next = new_node

    def add_skip_pointers(self, skip_length):
        current = self.head
        while current:
            skip_node = current
            for _ in range(skip_length):
                if skip_node:
                    skip_node = skip_node.next
                else:
                    break
            current.skip = skip_node
            current = current.next

    def search(self, doc_id):
        current = self.head
        while current:
            if current.doc_id == doc_id:
                return True
            if current.skip and current.skip.doc_id <= doc_id:
                current = current.skip
            else:
                current = current.next
        return False

### Skip pointers

Skip pointers are used to optimize the search process in a posting list, which is a fundamental data structure in information retrieval systems. Here are some reasons why skip pointers are beneficial:

- Faster Search: Skip pointers allow you to skip over sections of the posting list during a search, reducing the number of comparisons needed to find a specific document ID. This can significantly speed up the search process, especially for large posting lists.

- Efficient Traversal: By using skip pointers, you can traverse the posting list more efficiently. Instead of checking every single node, you can jump ahead by a certain number of nodes, which reduces the time complexity of the search operation.

- Balanced Performance: Skip pointers provide a balanced approach to improving search performance without significantly increasing the complexity of the data structure. They offer a good trade-off between the additional storage required for the skip pointers and the performance gains achieved during search operations.

- Scalability: As the size of the posting list grows, the benefits of using skip pointers become more pronounced. Skip pointers help maintain efficient search performance even as the dataset scales up.

- Improved Query Processing: In information retrieval systems, query processing often involves intersecting or merging multiple posting lists. Skip pointers can make these operations more efficient by reducing the number of nodes that need to be examined.

### Example Scenario:
Imagine you have a posting list with document IDs: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]. Without skip pointers, searching for a document ID would require checking each node sequentially until you find the desired ID.

With skip pointers added at intervals of 3, the posting list might look like this:

Node 1: skip pointer to Node 4
Node 2: skip pointer to Node 5
Node 3: skip pointer to Node 6
Node 4: skip pointer to Node 7
Node 5: skip pointer to Node 8
Node 6: skip pointer to Node 9
Node 7: skip pointer to Node 10
Node 8: skip pointer to Node 10
Node 9: skip pointer to Node 10
Node 10: no skip pointer
When searching for document ID 7, you can start at Node 1 and use the skip pointer to jump to Node 4. From Node 4, you can jump to Node 7 directly using the skip pointer, significantly reducing the number of nodes you need to examine.

In [31]:
# Step 8: Create Posting Lists with Skip Pointers
posting_lists = {}

for term, postings in inverted_index.items():
    posting_list = PostingList()
    for doc_id, position in postings:
        posting_list.add_document(doc_id, position)
    posting_list.add_skip_pointers(3)
    posting_lists[term] = posting_list

From the previous notebook, we can see that the term "fellowship" is present in these documents: [14, 16, 341, 342, 345, 484, 574, 575, 780, 794, 801, 833]

If you search for any of these documents, it should return "True" if you search for "fellowship".

In [40]:
# Example usage: Search for a term in a document
term_to_search = "fellowship"
doc_id_to_search = 14

if term_to_search in posting_lists:
    result = posting_lists[term_to_search].search(doc_id_to_search)
    print(f"Term '{term_to_search}' found in document {doc_id_to_search}: {result}")
else:
    print(f"Term '{term_to_search}' not found in any document.")

Term 'fellowship' found in document 14: True
