In [20]:
import spacy
import re
from util.load_lotr import lotr_chapters

nlp = spacy.load("en_core_web_sm")

In [21]:
def extract_proper_nouns(text):
    doc = nlp(text)
    proper_nouns = [token.text for token in doc if token.pos_ == "PROPN"]
    return proper_nouns

class BTreeNode:
    def __init__(self, leaf=False):
        self.leaf = leaf
        self.keys = []
        self.locations = []
        self.children = []

class BTree:
    def __init__(self, t):
        self.root = BTreeNode(True)
        self.t = t  # Minimum degree

    def insert(self, key, location):
        root = self.root
        if len(root.keys) == (2 * self.t) - 1:
            new_root = BTreeNode()
            self.root = new_root
            new_root.children.insert(0, root)
            self._split_child(new_root, 0)
            self._insert_non_full(new_root, key, location)
        else:
            self._insert_non_full(root, key, location)

    def _insert_non_full(self, node, key, location):
        i = len(node.keys) - 1
        if node.leaf:
            while i >= 0 and key < node.keys[i]:
                i -= 1
            if i + 1 < len(node.keys) and key == node.keys[i + 1]:
                node.locations[i + 1].append(location)
            else:
                node.keys.insert(i + 1, key)
                node.locations.insert(i + 1, [location])
        else:
            while i >= 0 and key < node.keys[i]:
                i -= 1
            i += 1
            if len(node.children[i].keys) == (2 * self.t) - 1:
                self._split_child(node, i)
                if key > node.keys[i]:
                    i += 1
            self._insert_non_full(node.children[i], key, location)

    def _split_child(self, parent, index):
        t = self.t
        child = parent.children[index]
        new_node = BTreeNode(child.leaf)
        parent.keys.insert(index, child.keys[t - 1])
        parent.locations.insert(index, child.locations[t - 1])
        parent.children.insert(index + 1, new_node)
        new_node.keys = child.keys[t:(2 * t - 1)]
        new_node.locations = child.locations[t:(2 * t - 1)]
        child.keys = child.keys[0:(t - 1)]
        child.locations = child.locations[0:(t - 1)]
        if not child.leaf:
            new_node.children = child.children[t:(2 * t)]
            child.children = child.children[0:t]

    def search(self, key):
        return self._search_recursive(self.root, key)

    def _search_recursive(self, node, key):
        i = 0
        while i < len(node.keys) and key > node.keys[i]:
            i += 1
        if i < len(node.keys) and key == node.keys[i]:
            return node.locations[i]
        if node.leaf:
            return None
        return self._search_recursive(node.children[i], key)

    def wildcard_search(self, pattern):
        results = {}
        nodes = [self.root]
        while nodes:
            current_node = nodes.pop()
            for i, key in enumerate(current_node.keys):
                if self._matches_pattern(key, pattern):
                    if key in results:
                        results[key].extend(current_node.locations[i])
                    else:
                        results[key] = current_node.locations[i]
            if not current_node.leaf:
                nodes.extend(current_node.children)
        return results

    def _matches_pattern(self, term, pattern):
        # Convert wildcard pattern to a regex pattern
        regex_pattern = pattern.replace('*', '.*')
        regex_pattern = '^' + regex_pattern + '$'
        return re.fullmatch(regex_pattern, term, re.IGNORECASE) is not None

In [22]:
chapters = lotr_chapters()

b_tree = BTree(t=3)

for chapter_idx, chapter in enumerate(chapters):
    proper_nouns = extract_proper_nouns(chapter)
    for noun in proper_nouns:
        b_tree.insert(noun, chapter_idx + 1)  # Using chapter index as location

In [23]:
print(b_tree.wildcard_search("Ga*"))  # Output: Locations where terms like 'Galadriel', 'Gandalf' appear
print(b_tree.wildcard_search("*dor"))  # Output: Locations where terms like 'Gondor', 'Mordor' appear
print(b_tree.wildcard_search("Hob*"))  # Output: Locations where terms like 'Hobbit' appear

{'gate': [26, 277, 288, 298, 718], 'galad': [171, 177, 226, 226, 226, 226, 234, 177, 177, 226, 171, 172, 226, 43, 175, 226], 'gah': [529], 'Gaffer': [22, 50, 517, 546, 820, 820, 820, 561, 583, 820, 517, 531, 597, 52, 52, 516, 516, 531, 774, 52, 52, 50, 50, 52, 22, 22, 22, 28, 50, 22, 22, 26, 22, 22, 22, 22, 22], 'Gandalf': [427, 506, 653, 682, 742, 778, 795, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 802, 803, 803, 809, 809, 809, 802, 803, 809, 795, 802, 803, 779, 781, 781, 793, 796, 796, 779, 781, 781, 778, 779, 781, 752, 754, 776, 778, 779, 779, 752, 753, 778, 743, 749, 752, 690, 718, 722, 726, 730, 733, 743, 746, 802, 724, 724, 731, 718, 722, 724, 718, 718, 718, 718, 718, 718, 718, 718, 718, 718, 718, 718, 684, 689, 690, 690, 718, 718, 685, 687, 690, 682, 683, 686, 659, 681, 681, 681, 681, 681, 682, 682, 720, 681, 681, 681, 681, 681, 681, 664, 672, 673, 681, 681, 681, 666, 672, 681, 660, 664, 667, 653, 654, 654, 655, 660, 664, 653, 654, 655, 653, 653, 654, 509, 646, 648, 652,