In [1]:
import math
import logging
from pathlib import Path

# Layout of every log line: "[<logger name>:<LEVEL>]  <message>".
FORMAT = '[%(name)s:%(levelname)s]  %(message)s'

# The root handler lets everything from DEBUG upwards through ...
logging.basicConfig(format=FORMAT, level=logging.DEBUG)
logger = logging.getLogger('dbg')
# ... but the 'dbg' logger itself is capped at INFO, so dprint() stays silent
# until this level is lowered to logging.DEBUG.
logger.setLevel(logging.INFO)


def dprint(s):
    """Emit ``s`` on the 'dbg' logger at DEBUG level."""
    logger.log(logging.DEBUG, s)


def iprint(s):
    """Emit ``s`` on the 'dbg' logger at INFO level."""
    logger.log(logging.INFO, s)


In [2]:
class Node:
    """A single B-tree node.

    Attributes:
        keys: list of the keys stored in this node.
        children: list of child nodes (left empty for leaves).
        is_leaf: True when the node has no children.
    """

    def __init__(self, keys=None, children=None, is_leaf=True):
        """Create a node.

        Args:
            keys: initial keys; ``None`` or an empty list yields a fresh
                empty list.
            children: initial children; ``None`` or an empty list yields a
                fresh empty list.
            is_leaf: whether the node is a leaf.
        """
        # A falsy argument is replaced by a new list so that no two nodes
        # ever share one default list object.
        self.keys = keys if keys else []
        self.children = children if children else []
        self.is_leaf = is_leaf

    def is_full(self, t):
        """Check if the node is full (i.e. has 2t-1 keys). """
        return len(self.keys) == 2 * t - 1

    def __str__(self):
        """Return ``Node([...])``, plus the child count when there are children."""
        summary = f"Node({self.keys})"
        if self.children:
            summary += f" with {len(self.children)} children"
        return summary


class Btree:
    """A B-tree of minimum degree ``t``, holding a reference to its root node."""

    def __init__(self, t, root=None, verbose=False):
        """Create a B-tree with minimum degree t.

        Args:
            t: minimum degree; must be at least 2 (with ``t == 1`` a split
                would leave nodes without any key).
            root: existing root node to adopt. When omitted, the tree starts
                with an empty leaf as its root so that it is usable
                immediately.
            verbose: stored on the instance as ``self.verbose``.

        Raises:
            ValueError: if ``t`` is smaller than 2.
        """
        if t < 2:
            raise ValueError(f"minimum degree t must be >= 2, got {t}")
        self.t = t
        # Previously the default left root as None, which made the very first
        # operation on a default-constructed tree fail with AttributeError.
        self.root = root if root is not None else Node()
        self.verbose = verbose

## B-trees

**Degree `t` implies `t` to `2t` children**

B-trees are self-balancing search trees with fast search, insertion and deletion, in which nodes can have many children.

**Worst case Search/Insert/Delete** $\Rightarrow O( \log n )$, storage is $\Theta (n)$

<img src="media/btree.png" alt="drawing" width="450"/>

The key idea is to optimize for a realistic memory hierarchy by making the number of children as large as possible, while keeping each node within a single disk block so that it requires just one access. A shallow, wide tree requires few accesses. For a tree with 200 children per node (199 keys per node), depth three contains $1 + 200 + 200^2 + 200^3 = 8,040,201$ nodes $\approx 1,600,000,000$ keys!

High branching factor $\Rightarrow$ Reduced h $\Rightarrow$ Fewer disk accesses

### Definition

When a B-tree has $t=2$, each node holds 1 to 3 keys and therefore has 2 to 4 children; it is called a **2-4** or **2-3-4** tree.

A B-tree with minimum degree `t`:
- Nodes have attributes `keys` [list] and `is_leaf`.
- Internal nodes have `len(keys) + 1` children where:
- The keys of the node **separate its children**.
- $ t - 1\leq $ node `keys` $ \leq 2t -1$ (except for the root). 
- $ t \leq $ `children` $ \leq 2t$ (except for the root). 
- All leaves have the same depth. (ideally balanced)

### Tree Size and Shape

A B-tree with height $h$, min degree $t$, key count $n$:

- All leaves have the same depth
- All nodes have $t - 1\leq $ node `keys` $ \leq 2t -1$

The maximum number of nodes in a degree t _fat_ tree, where every node (including the root) has $2t$ children, grows (root + level 1 + level 2...) according to:

$ 1 + 2t + (2t)^2 + \dots + (2t)^h = 1 + r + r^2 + \dots + r^h$ with $r = 2t$

$ n \leq \frac{(2t)^{h+1} - 1}{2t - 1}$

Where h is 1 for a root node whose children are all leaves.

The minimum number of nodes in degree t _skinny_ tree grows (root + level 1 + level 2...) according to:

$ 1 + 2 + 2 \cdot t + 2 \cdot t \cdot t ...$ $ = 1 + 2 + 2 \cdot t + 2 \cdot t^2 = 1 + a + ar + ar^2$

$ n \geq  1 + 2 ( \frac{t^h - 1}{t - 1} )$

The number of keys held by the nodes of a _skinny_ tree (each node holds $t - 1$ keys, bar the root, which holds one):

$ k \geq 1 + (t-1)2 ( \frac{t^h - 1}{t - 1} ) = 1 + 2(t^h - 1) = 2t^h - 1$

Therefore, where $n$ is the number of keys:

$ n \geq 2t^h - 1$, $ \frac{1}{2}(n + 1) \geq t^h$

$ h \leq \log_t \left( \frac{1}{2}(n + 1) \right)$

$ h_{max} = \lfloor \log_t \left( \frac{1}{2}(n + 1) \right) \rfloor$



### Tree Search 

Search requires a scan through each node's keys; a linear scan is the simplest, but a binary search is faster.

Remembering the tree parameters (for a tree min degree $t$, with total key count $n$):

- $ t - 1\leq $ node `keys` $ \leq 2t -1$ (except for the root). 
- $ t \leq $ `children` $ \leq 2t$ (except for the root).

We found that the tree height was $O( \log_t n) \equiv O( \log n)$.

Linear scan $O(t)$ per node, $\Rightarrow$ $O(t \log n)$ **Search**.

Binary Search $O(\log_2 t)$ per node, $\Rightarrow$ $O(\log_2 n)$ **Search**.

In [3]:
def search(self, u, key):
    """Find the node containing key in the subtree rooted at u.

    Args:
        u: the node at which the search starts.
        key: the key to look for.

    Returns:
        A tuple ``(node, index)`` with ``node.keys[index] == key``,
        or None if key is not in the subtree.
    """
    # Linear scan for the first key that is not smaller than `key`.
    i = 0
    while i < len(u.keys) and key > u.keys[i]:
        i += 1
    # The attribute is `keys`; the former `u.key[i]` raised AttributeError
    # whenever this comparison was reached.
    if i < len(u.keys) and key == u.keys[i]:
        return (u, i)
    if u.is_leaf:
        return None
    # Child i holds every key lying between keys[i-1] and keys[i].
    return self.search(u.children[i], key)

# Attach the module-level function to Btree as a method.
Btree.search = search

### Tree Insertion

Two ways of handling insertion violations:
1. `Insert-Then-Fix` - Insert at leaf then reverse back up tree and resolve.
2. `Fix-Then-Insert` - Split full nodes on way down in 1 pass soln.

(1) Need to search for the correct leaf node and insert the key. If the node is full, split it into two nodes at the median and move the median up to the parent as a separator; the same goes for the parent.

#### Complexity

Insert CPU complexity is $O( t \log_t n)$

In [4]:
def insert(self, key):
    """Insert key into the B-tree.

    A full root (2t - 1 keys) is split first, so the descent always starts
    at a node that has room for one more key.
    """
    start = self.root
    if start.is_full(t=self.t):
        # split_root() installs and returns the new, non-full root.
        start = self.split_root()
    self.insert_not_full(start, key)


def split_root(self):
    """Split the root of the B-tree and return the new root.

    A fresh internal node becomes the root with the old root as its only
    child; that child is then split through split_child().
    """
    grown_root = Node(children=[self.root], is_leaf=False)
    self.root = grown_root
    self.split_child(grown_root, 0)
    return grown_root

def insert_not_full(self, u: Node, key):
    """Insert key into the subtree rooted at u.

    Args:
        u: root of the subtree; assumed to be not full.
        key: the key to insert.
    """
    # Position of the first key in u that is not smaller than `key`.
    pos = 0
    while pos < len(u.keys) and key > u.keys[pos]:
        pos += 1

    if u.is_leaf:
        u.keys.insert(pos, key)
        return

    if u.children[pos].is_full(t=self.t):
        # Split before descending; the median that moved up into u decides
        # which of the two halves receives the key.
        self.split_child(u, pos)
        if not key <= u.keys[pos]:
            pos += 1
    self.insert_not_full(u.children[pos], key)

def split_child(self, u: Node, i: int):
    """Split the child of u at index i.

    The child's median key moves up into ``u`` at position ``i``; the keys
    (and, for an internal child, the children) to the right of the median
    move into a new node that is inserted right after the child.

    Args:
        u: the parent node.
        i: index in ``u.children`` of the child to split.
    """
    degree = self.t
    left = u.children[i]
    median = left.keys[degree - 1]

    # The right half gets everything after the median.
    right = Node(is_leaf=left.is_leaf)
    right.keys = left.keys[degree:]
    if not left.is_leaf:
        right.children = left.children[degree:]

    # The left half keeps everything before the median.
    left.keys = left.keys[:degree - 1]
    left.children = left.children[:degree]

    u.keys.insert(i, median)
    u.children.insert(i + 1, right)


# Attach the insertion functions to Btree as methods.
Btree.insert = insert
Btree.split_root = split_root
Btree.insert_not_full = insert_not_full
Btree.split_child = split_child


### Tree Deletion

Deletion iterates through a `Fix-Then-Delete` node descent in one pass:

- Goal is to search through nodes and delete a key
- Only call delete on nodes with $\geq$ $t$ keys (safe to delete 1)
- Therefore, a key might need to be transferred *down to a child* before we can continue

Three Cases when calling delete recursively:
1. Reach a leaf 
2. Internal node containing target key
3. Internal node **not** containing target key

#### Complexity

Delete is $O( t \log_t n)$ for a linear scan search (one pass). In practice (large $t$) the majority of deletions happen at leaves.

In [5]:

def delete(self, u: Node, key):
    """Delete key from the subtree rooted at u in a single downward pass.

    Before descending into a child, that child is brought up to at least
    ``t`` keys (by borrowing from a sibling or by merging), so a key can
    always be removed without having to walk back up the tree.

    Args:
        u: root of the subtree; must hold at least ``t`` keys unless it is
            the root of the whole tree.
        key: the key to delete.

    Raises:
        KeyError: if the descent reaches a leaf that does not contain key.
            Nodes may already have been rebalanced on the way down.
    """
    assert len(u.keys) >= self.t or u is self.root, (
        "The node u must have at least t keys or be the root"
    )
    # Index of the first key in u that is not smaller than `key`.
    i = 0
    while i < len(u.keys) and key > u.keys[i]:
        i += 1

    # handle cases
    if u.is_leaf:  # case 1
        if i < len(u.keys) and key == u.keys[i]:
            u.keys.pop(i)
        else:
            raise KeyError(f"Key {key} not found in the tree")
        return
    # u is not a leaf
    if i < len(u.keys) and key == u.keys[i]:  # case 2
        if len(u.children[i].keys) >= self.t:  # case 2a
            # Replace key by its predecessor, taken out of the left child.
            pred_key = self.maximum(u.children[i])
            self.delete(u.children[i], pred_key)
            u.keys[i] = pred_key
        elif len(u.children[i+1].keys) >= self.t: # case 2b
            # Replace key by its successor, taken out of the right child.
            succ_key = self.minimum(u.children[i+1])
            self.delete(u.children[i+1], succ_key)
            u.keys[i] = succ_key
        else:  # case 2c, both children have t-1 keys
            # key moves down into the merged child and is deleted from there.
            self.merge_children(u, i)
            if u is self.root and not u.keys:
                # reduce height of the tree
                self.root = u.children[0]
            self.delete(u.children[i], key)
    else: # case 3 (key not in u)
        if len(u.children[i].keys) >= self.t:
            self.delete(u.children[i], key)  # recurse
        elif self.has_sibling_with_at_least_t_keys(u, i): # case 3a
            j = self.index_of_sibling_with_at_least_t_keys(u, i)
            if j == i + 1:  # right sibling has at least t keys
                # Rotate left: separator goes down, sibling's first key goes up.
                u.children[i].keys.append(u.keys[i])
                u.keys[i] = u.children[j].keys.pop(0)
                if not u.children[j].is_leaf:
                    u.children[i].children.append(u.children[j].children.pop(0))
            else:  # left sibling has at least t keys
                # Rotate right: separator goes down, sibling's last key goes up.
                u.children[i].keys.insert(0, u.keys[j])
                u.keys[j] = u.children[j].keys.pop()
                if not u.children[j].is_leaf:
                    u.children[i].children.insert(0, u.children[j].children.pop())
            self.delete(u.children[i], key)
        else: # u is not a leaf and both siblings have t-1 keys
            if i > 0:  # we merge with left sibling
                self.merge_children(u, i - 1)
                i -= 1 # we now have one less child, so we shift over
            else:  # we merge with right sibling
                self.merge_children(u, i)
            if u is self.root and not u.keys:
                # reduce height of the tree
                self.root = u.children[0]
            self.delete(u.children[i], key)

def has_sibling_with_at_least_t_keys(self, u: Node, i: int):
    """Tell whether an adjacent sibling of child i of u can spare a key.

    Args:
        u: the parent node.
        i: the index of the child whose neighbours are inspected.

    Returns:
        True if the left or the right neighbour of child i holds at least
        t keys, False otherwise.
    """
    siblings = u.children
    if i > 0 and len(siblings[i - 1].keys) >= self.t:
        return True
    return i < len(siblings) - 1 and len(siblings[i + 1].keys) >= self.t

def index_of_sibling_with_at_least_t_keys(self, u: Node, i: int):
    """Compute the index of the sibling of u with at least t keys.

    Args:
        u: the parent node.
        i: the index of the child of u.

    Returns:
        The index of the sibling of u with at least t keys.
    """
    if i > 0 and len(u.children[i-1].keys) >= self.t:
        return i - 1
    if i < len(u.children) - 1 and len(u.children[i+1].keys) >= self.t:
        return i + 1
    raise ValueError("No sibling of u has at least t keys")

def merge_children(self, u: Node, i: int):
    """Fuse children i and i+1 of u into child i.

    The key of u that separated the two children moves down between their
    keys, and child i+1 is removed from u.

    Args:
        u: the parent node.
        i: the index of the first (left) child to merge.
    """
    separator = u.keys.pop(i)
    left, right = u.children[i], u.children[i + 1]
    left.keys += [separator, *right.keys]
    if not left.is_leaf:
        left.children += right.children
    del u.children[i + 1]


# Attach the deletion functions to Btree as methods.
Btree.delete = delete
Btree.has_sibling_with_at_least_t_keys = has_sibling_with_at_least_t_keys
Btree.index_of_sibling_with_at_least_t_keys = index_of_sibling_with_at_least_t_keys
Btree.merge_children = merge_children


### Traversals, Min Max

In [6]:

def inorder(self, node: Node):
    """Print the keys of the subtree rooted at node in sorted (inorder) order.

    Each key is written to stdout followed by a single space.

    Args:
        node: Node - the root of the tree to traverse.
    """
    if node.is_leaf:
        for key in node.keys:
            print(key, end=" ")
        return

    # Internal node: child i comes before key i ...
    for idx, key in enumerate(node.keys):
        self.inorder(node.children[idx])
        print(key, end=" ")
    # ... and the last child (one more child than keys) comes after all keys.
    self.inorder(node.children[-1])

def preorder(self, node: Node):
    """Print the subtree rooted at node in preorder.

    All keys of a node are written (each followed by a space) before any
    of its children is visited.

    Args:
        node: Node - the root of the tree to traverse.
    """
    for key in node.keys:
        print(key, end=" ")
    if node.is_leaf:
        return
    for subtree in node.children:
        self.preorder(subtree)

def postorder(self, node: Node):
    """Print the subtree rooted at node in postorder.

    All children of a node are visited before its own keys are written
    (each followed by a space).

    Args:
        node: Node - the root of the tree to traverse.
    """
    subtrees = () if node.is_leaf else node.children
    for subtree in subtrees:
        self.postorder(subtree)
    for key in node.keys:
        print(key, end=" ")

def minimum(self, node: Node):
    """Find the minimum key in the subtree rooted at node.

    Args:
        node: the root of the subtree to search.

    Returns:
        The minimum key in the tree rooted at node (a key, not a node).

    Raises:
        IndexError: if the leftmost leaf holds no keys, e.g. for an empty
            tree.
    """
    # The smallest key sits in the leftmost leaf.
    while not node.is_leaf:
        node = node.children[0]
    if not node.keys:
        raise IndexError("minimum() called on an empty B-tree")
    return node.keys[0]

def maximum(self, node: Node):
    """Find the maximum key in the subtree rooted at node.

    Args:
        node: the root of the subtree to search.

    Returns:
        The maximum key in the subtree (a key, not a node).

    Raises:
        IndexError: if the rightmost leaf holds no keys, e.g. for an empty
            tree.
    """
    # The largest key sits in the rightmost leaf.
    while not node.is_leaf:
        node = node.children[-1]
    if not node.keys:
        raise IndexError("maximum() called on an empty B-tree")
    return node.keys[-1]

# Attach the traversal and min/max functions to Btree as methods.
Btree.inorder = inorder
Btree.preorder = preorder
Btree.postorder = postorder
Btree.minimum = minimum
Btree.maximum = maximum

### GraphViz Visualisation

In [7]:

def viz_btree(self, dest_path: Path, refresh: bool = False):
    """Render the tree to an image file with Graphviz.

    Args:
        dest_path: file the drawing is written to.
        refresh: when False, an existing file at dest_path is kept and
            nothing is drawn.
    """
    # We only perform the import here to prevent people from having to install
    # graphviz if they don't want to visualise the tree.
    import pygraphviz as pgv

    if dest_path.exists() and not refresh:
        print(f"Visualisation already exists at {dest_path}, skipping")
        return

    def key_str(keys):
        return " , ".join(str(key) for key in keys)

    def node_name(node):
        # Graph nodes are identified by object identity. Using the key string
        # as the identifier would collapse distinct tree nodes that hold the
        # same keys (possible with duplicate keys) into one graph node.
        return str(id(node))

    def render_preorder(node, parent, pgv_graph):
        pgv_graph.add_node(node_name(node), label=key_str(node.keys),
                           shape="rectangle", style="filled",
                           fillcolor="#fcf0cf")
        if parent is not None:
            pgv_graph.add_edge(node_name(parent), node_name(node))
        for child in node.children:
            render_preorder(node=child, parent=node, pgv_graph=pgv_graph)

    # Create a new Graph object
    pgv_graph = pgv.AGraph(directed=False)
    render_preorder(node=self.root, parent=None, pgv_graph=pgv_graph)
    print(f"Saving visualisation to {dest_path}")
    pgv_graph.draw(dest_path, prog="dot")

# Attach the visualisation function to Btree as a method.
Btree.viz_btree = viz_btree


### Putting it all together

In [8]:
def main():
    """Demo: fill a degree-2 B-tree, print its traversals, delete keys, draw it.

    The tree is rendered once after the insertions and once after the
    deletions; finally the remaining minimum and maximum keys are printed.
    """
    btree = Btree(t=2, root=Node(is_leaf=True))

    def render(filename):
        # Draw the current tree, overwriting any previous image.
        dest_path = Path(filename)
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        btree.viz_btree(dest_path=dest_path, refresh=True)

    insert_keys = [5, 3, 2, 7, 1, 8, 9, 12, 13, 4, 0, 6, -1, 19, 24, 25, -2, -3, -4, -5]
    print("Keys to be inserted:")
    print(insert_keys)
    for key in insert_keys:
        btree.insert(key)

    # print out traversals
    traversals = (
        ("Inorder traversal", btree.inorder),
        ("Preorder traversal", btree.preorder),
        ("Postorder traversal", btree.postorder),
    )
    for title, walk in traversals:
        print(title)
        walk(btree.root)
        print("")

    render("btree_.png")

    keys_to_delete = [2, 5, 6, 7, 0, 1, 3, 4, 8, 9, 12, 13, 19, 24, 25]
    print("Keys to be deleted:")
    print(keys_to_delete)
    for key in keys_to_delete:
        # The root can be replaced while deleting, so it is re-read each time.
        btree.delete(btree.root, key)

    render("btree-after-deletions_.png")

    print("Print out minimum and maximum values")
    print(f"Minimum key: {btree.minimum(btree.root)}")
    print(f"Maximum key: {btree.maximum(btree.root)}")

    # Expected output:
    #
    # Keys to be inserted:
    # [5, 3, 2, 7, 1, 8, 9, 12, 13, 4, 0, 6, -1, 19, 24, 25, -2, -3, -4, -5]
    # Inorder traversal
    # -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 12 13 19 24 25
    # Preorder traversal
    # 1 7 -3 -1 -5 -4 -2 0 3 2 4 5 6 9 13 8 12 19 24 25
    # Postorder traversal
    # -5 -4 -2 0 -3 -1 2 4 5 6 3 8 12 19 24 25 9 13 1 7
    # Saving visualisation to btree_.png
    # Keys to be deleted:
    # [2, 5, 6, 7, 0, 1, 3, 4, 8, 9, 12, 13, 19, 24, 25]
    # Saving visualisation to btree-after-deletions_.png
    # Print out minimum and maximum values
    # Minimum key: -5
    # Maximum key: -1


main()


Keys to be inserted:
[5, 3, 2, 7, 1, 8, 9, 12, 13, 4, 0, 6, -1, 19, 24, 25, -2, -3, -4, -5]
Inorder traversal
-5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 12 13 19 24 25 
Preorder traversal
1 7 -3 -1 -5 -4 -2 0 3 2 4 5 6 9 13 8 12 19 24 25 
Postorder traversal
-5 -4 -2 0 -3 -1 2 4 5 6 3 8 12 19 24 25 9 13 1 7 
Saving visualisation to btree_.png
Keys to be deleted:
[2, 5, 6, 7, 0, 1, 3, 4, 8, 9, 12, 13, 19, 24, 25]
Saving visualisation to btree-after-deletions_.png
Print out minimum and maximum values
Minimum key: -5
Maximum key: -1
