In [1]:
# Suffix tree node
class SuffixTreeNode:
    def __init__(self):
        self.children = {}
        self.suffix_link = None
        self.start = None
        self.end = None
        self.suffix_index = None
        self.P_u = set()
        self.S_u = set()
    
    def edge_length(self):
        return self.end[0] - self.start + 1

In [2]:
class SuffixTree:
    def __init__(self, alphabet):
        self.root = None
        self.last_new_node = None

        # Active Point
        self.active_node = None
        self.active_edge = -1
        self.active_length = 0

        # Remaining suffix
        self.remainder = 0

        self.leaf_end = [-1]
        self.root_end = None
        self.split_end = None
        self.size = -1

        self.alphabet = alphabet

    def _get_new_node(self, start, end, string_number):
        node = SuffixTreeNode()

        for c in self.alphabet:
            node.children[c] = None

        node.suffix_link = self.root
        node.start = start
        node.end = end
        node.suffix_index = [-1, string_number]

        return node
    
    # Walk down procedure
    def walk_down(self, current_node):
        edge_length = current_node.edge_length()
        if (self.active_length >= edge_length):
            self.active_edge += edge_length
            self.active_length -= edge_length
            self.active_node = current_node
            return 1
        return 0

    # Extention procedure
    def extend_suffix_tree(self, position, text, string_number):
        self.leaf_end[0] = position
        self.remainder += 1
        self.last_new_node = None

        while self.remainder > 0:
            if self.active_length == 0:
                self.active_edge = position

            if self.active_node.children[text[self.active_edge]] == None:
                self.active_node.children[text[self.active_edge]] = self._get_new_node(position, self.leaf_end, string_number)
                if self.last_new_node != None:
                    self.last_new_node.suffix_link = self.active_node
                    self.last_new_node = None

            else:
                next_node = self.active_node.children[text[self.active_edge]]
                if self.walk_down(next_node):
                    continue

                if text[next_node.start + self.active_length] == text[position]:
                    if self.last_new_node != None and self.active_node != self.root:
                        self.last_new_node.suffix_link = self.active_node
                        self.last_new_node = None

                    self.active_length += 1
                    break

                self.split_end = next_node.start + self.active_length - 1

                split = self._get_new_node(next_node.start, [self.split_end], string_number)
                self.active_node.children[text[self.active_edge]] = split

                split.children[text[position]] = self._get_new_node(position, self.leaf_end, string_number)
                next_node.start += self.active_length
                split.children[text[next_node.start]] = next_node

                if self.last_new_node != None:
                    self.last_new_node.suffix_link = split

                self.last_new_node = split

            self.remainder -= 1

            if self.active_node == self.root and self.active_length > 0:
                self.active_length -= 1
                self.active_edge = position - self.remainder + 1
            elif self.active_node != self.root:
                self.active_node = self.active_node.suffix_link
    
    # Building suffix tree
    def build_suffix_tree(self, text, string_number):
        self.size = len(text)
        print(self.size)
        
        if self.root == None:
            self.root = self._get_new_node(-1, self.root_end, string_number)
            self.root_end = [-1]
        
        self.active_node = self.root
        for i in range(self.size):
            self.extend_suffix_tree(i, text, string_number)
        
        label_height = 0
        self.set_suffix_index_by_dfs(self.root, label_height, string_number)
    
    def build_generalized_suffix_tree(self, strings):
        self.strings = strings
        for i, text in enumerate(self.strings):
            self.build_suffix_tree(text, i)
    
    def set_suffix_index_by_dfs(self, n, label_height, string_number):
        if n == None:
            return
        
        if n.start != -1:
            print(self.strings[string_number][n.start:n.end[0] + 1], end="")
        
        leaf = 1
        for c in self.alphabet:
            if n.children[c] != None:
                child_node = n.children[c]
                if leaf == 1 and n.start != -1:
                    print(" [%d, %d]\n"%(n.suffix_index[0], n.suffix_index[1]))
                leaf = 0
                self.set_suffix_index_by_dfs(n.children[c], label_height + n.children[c].edge_length(), string_number)
        
        if leaf == 1:
            # Leaf node
            n.suffix_index = [self.size - label_height, string_number]
            print(" [%d, %d]\n" % (n.suffix_index[0], n.suffix_index[1]))

In [3]:
# Testing - Check 
alphabet = '#$ACGT'
text1 = 'AGCCTC$'
text2 = 'CTCAGC#'
t = SuffixTree(alphabet)
t.build_generalized_suffix_tree([text1, text2])

7
$ [6, 0]

AGCCTC$ [0, 0]

C [-1, 0]

$ [5, 0]

CTC$ [2, 0]

TC$ [3, 0]

GCCTC$ [1, 0]

TC$ [4, 0]

7
 [-1, 1]

# [6, 1]

TCAGC# [1, 1]

# [6, 1]

 [-1, 1]

AGC# [3, 1]

CTCAGC# [0, 1]

C [-1, 0]

# [5, 1]

# [5, 1]

AGC# [2, 0]

CAGC# [-3, 1]

GC# [3, 0]

TCAGC# [-3, 1]

 [-1, 1]

GC# [4, 1]

TCAGC# [1, 1]

GC# [4, 1]



In [19]:
t.root.P_u

set()