### IMPORTS ###

In [1]:
from collections import deque
import copy

 ### CLASSES ###

In [2]:
class Sequence_Node:
    def __init__(self, seq, id):
        self.seq = seq
        self.id = id

In [3]:
class Sequence_Node_Set:
    def __init__(self, id):
        self.id = id
        self.members = set()
        
    def add_member(self, member):
        self.members.add(member)

In [4]:
class Reference_Pair:
    def __init__(self, pair_id, seq1, seq2, num_seq):
        self.pair_id = pair_id
        self.seq1 = seq1
        self.seq2 = seq2
        
        self.node_sets = [] 
        for i in range(2, num_seq):
            self.node_sets.append(Sequence_Node_Set(i))
            
        self.tree_list = []

        
    

In [5]:
class Tree_Node:
    def __init__(self, seq, seq_id, depth=0):
        self.seq = seq
        self.seq_id = seq_id
        self.children = []
        self.depth = depth
        self.parent = None

    def add_child(self, child_node):
        child_node.parent = self
        child_node.depth = self.depth + 1
        self.children.append(child_node)
        

    def remove_child(self, child_node):
        if child_node in self.children:
            child_node.parent = None
            self.children.remove(child_node)

In [6]:
class Tree:
    def __init__(self, root):
        self.root = Tree_Node(root.seq, root.id)
        self.current_leaves = []
        self.height = 0
        self.id = 0
    
    def save_leaf(self, leaf):
        self.current_leaves.append(leaf)
        
    def unsave_leaf(self, leaf):
        if leaf in self.current_leaves:
            self.current_leaves.remove(leaf)

### FUNCTIONS ###

In [7]:
def read_file_to_list(filename):
    try:
        # Open the file in read mode
        with open(filename, 'r') as file:
            # Read all lines from the file and store them in a list
            lines = file.readlines()
        # Remove newline characters from each line and strip any leading/trailing whitespaces
        lines = [line.strip() for line in lines]
        
        return lines
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return []
    except Exception as e:
        print(f"Error: {e}")
        return []

In [8]:
def hamming_distance(str1, str2):

    return sum(bit1 != bit2 for bit1, bit2 in zip(str1, str2))
    

In [9]:
def find_paths(node, path, paths):
    if node is None:
        return
    path.append(node.seq)
    if not node.children:
        paths.append(path)
    
    for child in node.children:
        branch_path = copy.deepcopy(path)
        find_paths(child, branch_path, paths)
        
    
    
    return paths

In [10]:
def extendable(node, tree, ith_seq):
    queue = deque()
    if hamming_distance(tree.root.seq, node.seq) <= _2d:
        queue.append(tree.root)
        # print("Root:", tree.root.seq)
        # print("Node to add: ", node.seq, " from sequence ", ith_seq)
        iteration = 0
        while len(queue) > 0 and queue[0].depth < ith_seq - 3:
            
            # print("Queue contents at iteration ", iteration, ":")
            # for i in queue:
            #     print(i.seq)
            
            # print("while loop check")
            front = queue.popleft()
            for child in front.children:
                if hamming_distance(child.seq, node.seq) <= _2d:
                    queue.append(child)
            
            iteration += 1
        
        
    # print("Queue contents returned:")
    # for i in queue:
    #     print(i.seq)

    return queue
        

In [11]:
def prune_branches(tree):
    for leaf in tree.current_leaves:
        if leaf.depth < tree.height:
            current_node = leaf
            num_siblings = len(current_node.parent.children) - 1
            branch_list = [leaf.seq]
            while num_siblings == 0:
                current_node = current_node.parent
                num_siblings = len(current_node.parent.children) - 1
                branch_list.append(current_node.seq)
            parent = current_node.parent
            parent.remove_child(current_node)
            tree.unsave_leaf(leaf)
            print("Tree ID pruned: ", tree.id)
            print()
            print("branch ", branch_list, " pruned from sequence ", parent.seq_id)
            print()

### Inputs ###

In [12]:
l = 11
d = 4

# minimal_test.txt consensus string: GAGA
input_filename = 'random_test.txt'


### Step 1: Node Selection ###

In [13]:

_2d = 2 * d

sequence_list = read_file_to_list(input_filename)
num_seq = len(sequence_list)

reference_seq_1 = sequence_list[0]
reference_seq_2 = sequence_list[1]
reference_pair_set = set()
reference_pair_list = []
reference_pair_counter = 0
    
for i in range(len(reference_seq_1) - l + 1):
    for j in range(len(reference_seq_2) - l + 1):
        if hamming_distance(reference_seq_1[i:i+l], reference_seq_2[j:j+l]) <= _2d:
            ref_sub_seq = reference_seq_1[i:i+l]
            ref_sub_seq2 = reference_seq_2[j:j+l]
            if (ref_sub_seq, ref_sub_seq2) in reference_pair_set:
                break
            reference_pair = Reference_Pair(pair_id=reference_pair_counter, seq1=ref_sub_seq, seq2=ref_sub_seq2, num_seq=num_seq)
            
            print("refpair candidate: ", ref_sub_seq, ref_sub_seq2)

            
            count_seqs_with_nodes = 0
            for k in range(2, num_seq):                
                has_nodes = False
                for m in range(len(sequence_list[k]) - l + 1):
                    sub_seq = sequence_list[k][m:m+l]
                    if hamming_distance(sub_seq, ref_sub_seq) <= _2d and hamming_distance(sub_seq, ref_sub_seq2) <= _2d:
                        seq_node = Sequence_Node(seq=sub_seq, id=k)
                        reference_pair.node_sets[k-2].add_member(seq_node)
                        has_nodes = True
                if has_nodes:
                    count_seqs_with_nodes += 1
            
            if count_seqs_with_nodes == num_seq - 2:
                reference_pair_list.append(reference_pair)
                reference_pair_set.add((ref_sub_seq, ref_sub_seq2))
                reference_pair_counter += 1
                print("Reference Pair ", reference_pair.pair_id)
                

            




refpair candidate:  ACGGGATCGAT AAGCTTCCGAT
Reference Pair  0
refpair candidate:  ACGGGATCGAT TTCCGATCGGC
Reference Pair  1
refpair candidate:  ACGGGATCGAT TCCGATCGGCA
Reference Pair  2
refpair candidate:  CGGGATCGATC AAGCTTCCGAT
Reference Pair  3
refpair candidate:  CGGGATCGATC AGCTTCCGATC
Reference Pair  4
refpair candidate:  CGGGATCGATC TCCGATCGGCA
Reference Pair  5
refpair candidate:  GGGATCGATCT AAGCTTCCGAT
Reference Pair  6
refpair candidate:  GGGATCGATCT AGCTTCCGATC
Reference Pair  7
refpair candidate:  GGGATCGATCT GCTTCCGATCG
Reference Pair  8
refpair candidate:  GGATCGATCTA AGCTTCCGATC
Reference Pair  9
refpair candidate:  GGATCGATCTA GCTTCCGATCG
Reference Pair  10
refpair candidate:  GGATCGATCTA CTTCCGATCGG
refpair candidate:  GATCGATCTAG AAGCTTCCGAT
refpair candidate:  GATCGATCTAG GCTTCCGATCG
Reference Pair  11
refpair candidate:  GATCGATCTAG CTTCCGATCGG
Reference Pair  12
refpair candidate:  GATCGATCTAG TTCCGATCGGC
Reference Pair  13
refpair candidate:  ATCGATCTAGC AAGCTTCC

### TEST NODE SELECTION ###

In [14]:
print("Reference pair set: ")
for pair in reference_pair_set:
    print(pair)

for pair in reference_pair_list:
    print("Reference Pair", pair.pair_id, ": ", pair.seq1, ", ", pair.seq2)
    print()

    for node_set in pair.node_sets:
        print("Sequence ", node_set.id, "'s nodes for pair ", pair.pair_id, ": ")
        for node in node_set.members:
            print(node.seq)





Reference pair set: 
('CGGGATCGATC', 'TCCGATCGGCA')
('CGGGATCGATC', 'AGCTTCCGATC')
('GGATCGATCTA', 'GCTTCCGATCG')
('ACGGGATCGAT', 'TCCGATCGGCA')
('CGGGATCGATC', 'AAGCTTCCGAT')
('ATCGATCTAGC', 'TTCCGATCGGC')
('GGGATCGATCT', 'AAGCTTCCGAT')
('GGATCGATCTA', 'AGCTTCCGATC')
('ACGGGATCGAT', 'AAGCTTCCGAT')
('GATCGATCTAG', 'GCTTCCGATCG')
('GATCGATCTAG', 'CTTCCGATCGG')
('ACGGGATCGAT', 'TTCCGATCGGC')
('GGGATCGATCT', 'GCTTCCGATCG')
('ATCGATCTAGC', 'TCCGATCGGCA')
('ATCGATCTAGC', 'CTTCCGATCGG')
('GATCGATCTAG', 'TTCCGATCGGC')
('GGGATCGATCT', 'AGCTTCCGATC')
Reference Pair 0 :  ACGGGATCGAT ,  AAGCTTCCGAT

Sequence  2 's nodes for pair  0 : 
AAACGCTCCTA
ACGCTCCTAGC
AACGCTCCTAG
Sequence  3 's nodes for pair  0 : 
AAGCTCGATAG
TCGATAGGCTT
CTCGATAGGCT
Sequence  4 's nodes for pair  0 : 
CCGCTTAGGCT
TAGGCTAGGCT
Sequence  5 's nodes for pair  0 : 
TGCTTAACGTT
Reference Pair 1 :  ACGGGATCGAT ,  TTCCGATCGGC

Sequence  2 's nodes for pair  1 : 
ACGCTCCTAGC
CGCTCCTAGCT
AAACGCTCCTA
Sequence  3 's nodes for pair  1

### Step 2: Tree Construction ###

In [15]:
def merge(new_clique, cliques):
    new_cliques = []
    for prev_clique in cliques:
        
        match = True
        for seq_1 in prev_clique:
            for seq_2 in new_clique:
                if hamming_distance(seq_1, seq_2) > _2d:
                    match = False
                    break
        if not match:
            break
        
        merged_clique = copy.deepcopy(prev_clique)
        for seq in clique:
            if seq not in new_clique:
                merged_clique.append(seq)
        new_cliques.append(merged_clique)
    new_cliques.append(new_clique)
    cliques.extend(new_cliques)

In [16]:
tree_count = 0
merged_cliques = []
for pair in reference_pair_list:
    node_sets = pair.node_sets
    for root in node_sets[0].members:
        root_node = Sequence_Node(seq=root.seq, id=root.id)
        tree = Tree(root_node)
        
        for i in range(3, num_seq):
            flag = False
            for node in node_sets[i-2].members:
                branches = extendable(node, tree, i)
                if branches:
                    # print("branches check")
                    for branch in branches:
                        new_leaf = Tree_Node(seq=node.seq, seq_id=node.id)
                        branch.add_child(new_leaf)
                        tree.save_leaf(new_leaf)
                        tree.unsave_leaf(branch)
                    flag = True
            if not flag:
                tree = None
                break
            tree.height += 1
            prune_branches(tree)
        if tree:    
            pair.tree_list.append(tree)
            tree.id = tree_count
            tree_count += 1
            
            one_tree_cliques = find_paths(tree.root, [], [])
            # print("Tree ID: ", tree.id)
            # clique_count = 0
            for clique in one_tree_cliques:
                # print("clique number: ", clique_count)
                # clique_count += 1
                merge(clique, merged_cliques)
            
                        
                
            
                
            
            
            
        
        

Tree ID pruned:  0

branch  ['AAGCTCGATAG']  pruned from sequence  2

Tree ID pruned:  0

branch  ['TTAGGCTAGGC', 'CTCGATAGGCT']  pruned from sequence  2

Tree ID pruned:  0

branch  ['AAAGCTCGATA']  pruned from sequence  2

Tree ID pruned:  0

branch  ['TAGGCTAGGCT', 'AAAGCTCGATA']  pruned from sequence  2

Tree ID pruned:  0

branch  ['CCGCTTAGGCT']  pruned from sequence  3

Tree ID pruned:  0

branch  ['TAGGCTAGGCT']  pruned from sequence  3

Tree ID pruned:  0

branch  ['AAGCTCGATAG']  pruned from sequence  2



### TEST TREE CONSTRUCTION ###

In [17]:
print(len(merged_cliques))
for motif in merged_cliques:
    print(len(motif))

34
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
3
4
4
4
4
4
4
4
4
4
4
4
4
4
4


In [18]:
for pair in reference_pair_list:
    print("Reference Pair ", pair.pair_id, ": ", pair.seq1, ", ", pair.seq2)
    print("Number of trees: ", len(pair.tree_list))
    
    for tree in pair.tree_list:
        print("Tree ID: ", tree.id)
        print("Tree height:", tree.height)
        path, paths = [], []
        tree_paths = find_paths(tree.root, path, paths)
        print("Number of paths in this tree: ", len(tree_paths))
        for tree_path in tree_paths:
            print("Length of path: ", len(tree_path))
            for node in tree_path:
                print(node)

Reference Pair  0 :  ACGGGATCGAT ,  AAGCTTCCGAT
Number of trees:  0
Reference Pair  1 :  ACGGGATCGAT ,  TTCCGATCGGC
Number of trees:  3
Tree ID:  0
Tree height: 3
Number of paths in this tree:  1
Length of path:  4
ACGCTCCTAGC
GCTCGATAGGC
TTAGGCTAGGC
CGCTGCTTAAC
Tree ID:  1
Tree height: 3
Number of paths in this tree:  1
Length of path:  4
CGCTCCTAGCT
GCTCGATAGGC
TTAGGCTAGGC
CGCTGCTTAAC
Tree ID:  2
Tree height: 3
Number of paths in this tree:  1
Length of path:  4
AAACGCTCCTA
GCTCGATAGGC
TTAGGCTAGGC
CGCTGCTTAAC
Reference Pair  2 :  ACGGGATCGAT ,  TCCGATCGGCA
Number of trees:  2
Tree ID:  3
Tree height: 3
Number of paths in this tree:  5
Length of path:  4
CGCTCCTAGCT
CTCGATAGGCT
CCGCTTAGGCT
GCTGCTTAACG
Length of path:  4
CGCTCCTAGCT
CTCGATAGGCT
CCGCTTAGGCT
TGCTTAACGTT
Length of path:  4
CGCTCCTAGCT
CTCGATAGGCT
TAGGCTAGGCT
GCTGCTTAACG
Length of path:  4
CGCTCCTAGCT
CTCGATAGGCT
TAGGCTAGGCT
TGCTTAACGTT
Length of path:  4
CGCTCCTAGCT
CTCGATAGGCT
TTAGGCTAGGC
GCTGCTTAACG
Tree ID:  4
Tree hei

In [19]:

## same as above with less prints ##

for pair in reference_pair_list:
    print("Reference Pair ", pair.pair_id, ": ", pair.seq1, ", ", pair.seq2)
    
    for tree in pair.tree_list:
        print("Tree ID: ", tree.id)
        path, paths = [], []
        tree_paths = find_paths(tree.root, path, paths)
        for tree_path in tree_paths:
            print()
            print("Motif: ")
            for node in tree_path:
                print(node)
            print()

Reference Pair  0 :  ACGGGATCGAT ,  AAGCTTCCGAT
Reference Pair  1 :  ACGGGATCGAT ,  TTCCGATCGGC
Tree ID:  0

Motif: 
ACGCTCCTAGC
GCTCGATAGGC
TTAGGCTAGGC
CGCTGCTTAAC

Tree ID:  1

Motif: 
CGCTCCTAGCT
GCTCGATAGGC
TTAGGCTAGGC
CGCTGCTTAAC

Tree ID:  2

Motif: 
AAACGCTCCTA
GCTCGATAGGC
TTAGGCTAGGC
CGCTGCTTAAC

Reference Pair  2 :  ACGGGATCGAT ,  TCCGATCGGCA
Tree ID:  3

Motif: 
CGCTCCTAGCT
CTCGATAGGCT
CCGCTTAGGCT
GCTGCTTAACG


Motif: 
CGCTCCTAGCT
CTCGATAGGCT
CCGCTTAGGCT
TGCTTAACGTT


Motif: 
CGCTCCTAGCT
CTCGATAGGCT
TAGGCTAGGCT
GCTGCTTAACG


Motif: 
CGCTCCTAGCT
CTCGATAGGCT
TAGGCTAGGCT
TGCTTAACGTT


Motif: 
CGCTCCTAGCT
CTCGATAGGCT
TTAGGCTAGGC
GCTGCTTAACG

Tree ID:  4

Motif: 
AACGCTCCTAG
CTCGATAGGCT
TAGGCTAGGCT
GCTGCTTAACG

Reference Pair  3 :  CGGGATCGATC ,  AAGCTTCCGAT
Tree ID:  5

Motif: 
ACGCTCCTAGC
TCGATAGGCTT
CCGCTTAGGCT
CTGCTTAACGT

Tree ID:  6

Motif: 
GCTCCTAGCTT
CTCGATAGGCT
TAGGCTAGGCT
CTGCTTAACGT


Motif: 
GCTCCTAGCTT
CTCGATAGGCT
CCGCTTAGGCT
CTGCTTAACGT


Motif: 
GCTCCTAGCTT
TCGATAG