In [1]:
class Sequence_Node:
    """A candidate substring together with the index of its source sequence.

    Attributes are set directly from the constructor arguments:
    seq -- the substring itself.
    id  -- index of the input sequence the substring was taken from.

    Instances keep Python's default identity-based equality and hashing.
    """

    def __init__(self, seq, id):
        self.seq, self.id = seq, id

In [2]:
class Sequence_Node_Set:
    """Unordered collection of nodes that belong to one input sequence.

    Attributes:
    id      -- identifier passed to the constructor.
    members -- set holding every object passed to add_member.
    """

    def __init__(self, id):
        self.members = set()
        self.id = id

    def add_member(self, member):
        """Insert `member` into the set (a no-op if it is already present)."""
        self.members.update((member,))

In [3]:
class Reference_Pair:
    """A pair of reference substrings plus the per-sequence data built around it.

    Attributes:
    pair_id   -- identifier passed to the constructor.
    seq1/seq2 -- the two reference substrings.
    node_sets -- one Sequence_Node_Set per index in range(2, num_seq), so
                 node_sets[k - 2] is the set created with id k.
    tree_list -- starts empty; filled by later cells.
    """

    def __init__(self, pair_id, seq1, seq2, num_seq):
        self.pair_id = pair_id
        self.seq1, self.seq2 = seq1, seq2

        # Indices 0 and 1 get no set of their own; sets start at index 2.
        self.node_sets = [Sequence_Node_Set(index) for index in range(2, num_seq)]

        self.tree_list = []

        
    

In [4]:
def read_file_to_list(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Parameters:
    filename -- path of the file to open in text read mode.

    Returns:
    A list with one stripped string per line of the file (blank lines become
    empty strings). If the file does not exist, or any other exception is
    raised while reading, a message is printed and an empty list is returned.
    """
    try:
        with open(filename, 'r') as handle:
            # Iterating the handle yields the same lines as readlines().
            return [raw_line.strip() for raw_line in handle]
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
    except Exception as e:
        print(f"Error: {e}")
    # Reached only when one of the handlers above ran.
    return []

In [5]:
def hamming_distance(str1, str2):
    """Count the positions at which `str1` and `str2` differ.

    The two inputs are walked in lockstep with zip, so when their lengths
    differ only the first min(len(str1), len(str2)) positions are compared
    and the surplus tail of the longer input is ignored.

    Returns:
    The number of mismatching positions as an int (0 for empty input).
    """
    mismatches = 0
    for left, right in zip(str1, str2):
        if left != right:
            mismatches += 1
    return mismatches
    

### Inputs ###

In [6]:
# Length of every substring (window) that the cells below slice out and compare.
l = 4
# Distance parameter; the cells below accept a Hamming distance of up to 2 * d.
# NOTE(review): presumably the allowed number of mutations per motif occurrence - confirm.
d = 1

# Text file handed to read_file_to_list; each line is treated as one sequence.
input_filename = 'minimal_test.txt'


### Step 1: Node Selection ###

In [7]:

# Step 1: node selection.
#
# The first two input sequences act as references. Every pair of length-l
# windows (one from each reference) whose Hamming distance is at most 2*d is a
# candidate reference pair. For each candidate, every remaining sequence
# contributes as "nodes" all of its length-l windows that lie within 2*d of
# BOTH reference windows. A candidate is kept only when every remaining
# sequence contributed at least one node.

# Distance threshold shared by this cell and the tree-construction code below.
_2d = 2 * d

sequence_list = read_file_to_list(input_filename)
num_seq = len(sequence_list)

# read_file_to_list returns [] on I/O errors; fail with a clear message instead
# of an opaque IndexError on the reference look-ups below.
if num_seq < 2:
    raise ValueError(
        f"Need at least two sequences in '{input_filename}' to form reference pairs, "
        f"got {num_seq}."
    )

reference_seq_1 = sequence_list[0]
reference_seq_2 = sequence_list[1]
reference_pair_list = []
reference_pair_counter = 0

for i in range(len(reference_seq_1) - l + 1):
    ref_sub_seq = reference_seq_1[i:i+l]
    for j in range(len(reference_seq_2) - l + 1):
        ref_sub_seq2 = reference_seq_2[j:j+l]
        if hamming_distance(ref_sub_seq, ref_sub_seq2) > _2d:
            continue

        reference_pair = Reference_Pair(pair_id=reference_pair_counter, seq1=ref_sub_seq, seq2=ref_sub_seq2, num_seq=num_seq)

        count_seqs_with_nodes = 0
        for k in range(2, num_seq):
            has_nodes = False
            for m in range(len(sequence_list[k]) - l + 1):
                sub_seq = sequence_list[k][m:m+l]
                # Compare against the second reference *window*. Passing the
                # whole second reference sequence would silently compare against
                # its first l characters only, because hamming_distance stops at
                # the shorter input.
                if hamming_distance(sub_seq, ref_sub_seq) <= _2d and hamming_distance(sub_seq, ref_sub_seq2) <= _2d:
                    seq_node = Sequence_Node(seq=sub_seq, id=k)
                    # node_sets[0] belongs to sequence index 2, hence the offset.
                    reference_pair.node_sets[k-2].add_member(seq_node)
                    has_nodes = True
            if has_nodes:
                count_seqs_with_nodes += 1

        # Keep the pair only if every non-reference sequence produced a node.
        # The counter advances only for kept pairs, so pair ids stay contiguous.
        if count_seqs_with_nodes == num_seq - 2:
            reference_pair_list.append(reference_pair)
            reference_pair_counter += 1
            print("Reference Pair ", reference_pair.pair_id)
                

            




Reference Pair  0
Reference Pair  1
Reference Pair  2


In [8]:
# Report every reference pair kept by Step 1 together with the node
# substrings collected for each remaining sequence.
for pair in reference_pair_list:
    print("Reference Pair", pair.pair_id, ": ", pair.seq1, ", ", pair.seq2)
    print()

    # One node set per non-reference sequence; node_set.id is that sequence's index.
    for node_set in pair.node_sets:
        print("Sequence ", node_set.id, "'s nodes for pair ", pair.pair_id, ": ")
        # members is a set, so the print order of nodes is not guaranteed.
        for node in node_set.members:
            print(node.seq)





Reference Pair 0 :  TTGA ,  TTGA

Sequence  2 's nodes for pair  0 : 
TAGA
Sequence  3 's nodes for pair  0 : 
GCGA
Sequence  4 's nodes for pair  0 : 
GTGA
Reference Pair 1 :  GAGA ,  TTGA

Sequence  2 's nodes for pair  1 : 
TAGA
Sequence  3 's nodes for pair  1 : 
GCGA
Sequence  4 's nodes for pair  1 : 
GTGA
Reference Pair 2 :  GAGA ,  GACA

Sequence  2 's nodes for pair  2 : 
TAGA
Sequence  3 's nodes for pair  2 : 
GCGA
Sequence  4 's nodes for pair  2 : 
GTGA


### Step 2: Tree Construction ###

In [9]:
class Tree_Node:
    """One node of a tree: a substring plus links to its parent and children.

    Attributes:
    seq      -- the substring stored in this node.
    seq_id   -- index of the sequence the substring came from.
    children -- list of child Tree_Node objects, in insertion order.
    depth    -- distance from the root; overwritten when the node is attached
                to a parent via add_child.
    parent   -- the parent node, or None while the node is detached.
    """

    def __init__(self, seq, seq_id, depth=0):
        self.parent = None
        self.children = []
        self.seq, self.seq_id, self.depth = seq, seq_id, depth

    def add_child(self, child_node):
        """Attach `child_node` below this node and set its depth to self.depth + 1.

        Only the child's own depth is updated; its descendants are not touched.
        """
        self.children.append(child_node)
        child_node.depth = self.depth + 1
        child_node.parent = self

    def remove_child(self, child_node):
        """Detach `child_node` if it is one of this node's children; otherwise do nothing."""
        if child_node not in self.children:
            return
        self.children.remove(child_node)
        child_node.parent = None

In [10]:
class Tree:
    """A tree grown from one root node, plus bookkeeping used while it is built.

    Attributes:
    root           -- a fresh Tree_Node created from root.seq and root.id.
    current_leaves -- list of leaves registered through save_leaf.
    height         -- starts at 0; incremented by the construction cell.
    id             -- starts at 0; assigned by the construction cell.
    """

    def __init__(self, root):
        self.root = Tree_Node(root.seq, root.id)
        self.current_leaves = []
        self.height, self.id = 0, 0

    def save_leaf(self, leaf):
        """Register `leaf` at the end of current_leaves."""
        self.current_leaves += [leaf]

    def unsave_leaf(self, leaf):
        """Remove the first occurrence of `leaf` from current_leaves, if any."""
        try:
            self.current_leaves.remove(leaf)
        except ValueError:
            # The leaf was never registered: nothing to undo.
            pass

In [11]:
from collections import deque
import copy

In [12]:
def extendable(node, tree, ith_seq):
    """Find the tree nodes under which `node` can be attached.

    Starting at the root, the tree is walked breadth-first, following only
    nodes whose substring is within the global threshold _2d of node.seq.
    The walk stops as soon as the node at the front of the queue has depth
    ith_seq - 3 or deeper, or when the queue runs empty.

    Parameters:
    node    -- candidate whose .seq is compared with the tree nodes.
    tree    -- tree to search; only tree.root is read here.
    ith_seq -- index of the sequence `node` comes from; fixes the stop depth.

    Returns:
    A deque with the nodes left in the queue when the walk stopped. It is
    empty when the root is too far from `node` or every path was cut off.

    Progress is traced on stdout.
    """
    queue = deque()
    root = tree.root
    if hamming_distance(root.seq, node.seq) <= _2d:
        queue.append(root)
        print("Root:", root.seq)
        print("Node to add: ", node.seq, " from sequence ", ith_seq)

        stop_depth = ith_seq - 3
        iteration = 0
        while queue and queue[0].depth < stop_depth:
            print("Queue contents at iteration ", iteration, ":")
            for pending in queue:
                print(pending.seq)

            print("while loop check")
            front = queue.popleft()
            # Only children compatible with the candidate stay on the frontier.
            queue.extend(
                child
                for child in front.children
                if hamming_distance(child.seq, node.seq) <= _2d
            )
            iteration += 1

    print("Queue contents returned:")
    for pending in queue:
        print(pending.seq)

    return queue
        

In [13]:
def prune_branches(tree):
    """Remove branches whose leaf did not reach the tree's current height.

    For every registered leaf with depth < tree.height, the function climbs
    from the leaf while the current node is its parent's only child, then
    detaches the topmost such node from its parent. This removes the whole
    dead chain at once and leaves no childless inner node behind.

    Pruned leaves are also dropped from tree.current_leaves. Without that, a
    later call would revisit an already detached leaf and fail on its missing
    parent.

    Parameters:
    tree -- object exposing `height` and `current_leaves`; the leaves expose
            `depth`, `parent`, `children` and `remove_child`.

    Returns:
    None. The tree and tree.current_leaves are modified in place.
    """
    surviving_leaves = []
    for leaf in tree.current_leaves:
        if leaf.depth >= tree.height:
            surviving_leaves.append(leaf)
            continue

        # Climb to the highest ancestor that exists only to carry this leaf.
        current_node = leaf
        while current_node.parent is not None and len(current_node.parent.children) == 1:
            current_node = current_node.parent

        # A node without a parent is the root or already detached: nothing to cut.
        if current_node.parent is not None:
            current_node.parent.remove_child(current_node)

    # Slice assignment keeps the list object that other code may hold on to.
    tree.current_leaves[:] = surviving_leaves

In [14]:
# Step 2: tree construction.
#
# For every reference pair, each node of the first non-reference sequence
# (sequence index 2, stored in node_sets[0]) becomes the root of a tree. The
# tree is then grown one level per remaining sequence: a node of sequence i is
# attached under every node returned by extendable(). If no node of some
# sequence can be attached, the tree is discarded.
tree_count = 0
for pair in reference_pair_list:
    node_sets = pair.node_sets
    # With exactly two input sequences there are no node sets, so there is
    # nothing to root a tree at; skip instead of failing on node_sets[0].
    if not node_sets:
        continue

    for root_node in node_sets[0].members:
        root = Sequence_Node(seq=root_node.seq, id=root_node.id)
        tree = Tree(root)

        for i in range(3, num_seq):
            extended = False
            # node_sets[i-2] holds the nodes of sequence i (sets start at index 2).
            for candidate in node_sets[i-2].members:
                branches = extendable(candidate, tree, i)
                if branches:
                    print("branches check")
                    for branch in branches:
                        new_leaf = Tree_Node(seq=candidate.seq, seq_id=candidate.id)
                        branch.add_child(new_leaf)
                        tree.save_leaf(new_leaf)
                        # The branch now has a child, so it is no longer a leaf.
                        tree.unsave_leaf(branch)
                    extended = True
            if not extended:
                # Sequence i cannot be represented in this tree: drop the tree.
                tree = None
                break
            tree.height += 1
            prune_branches(tree)

        if tree is not None:
            tree.id = tree_count
            pair.tree_list.append(tree)
            tree_count += 1

    
            
            
        
        

Root: TAGA
Node to add:  GCGA  from sequence  3
Queue contents returned:
TAGA
branches check
Root: TAGA
Node to add:  GTGA  from sequence  4
Queue contents at iteration  0 :
TAGA
while loop check
Queue contents returned:
GCGA
branches check
Root: TAGA
Node to add:  GCGA  from sequence  3
Queue contents returned:
TAGA
branches check
Root: TAGA
Node to add:  GTGA  from sequence  4
Queue contents at iteration  0 :
TAGA
while loop check
Queue contents returned:
GCGA
branches check
Root: TAGA
Node to add:  GCGA  from sequence  3
Queue contents returned:
TAGA
branches check
Root: TAGA
Node to add:  GTGA  from sequence  4
Queue contents at iteration  0 :
TAGA
while loop check
Queue contents returned:
GCGA
branches check


In [15]:
def find_paths(node, path, paths):
    """Collect every root-to-leaf path of substrings below `node`.

    Parameters:
    node  -- current tree node (needs `.seq` and `.children`); may be None.
    path  -- list of substrings on the way from the root to `node`'s parent.
             It is used as a working stack and is restored to its original
             contents before the function returns.
    paths -- list that receives one new list of substrings per leaf reached.

    Returns:
    `paths`, so the top-level call can use the result directly.
    """
    if node is None:
        return paths

    path.append(node.seq)
    if not node.children:
        # Store a snapshot: `path` keeps changing while the walk continues.
        paths.append(list(path))
    else:
        for child in node.children:
            find_paths(child, path, paths)
    # Backtrack so sibling subtrees do not inherit this node's substring.
    path.pop()

    return paths

In [16]:
# Report, for every reference pair, the trees built in Step 2 and the
# root-to-leaf paths that find_paths returns for each tree.
for pair in reference_pair_list:
    print("Reference Pair ", pair.pair_id, ": ", pair.seq1, ", ", pair.seq2)
    print("Number of trees: ", len(pair.tree_list))
    
    for tree in pair.tree_list:
        print("Tree ID: ", tree.id)
        print("Tree height:", tree.height)
        # Fresh accumulators per tree; find_paths fills and returns `paths`.
        path, paths = [], []
        tree_paths = find_paths(tree.root, path, paths)
        print("Number of paths in this tree: ", len(tree_paths))
        for tree_path in tree_paths:
            print("Length of path: ", len(tree_path))
            # Each entry of a path is a substring (node.seq), not a node object.
            for node in tree_path:
                print(node)

Reference Pair  0 :  TTGA ,  TTGA
Number of trees:  1
Tree ID:  0
Tree height: 2
Number of paths in this tree:  1
Length of path:  3
TAGA
GCGA
GTGA
Reference Pair  1 :  GAGA ,  TTGA
Number of trees:  1
Tree ID:  1
Tree height: 2
Number of paths in this tree:  1
Length of path:  3
TAGA
GCGA
GTGA
Reference Pair  2 :  GAGA ,  GACA
Number of trees:  1
Tree ID:  2
Tree height: 2
Number of paths in this tree:  1
Length of path:  3
TAGA
GCGA
GTGA
