In [1]:
class TreeNode(object):
    def __init__(self,label=None,seq=None,parent=None):
        self.label = label
        self.seq = seq
        self.left = None
        self.right = None
        self.parent = parent
        
class build_tree(object):
    def __init__(self,tree):
        self.tree = tree
        self.read_newick(self.tree)
    
    def read_newick(self,tree):
        self.root = TreeNode()
        node = self.root
        label = ''
        sep = ['(', ')', ',', ';']

        for i in tree:
            if i in sep:
                if label:
                    label,dist = label.split(':')
                    node.dist = dist
                    if label:
                        node.label,node.seq = label.split('_')
                label = ''

                if i == '(':
                    next_node = TreeNode(parent=node)
                    node.left = next_node
                    node = next_node

                elif i == ')':
                    node = node.parent
                elif i == ',':
                    node = node.parent
                    next_node = TreeNode(parent=node)
                    node.right = next_node
                    node = next_node
            else:
                label += i
                
    def assign_internal_node_seq(self):
        def helper(node):
            if not node.left.seq:
                helper(node.left)
            if not node.right.seq:
                helper(node.right)
            
            node.seq = ''
            for i in range(len(node.left.seq)):
                a = node.left.seq[i]
                b = node.right.seq[i]
                
                if a == b:
                    node.seq += a
                else:
                    node.seq += '1'
                
        helper(self.root)
        
    def is_subtree(self,nodes):
        self.num_of_leaf = None
        
        def helper(nodes,root):
            if not self.num_of_leaf:
                if not root.left or not root.right:
                    return (1,int(root.label in nodes))

                n_leaf_left,n_nodes_left = helper(nodes,root.left)
                n_leaf_right,n_nodes_right = helper(nodes,root.right)
                n_leaf = n_leaf_left + n_leaf_right
                n_nodes = n_nodes_left + n_nodes_right
                
                if n_nodes == len(nodes) and not self.num_of_leaf:
                    self.num_of_leaf = n_leaf

                return (n_leaf,n_nodes)
            else:
                return (0,0)
        
        helper(nodes,self.root)
        return self.num_of_leaf
    
    def mutation_summary(self):
        def summarize_diff(s,s_parent):
            return [(i,s[i],s_parent[i]) for i in range(len(s)) if s_parent[i]=="1"]
        
        def find_tree_depth(node,tree_depth):
            if not node:
                return (-1,-1)
            
            node.tree_depth = tree_depth
            left_min_height,left_max_height = find_tree_depth(node.left,tree_depth+1)
            right_min_height,right_max_height = find_tree_depth(node.right,tree_depth+1)
            node.min_height = min(left_min_height,right_min_height) + 1
            node.max_height = max(left_max_height,right_max_height) + 1
            return (node.min_height,node.max_height)
        
        def helper(node):
            if not node:
                return
            
            helper(node.left)
            helper(node.right)
            
            node.is_leaf = 1
            if node.left or node.right:
                node.is_leaf = 0
            
            if node.parent:
                diff = summarize_diff(node.seq,node.parent.seq)
                node_df = pd.DataFrame(data={"pos":[i[0] for i in diff],"state":[i[1] for i in diff],\
                                       "parent_state":[i[2] for i in diff]},dtype="int")
                node_df["seq"] = node.seq
                node_df["is_leaf"] = node.is_leaf
                node_df["tree_depth"] = node.tree_depth
                node_df["min_height"] = node.min_height
                node_df["max_height"] = node.max_height
                self.mutation_table = pd.concat([self.mutation_table,node_df])
        
        find_tree_depth(self.root,0)
        self.mutation_table = pd.DataFrame()
        helper(self.root)

In [2]:
import pandas as pd
import os
import string

df = pd.read_csv("../../Data/Subchallenge1/sub1_train_1.txt",sep="\t",dtype="str")
tree = open("../../Data/Subchallenge1/groundTruth_train/sub1_train_1.nwk","r").read()

In [4]:
tree = build_tree(tree.strip())
tree.assign_internal_node_seq()
tree.mutation_summary()

In [4]:
mutation_table = pd.DataFrame()

for i in range(76):
    tree_file = os.path.join("../../Data/Subchallenge1/groundTruth_train","sub1_train_%d.nwk"%(i+1))
    tree = open(tree_file,"r").read().strip()
    tree = build_tree(tree)
    tree.assign_internal_node_seq()
    tree.mutation_summary()
    tree.mutation_table["sample"] = "sub1_train_%d"%(i+1)
    mutation_table = pd.concat([mutation_table,tree.mutation_table])
#     print(len(mutation_table))

mutation_table.to_csv("../../Cache/Subchallenge1/mutation_table.csv",index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../../Cache/Subchallenge1/mutation_table.csv'

In [5]:
mutation_table

Unnamed: 0,pos,state,parent_state,seq,is_leaf,tree_depth,min_height,max_height,sample
0,1,0,1,2012212021,1,4,0,0,sub1_train_1
1,2,1,1,2012212021,1,4,0,0,sub1_train_1
2,5,1,1,2012212021,1,4,0,0,sub1_train_1
3,9,1,1,2012212021,1,4,0,0,sub1_train_1
0,1,1,1,2112212021,1,4,0,0,sub1_train_1
1,2,1,1,2112212021,1,4,0,0,sub1_train_1
2,5,1,1,2112212021,1,4,0,0,sub1_train_1
3,9,1,1,2112212021,1,4,0,0,sub1_train_1
0,1,1,1,2112212021,0,3,1,1,sub1_train_1
1,2,1,1,2112212021,0,3,1,1,sub1_train_1


In [6]:
mutation_df = pd.DataFrame()

for i in range(76):
    tree_file = os.path.join("../../Data/Subchallenge1/groundTruth_train","sub1_train_%d.nwk"%(i+1))
    tree = open(tree_file,"r").read()
    tree = build_tree(tree)
    df_file = os.path.join("../../Data/Subchallenge1/","sub1_train_%d.txt"%(i+1))
    df = pd.read_csv(df_file,sep="\t",dtype="str")
    
    m = len(df["state"][0])
    n = len(df)

    clusters = {}
    labels = df["cell"]
    seqs = df["state"]

    for i in range(m):
        seq = [j[i] for j in seqs]
        for j in range(n):
            key = (i,seq[j])
            if key not in clusters:
                clusters[key] = [labels[j]]
            else:
                clusters[key].append(labels[j])

    for c in clusters:
        if len(clusters[c])>1 and len(clusters[c])<n:
            n_leaf = tree.is_subtree(clusters[c])
            mutation_df = pd.concat([mutation_df,pd.DataFrame(data={"sample":["sub1_train_%d"%(i+1)],"state":[c[1]],\
                                    "pos":[c[0]],"n_labels":[len(clusters[c])],"n_leaf":[n_leaf]})])

In [5]:
mutation_df.to_csv("../../Cache/Subchallenge1/mutation.csv",index=False)