In [1]:
import string

class TreeNode(object):
    """One node of a binary tree.

    Holds an optional ``label`` and ``seq`` payload plus links to its
    ``parent`` and to its ``left`` / ``right`` children.
    """

    def __init__(self,label=None,seq=None,parent=None):
        # Payload; either value may still be None after construction and
        # filled in later by whoever builds the tree.
        self.label, self.seq = label, seq
        # Children cannot be passed to the constructor: a new node always
        # starts without any and they are attached afterwards.
        self.left = self.right = None
        # Upward link to the node this one hangs from (None when not given).
        self.parent = parent
        
class build_tree(object):
    """Binary tree parsed from a Newick string, with a sequence attached to each labelled node.

    Parameters
    ----------
    tree : str
        Newick text. Only the separators ``(``, ``)``, ``,`` and ``;`` are
        interpreted; every other character is accumulated into a node label.
        Each node keeps two child slots (``left`` / ``right``); a further
        ``,`` under the same parent overwrites ``right``.
    df : pandas.DataFrame
        Table with a ``"label"`` and a ``"seq"`` column. Every label met in
        ``tree`` is looked up in ``df["label"]`` to obtain the node sequence.

    Attributes
    ----------
    root : TreeNode
        Root of the parsed tree.
    num_leaf : int
        Number of rows of ``df``.
    len_seq : int
        Length of ``df["seq"][0]``.
    mutation_table : pandas.DataFrame
        Only present after :meth:`mutation_summary` has run.
    """

    # Character states counted by ``mutation_summary``. The position of a state in
    # this string is its position in the ``mutation_to_parent`` count vector.
    ALL_STATES = '0' + string.ascii_uppercase + 'abcd-'

    def __init__(self,tree,df):
        self.tree = tree
        self.df = df
        self.num_leaf = len(df)
        self.len_seq = len(df["seq"][0])
        self.read_newick(self.tree)

    def read_newick(self,tree):
        """Parse ``tree`` into ``TreeNode`` objects rooted at ``self.root``.

        Raises
        ------
        IndexError
            If a label found in ``tree`` has no row in ``self.df``.
        """
        self.root = TreeNode()
        node = self.root
        label = ''
        sep = ('(', ')', ',', ';')

        for i in tree:
            if i not in sep:
                label += i
                continue

            # A separator terminates the label collected so far; it belongs to
            # the node we are currently positioned on.
            if label:
                matches = self.df.loc[self.df["label"] == label, "seq"].tolist()
                if not matches:
                    raise IndexError("label %r from the Newick string is not present in df['label']" % label)
                node.label = label
                node.seq = matches[0]  # first row wins when a label is duplicated
            label = ''

            if i == '(':
                # Descend into a new first child.
                next_node = TreeNode(parent=node)
                node.left = next_node
                node = next_node
            elif i == ')':
                # Finished this clade: climb back to its parent.
                node = node.parent
            elif i == ',':
                # Sibling follows: climb to the parent and open its second child.
                node = node.parent
                next_node = TreeNode(parent=node)
                node.right = next_node
                node = next_node

    def assign_internal_node_seq(self):
        """Fill in ``seq`` for internal nodes, bottom-up, from their two children.

        Position by position: equal child states are kept; if exactly one
        child has ``'-'`` the other child's state is used; any other
        disagreement yields ``'0'``. The root is always recomputed; below it,
        only nodes whose ``seq`` is still empty are recomputed. A node with
        fewer than two children is left untouched, so a single-leaf tree is a
        no-op instead of an ``AttributeError``.
        """
        def merge(left_seq, right_seq):
            # Index the right sequence explicitly (rather than zip) so that the
            # left sequence dictates the length, exactly as before.
            merged = []
            for i, a in enumerate(left_seq):
                b = right_seq[i]
                if a == b:
                    merged.append(a)
                elif a == '-':
                    merged.append(b)
                elif b == '-':
                    merged.append(a)
                else:
                    merged.append('0')
            # One join instead of repeated string concatenation.
            return ''.join(merged)

        def helper(node):
            if node.left is None or node.right is None:
                return
            if not node.left.seq:
                helper(node.left)
            if not node.right.seq:
                helper(node.right)
            node.seq = merge(node.left.seq, node.right.seq)

        helper(self.root)

    def mutation_summary(self):
        """Build ``self.mutation_table`` with one row per non-root node (post-order).

        Columns: ``seq``, ``is_leaf``, ``two_leaves`` (1 for a leaf whose
        parent's two children both carry a label), ``mutation_to_parent``
        (space-separated counts, one per state of ``ALL_STATES``, of positions
        where the node differs from its parent, tallied by the node's state),
        ``tree_depth`` (root = 0), ``min_height`` and ``max_height`` (shortest
        / longest distance down to a childless position; 0 for a leaf).

        The same values are also stored as attributes on every visited node.
        Rows are collected in a list and turned into a DataFrame once, so the
        table carries a 0..n-1 index. Requires ``assign_internal_node_seq`` to
        have populated the sequences first.

        Raises
        ------
        ValueError
            If a differing position holds a character not in ``ALL_STATES``.
        """
        all_states = self.ALL_STATES
        columns = ["seq","is_leaf","two_leaves","mutation_to_parent",
                   "tree_depth","min_height","max_height"]
        records = []

        def summarize_diff(s,s_parent):
            count = [0] * len(all_states)
            for i, state in enumerate(s):
                if state != s_parent[i]:
                    count[all_states.index(state)] += 1
            return count

        def find_tree_depth(node,tree_depth):
            # (-1, -1) for a missing child makes a leaf come out with height 0.
            if node is None:
                return (-1,-1)

            node.tree_depth = tree_depth
            left_min_height,left_max_height = find_tree_depth(node.left,tree_depth+1)
            right_min_height,right_max_height = find_tree_depth(node.right,tree_depth+1)
            node.min_height = min(left_min_height,right_min_height) + 1
            node.max_height = max(left_max_height,right_max_height) + 1
            return (node.min_height,node.max_height)

        def helper(node):
            if node is None:
                return

            helper(node.left)
            helper(node.right)

            node.is_leaf = 0 if (node.left or node.right) else 1

            # Guard every link: the root has no parent and a malformed tree may
            # have a parent with a single child.
            parent = node.parent
            siblings_labelled = bool(
                parent is not None
                and parent.left is not None and parent.right is not None
                and parent.left.label and parent.right.label
            )
            node.two_leaves = 1 if (node.is_leaf and siblings_labelled) else 0

            if parent is not None:
                node.mutation_count = summarize_diff(node.seq,parent.seq)
                records.append({
                    "seq":node.seq,
                    "is_leaf":node.is_leaf,
                    "two_leaves":node.two_leaves,
                    "mutation_to_parent":" ".join(str(i) for i in node.mutation_count),
                    "tree_depth":node.tree_depth,
                    "min_height":node.min_height,
                    "max_height":node.max_height,
                })

        find_tree_depth(self.root,0)
        helper(self.root)
        self.mutation_table = pd.DataFrame(records,columns=columns)

    def is_subtree(self,nodes):
        """Return the leaf count of the first subtree found that holds all of ``nodes``.

        The tree is walked in post-order; the first internal node under which
        ``len(nodes)`` leaves have a label contained in ``nodes`` determines
        the result, which is the total number of leaves below that node. The
        value is also stored in ``self.num_of_leaf``. ``None`` is returned
        when no internal node qualifies. Only internal nodes are candidates,
        so the method is meant for two or more distinct labels (a return value
        equal to ``len(nodes)`` means the labels form a complete clade).
        """
        self.num_of_leaf = None
        wanted = set(nodes)      # constant-time membership test at each leaf
        n_wanted = len(nodes)

        def helper(root):
            # Once an answer is recorded the rest of the walk contributes nothing.
            if self.num_of_leaf:
                return (0,0)
            if not root.left or not root.right:
                return (1,int(root.label in wanted))

            n_leaf_left,n_nodes_left = helper(root.left)
            n_leaf_right,n_nodes_right = helper(root.right)
            n_leaf = n_leaf_left + n_leaf_right
            n_nodes = n_nodes_left + n_nodes_right

            if n_nodes == n_wanted and not self.num_of_leaf:
                self.num_of_leaf = n_leaf

            return (n_leaf,n_nodes)

        helper(self.root)
        return self.num_of_leaf

In [2]:
import pandas as pd
from collections import Counter
import os
import re

# For every training sample, find deletions (maximal runs of '-') that occur with
# identical start/end in at least two cells, and record for each such cluster how
# many cells share it (n_labels) and how many leaves the subtree containing them
# has in the reference tree (n_leaf, None when no such subtree is found).

# Sample ids are the stems of the per-sample .txt files. Entries that are not .txt
# (e.g. hidden OS files) are skipped, and the ids are sorted because os.listdir()
# returns names in arbitrary order, which made the output row order unstable.
samples = sorted(i.split(".")[0] for i in os.listdir("../../Data/Subchallenge2/SubC2_train_TXT/")
                 if i.endswith(".txt"))

deletion_pattern = re.compile("-+")  # compiled once, reused for every sample
cluster_frames = []

for sample in samples:
    # Context manager so the reference-tree file handle is closed promptly.
    with open('../../Data/Subchallenge2/SubC2_train_REF/%s_REF.nw'%sample,'r') as newick_file:
        tree = newick_file.read()
    df = pd.read_table("../../Data/Subchallenge2/SubC2_train_TXT/%s.txt"%sample,
                       header=None,names=["label","seq"])
    tree = build_tree(tree,df)
    tree.assign_internal_node_seq()

    # Collect every deletion run in plain Python containers and build the frame
    # once; DataFrame.append was quadratic here and no longer exists in pandas 2.
    deletion_records = []
    labels_by_span = {}  # (start, end) -> labels carrying that deletion, in df order
    for label, seq in zip(df["label"], df["seq"]):
        for match in deletion_pattern.finditer(seq):
            span = (match.start(), match.end())
            deletion_records.append({"label":label,"start":span[0],"end":span[1]})
            labels_by_span.setdefault(span, []).append(label)
    deletions = pd.DataFrame(deletion_records,columns=["label","start","end"])

    deletion_summary = deletions.groupby(["start","end"]).size().reset_index()
    deletion_summary.columns = ["start","end","count"]
    deletion_summary = deletion_summary[deletion_summary["count"]>=2].sort_values("count")

    n_labels = []
    n_leaf = []
    for start, end in zip(deletion_summary["start"], deletion_summary["end"]):
        # Dictionary lookup replaces re-filtering the whole frame per cluster.
        cl_labels = labels_by_span[(int(start), int(end))]
        n_labels.append(len(cl_labels))
        n_leaf.append(tree.is_subtree(cl_labels))

    clusters = pd.DataFrame(data={"n_labels":n_labels,"n_leaf":n_leaf})
    clusters["sample"] = sample
    cluster_frames.append(clusters)

# Single concatenation at the end; an empty directory still yields an empty frame.
deletion_table = pd.concat(cluster_frames) if cluster_frames else pd.DataFrame()

In [4]:
# Persist the deletion-cluster table assembled in the previous cell. to_csv() is
# called without index=False, so the DataFrame index is written as the first,
# unnamed CSV column.
deletion_table.to_csv("./deletion_table.csv")

In [5]:
import pandas as pd
from collections import Counter
import os

# For every training sample, rebuild the reference tree, infer internal-node
# sequences and collect the per-node mutation summary into one table that is
# written to ./mutation_table.csv.

# Sample ids are the stems of the per-sample .txt files. Entries that are not .txt
# (e.g. hidden OS files) are skipped, and the ids are sorted because os.listdir()
# returns names in arbitrary order, which made the output row order unstable.
samples = sorted(i.split(".")[0] for i in os.listdir("../../Data/Subchallenge2/SubC2_train_TXT/")
                 if i.endswith(".txt"))
per_sample_tables = []

for sample in samples:
    # Context manager so the reference-tree file handle is closed promptly.
    with open('../../Data/Subchallenge2/SubC2_train_REF/%s_REF.nw'%sample,'r') as newick_file:
        tree = newick_file.read()
    df = pd.read_table("../../Data/Subchallenge2/SubC2_train_TXT/%s.txt"%sample,
                       header=None,names=["label","seq"])
    tree = build_tree(tree,df)
    tree.assign_internal_node_seq()
    tree.mutation_summary()
    tree.mutation_table['sample'] = sample
    per_sample_tables.append(tree.mutation_table)

# Concatenate once instead of re-copying the growing table on every iteration;
# an empty directory still yields an empty frame.
mutation_table = pd.concat(per_sample_tables) if per_sample_tables else pd.DataFrame()
mutation_table.to_csv("./mutation_table.csv",index=False)