In [502]:
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm_notebook

In [503]:
# Snapshot identifier embedded in the names of the three input files.
version = "25589581"

# Tree topology: indexed by node id, with the two child columns c1 and c2.
dotdata = pd.read_csv(
    "OneZoomTree{}.dot".format(version),
    sep='\t', index_col=0, header=None, names=["id", "c1", "c2"])

# Per-node metadata: the real_node flag and the tip count (ntips).
metadata = pd.read_csv(
    "OneZoomNodes{}.nodes".format(version),
    sep='\t', index_col=0, header=None, names=["id", "real_node", "ntips"])

# Node names. keep_default_na=False keeps empty names as "" instead of NaN,
# so that "has a name" can later be tested by simple truthiness.
namedata = pd.read_csv(
    "OneZoomNames{}.nodes".format(version),
    sep='\t', index_col=0, header=None, keep_default_na=False, names=["id", "name"])

# Remove leaves from the name data: their ids are not numeric, so they
# become NaN when coerced.
namedata = namedata[pd.to_numeric(namedata.index, errors="coerce").notna()]
# np.int was removed in NumPy 1.24; the builtin int is the documented replacement.
namedata.index = namedata.index.astype(int)

In [504]:
# Row counts of the three loaded tables, shown side by side for comparison.
tuple(len(table) for table in (dotdata, metadata, namedata))

(2123178, 2123178, 2123178)

In [505]:
# Build the working table: topology joined with the real_node / ntips metadata
# and the names of numerically-indexed nodes, plus three bookkeeping columns
# (parent id, cut score, chunk id) that later cells fill in.
df = (
    dotdata
    .join(metadata)
    .join(namedata[pd.to_numeric(namedata.index, errors="coerce").notna()])
    .assign(parent=-1, cut_score=0, chunk=0)
)

In [506]:
# Long-form child -> parent table: every (node, c1) and (node, c2) pair.
# Children whose id is not numeric are coerced to NaN here.
mapping = pd.DataFrame({
    "child": pd.to_numeric(np.concatenate([df.c1.values, df.c2.values]), errors="coerce"),
    "parent": np.concatenate([df.index, df.index]),
})
# Keep only children with a numeric id. The explicit copy avoids assigning
# into a filtered view (chained-assignment warning) on the next line.
mapping = mapping[mapping.child.notna()].copy()
# np.int was removed in NumPy 1.24; use the builtin int instead.
mapping['child'] = mapping.child.astype(int)

In [None]:
# Write each child's parent id into its own row (child ids are labels of df's index).
df.loc[mapping.child.values, 'parent'] = mapping.parent.values
df['parent'] = df.parent.astype(int)
# Check there is only one root, i.e. exactly one row still holds the -1 sentinel.
# Use the vectorised .sum(): the builtin sum() would iterate the Series in Python.
assert (df.parent == -1).sum() == 1, \
    "expected exactly one root, found {}".format((df.parent == -1).sum())

In [None]:
def recalc_tips_and_scores(idx):
    """Update tip counts and cut scores after node ``idx`` is cut off.

    Starting at ``idx`` and following the ``parent`` column of the global
    ``df`` until a negative parent id is reached, each visited node has its
    ``ntips`` reduced and its ``cut_score`` recomputed with ``make_score``.

    Returns the ``ntips`` value that ``idx`` had before the update.
    """
    n_removed = df.loc[idx].ntips
    # The cut node itself remains as a single leaf, hence the "- 1".
    shrink_by = n_removed - 1
    node = idx
    while True:
        if node < 0:
            break
        df.loc[node, 'ntips'] -= shrink_by
        rescored = make_score(df.loc[[node]])
        df.loc[node, 'cut_score'] = rescored.values[0]
        node = df.loc[node, 'parent']
    return n_removed

# A stack-based alternative to recursive tree traversal.
def children(idx, tree=None):
    """Yield ``idx`` and the nodes reachable below it, depth-first, c1 before c2.

    Parameters
    ----------
    idx : int
        Id (index label) of the node to start from.
    tree : pandas.DataFrame, optional
        Table with columns ``c1``, ``c2`` and ``chunk``, indexed by node id.
        Defaults to the notebook-level ``df``.

    Yields
    ------
    int
        Node ids. A node whose ``chunk`` is non-zero is still yielded, but the
        traversal does not descend into its children. Children whose id cannot
        be converted to ``int`` (non-numeric ids, NaN) are skipped.
    """
    if tree is None:
        tree = df
    stack = [idx]
    while stack:
        node = stack.pop()
        yield node
        # Only descend into children if this node has not been chunked already.
        # The check depends on the node alone, so do it once, not per child.
        if tree.loc[node, 'chunk'] != 0:
            continue
        # Push c2 first so that c1 is popped (visited) first. The columns are
        # read by label rather than via reversed(Series), which relies on the
        # positional fallback of integer Series indexing (deprecated in pandas 2.1).
        for child in (tree.loc[node, 'c2'], tree.loc[node, 'c1']):
            try:
                stack.append(int(child))
            except ValueError:
                # Not a numeric id, so there is no row to visit.
                pass

def make_score(rows, target_chunksize=1000, min_chunksize=500, max_chunksize=2000, name_bonus=2):
    """Score how good each row's node would be as a cut point.

    Parameters
    ----------
    rows : pandas.DataFrame
        Must contain the columns ``ntips`` and ``name``.
    target_chunksize : int, optional
        Tip count that receives the highest closeness score.
    min_chunksize, max_chunksize : int, optional
        Inclusive range of acceptable tip counts; rows outside it score 0.
    name_bonus : number, optional
        Amount added to the score of rows that have a non-empty name.

    Returns
    -------
    pandas.Series
        One score per row, on the index of ``rows``. Within the accepted range
        the score is ``1 / ((ntips - target_chunksize)**2 + 1)`` plus
        ``name_bonus`` for named rows; outside the range it is 0.
    """
    tips = rows["ntips"]
    names = rows["name"]
    in_range = (tips >= min_chunksize) & (tips <= max_chunksize)
    closeness = 1 / ((tips - target_chunksize) ** 2 + 1)
    # A missing name (NaN) is truthy in Python, so test for it explicitly:
    # only a present, non-empty name earns the bonus.
    has_name = (names.notna() & (names != "")).to_numpy(dtype=bool)
    return in_range * (closeness + np.where(has_name, name_bonus, 0))

In [None]:
# Score every node once up front; the chunking loop only re-scores the
# ancestors of each cut afterwards.
df['cut_score'] = make_score(df)
print(df.cut_score.idxmax())
# Show only the best candidates rather than the whole sorted table.
df.sort_values("cut_score", ascending=False).head(10)

67314


Unnamed: 0_level_0,c1,c2,real_node,ntips,name,parent,cut_score,chunk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
67314,67315,67355,T,1000,Conoidasida,67163,3.000000,0
1437827,1437828,1438039,T,1000,Agathidinae,1437825,3.000000,0
1700054,1700055,1700284,T,1001,Molophilus,1699760,2.500000,0
795262,795263,795403,T,999,Anguilliformes,795233,2.500000,0
1321967,1321968,1322354,T,999,Cicadinae,1321966,2.500000,0
909668,909669,909713,T,1002,Cephalopoda,909636,2.200000,0
1291731,1291732,1291768,T,998,Reduvioidea,1291730,2.200000,0
75262,75263,75317,T,1002,Hypotrichia,75039,2.200000,0
912343,912344,912356,T,1003,Palaeoheterodonta,912342,2.100000,0
595265,595266,595327,T,1004,Lichinales,595261,2.058824,0


In [None]:
# Greedy chunking: repeatedly cut the tree at the node with the highest
# cut_score until every row of df has a non-zero chunk id.
# NOTE: ntips and cut_score are modified in place, so re-running this cell
# without re-running the cells above will not reproduce the same result.
df['chunk'] = 0
chunk = 1
while np.any(df.chunk == 0):
    tm = datetime.datetime.now()
    best = df.cut_score.idxmax()
    best_name = df.loc[best, 'name']
    print(
        "Cutting on node {}{}:".format(
            best, (" (" + str(best_name) + ")") if best_name else ""),
        end=" ", flush=True)
    # Shrinks ntips on `best` and all its ancestors and re-scores them.
    n_in_chunk = recalc_tips_and_scores(best)
    print(n_in_chunk, "leaves", end=" ", flush=True)
    # np.int was removed in NumPy 1.24; the builtin int is equivalent here.
    descendants = np.array(list(children(best)), dtype=int)
    assert len(descendants) == n_in_chunk, "{} not {}".format(len(descendants), n_in_chunk)
    assert df.loc[best, 'ntips'] == 1
    # The cut off node is now a leaf - has no descendant leaves.
    # (This used `==`, which compared and discarded the result instead of assigning.)
    df.loc[best, 'ntips'] = 0
    df.loc[descendants, 'chunk'] = chunk
    # Make sure we never pick nodes that are in a chunk: idxmax skips NaN.
    # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
    df.loc[descendants, 'cut_score'] = np.nan
    # Vectorised count; the builtin sum() would loop over every row in Python.
    left = int((df.chunk == 0).sum())
    print("allocated to chunk {} in {} seconds: {} left ({:.2f}%)".format(
        chunk,
        (datetime.datetime.now() - tm).total_seconds(),
        left,
        left / len(df) * 100))
    chunk += 1

Cutting on node 67314 (Conoidasida): 1000 leaves allocated to chunk 1 in 2.831994 seconds: 2122178 left (99.95%)
Cutting on node 1437827 (Agathidinae): 1000 leaves allocated to chunk 2 in 4.771545 seconds: 2121178 left (99.91%)
Cutting on node 795262 (Anguilliformes): 999 leaves allocated to chunk 3 in 3.267666 seconds: 2120179 left (99.86%)
Cutting on node 1321967 (Cicadinae): 999 leaves allocated to chunk 4 in 4.641149 seconds: 2119180 left (99.81%)
Cutting on node 1700054 (Molophilus): 1001 leaves allocated to chunk 5 in 4.742397 seconds: 2118179 left (99.76%)
Cutting on node 75262 (Hypotrichia): 1002 leaves allocated to chunk 6 in 3.00216 seconds: 2117177 left (99.72%)
Cutting on node 909668 (Cephalopoda): 1002 leaves allocated to chunk 7 in 2.863357 seconds: 2116175 left (99.67%)
Cutting on node 1291731 (Reduvioidea): 998 leaves allocated to chunk 8 in 3.855379 seconds: 2115177 left (99.62%)
Cutting on node 912343 (Palaeoheterodonta): 1003 leaves allocated to chunk 9 in 2.883181 s

Cutting on node 1941355 (Leptocircini): 973 leaves allocated to chunk 72 in 13.013006 seconds: 2051392 left (96.62%)
Cutting on node 642686 (Diatrypaceae): 1028 leaves allocated to chunk 73 in 14.694496 seconds: 2050364 left (96.57%)
Cutting on node 941533 (Helicoidea): 972 leaves allocated to chunk 74 in 4.224851 seconds: 2049392 left (96.52%)
Cutting on node 859048 (4612_): 1029 leaves allocated to chunk 75 in 6.687231 seconds: 2048363 left (96.48%)
Cutting on node 856688 (3063_): 1030 leaves allocated to chunk 76 in 4.553148 seconds: 2047333 left (96.43%)
Cutting on node 1596537 (Lampyrinae): 1030 leaves allocated to chunk 77 in 9.101004 seconds: 2046303 left (96.38%)
Cutting on node 512233 (Mucoromycotina): 969 leaves allocated to chunk 78 in 3.285518 seconds: 2045334 left (96.33%)
Cutting on node 603217 (Phyllachora): 1031 leaves allocated to chunk 79 in 10.450655 seconds: 2044303 left (96.29%)
Cutting on node 215766 (Bambuseae): 968 leaves allocated to chunk 80 in 4.816679 second

Cutting on node 227381 (Allium): 941 leaves allocated to chunk 143 in 5.003279 seconds: 1980970 left (93.30%)
Cutting on node 626194 (Pyrenulales): 1059 leaves allocated to chunk 144 in 13.298473 seconds: 1979911 left (93.25%)
Cutting on node 659644 (Microthyriales): 1060 leaves allocated to chunk 145 in 7.46251 seconds: 1978851 left (93.20%)
Cutting on node 1531170 (Staphylininae): 940 leaves allocated to chunk 146 in 10.907725 seconds: 1977911 left (93.16%)
Cutting on node 828501 (Microhyloidea): 939 leaves allocated to chunk 147 in 4.357957 seconds: 1976972 left (93.11%)
Cutting on node 2083414 (Nolinae): 939 leaves allocated to chunk 148 in 11.816717 seconds: 1976033 left (93.07%)
Cutting on node 1123067 (Cypridocopina): 1062 leaves allocated to chunk 149 in 5.614699 seconds: 1974971 left (93.02%)
Cutting on node 1252634 (Thripoidea): 938 leaves allocated to chunk 150 in 5.026567 seconds: 1974033 left (92.98%)
Cutting on node 906939 (Megascolecidae): 1063 leaves allocated to chunk 

Cutting on node 1387897 (Cercerini): 906 leaves allocated to chunk 214 in 5.024072 seconds: 1909574 left (89.94%)
Cutting on node 157119 (Athyriaceae): 905 leaves allocated to chunk 215 in 3.120847 seconds: 1908669 left (89.90%)
Cutting on node 320734 (Passifloraceae): 1097 leaves allocated to chunk 216 in 4.006746 seconds: 1907572 left (89.85%)
Cutting on node 1896789 (Archipini): 1097 leaves allocated to chunk 217 in 7.617615 seconds: 1906475 left (89.79%)
Cutting on node 477149 (Oleaceae): 902 leaves allocated to chunk 218 in 4.371819 seconds: 1905573 left (89.75%)
Cutting on node 1029 (Parcubacteria group): 1099 leaves allocated to chunk 219 in 2.731427 seconds: 1904474 left (89.70%)
Cutting on node 558430 (Psathyrella): 901 leaves allocated to chunk 220 in 4.017309 seconds: 1903573 left (89.66%)
Cutting on node 680406 (Sphaeropsis): 901 leaves allocated to chunk 221 in 6.148055 seconds: 1902672 left (89.61%)
Cutting on node 703250 (Megasporaceae): 901 leaves allocated to chunk 222