# Match cell size data with phylogeny

### Preparation

In [1]:
import numpy as np
import pandas as pd

In [2]:
from skbio import TreeNode

### Cell size data

In [3]:
# Load the per-species cell size table (first column becomes the index) and
# rename 'superkingdom' to 'kingdom' so the rank column names line up with
# the reference taxonomy loaded below.
df = (
    pd.read_table('../preprocess/annot/species_all.tsv', index_col=0)
    .rename(columns={'superkingdom': 'kingdom'})
)
df.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,kingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11,2.371708,1.06066,1.783187,7.902917,rod-shaped,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria
14,10.0,0.489898,1.854174,15.390598,rod-shaped,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,Dictyoglomia,Dictyoglomi,Bacteria
23,1.5,0.7,0.48747,3.298672,rod-shaped,Shewanella colwelliana,Shewanella,Shewanellaceae,Alteromonadales,Gammaproteobacteria,Proteobacteria,Bacteria
104,1.03923,0.34641,0.087062,1.130973,ring-shaped,Cyclobacterium marinum,Cyclobacterium,Cyclobacteriaceae,Cytophagales,Cytophagia,Bacteroidetes,Bacteria
114,2.04939,2.04939,4.506844,13.194689,spherical,Gemmata obscuriglobus,Gemmata,Gemmataceae,Gemmatales,Planctomycetia,Planctomycetes,Bacteria


In [4]:
df.shape

(5702, 12)

In [5]:
df.index = 'taxid' + df.index.astype(str)

### Reference taxa

Taxonomy (tax2tree-curated)

In [6]:
dft = pd.read_table('tax2tree/filled_ranks.tsv', index_col = 0)
dft.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species
G000005825,Bacteria,Firmicutes_1,Bacilli_1,Bacillales_1,Bacillaceae_3,Alkalihalobacillus,Alkalihalobacillus pseudofirmus
G000006175,Archaea,Euryarchaeota_2,Methanococci,Methanococcales,Methanococcaceae,Methanococcus,Methanococcus voltae
G000006605,Bacteria,Actinobacteria,Actinomycetia,Corynebacteriales,Corynebacteriaceae,Corynebacterium,Corynebacterium falsenii
G000006725,Bacteria,Proteobacteria_1,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xylella,Xylella fastidiosa
G000006745,Bacteria,Proteobacteria_1,Gammaproteobacteria,Vibrionales,Vibrionaceae,,Vibrio cholerae


In [7]:
dft.shape[0]

10575

Genome metadata

In [8]:
dfg = pd.read_table('genome.tsv', index_col=0)
dfg.head()

Unnamed: 0_level_0,scope,assembly_level,total_length,gc,proteins,protein_length,coding_density,completeness,contamination,strain_heterogeneity,16s_copies
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G000005825,Monoisolate,Complete Genome,4249248,39.86,4260,1228222,85.144124,98.68,1.32,0.0,7
G000006175,Multispecies,Complete Genome,1936387,28.59,1727,526927,80.167033,99.05,0.0,0.0,2
G000006605,Monoisolate,Complete Genome,2476822,61.35,2137,751284,89.378688,100.0,0.68,0.0,3
G000006725,Monoisolate,Complete Genome,2731750,52.62,2664,765931,82.59299,99.59,0.18,0.0,2
G000006745,Monoisolate,Complete Genome,4033464,47.49,3594,1184587,86.533164,99.86,0.03,0.0,8


In [9]:
dfg = dfg[['total_length', 'gc', 'proteins', 'coding_density', '16s_copies']]

In [10]:
dfg.columns = ['genome', 'gc', 'proteins', 'coding', 'rrnas']

Add codon bias values

In [11]:
cb = pd.read_table('../codon_bias/cbias_corrected.tsv', header = None)
cb.shape

(10575, 3)

In [12]:
# The file has no header row: name the three columns, then index the frame
# by genome ID so it can be joined onto the genome metadata.
cb.columns = ['#genome', 'MILC', 'ENCprime']
cb = cb.set_index('#genome')

In [13]:
cb.head()

Unnamed: 0_level_0,MILC,ENCprime
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1
G000005825,-0.80048,0.294172
G000006175,-0.536149,0.224257
G000006605,-0.622788,0.203396
G000006725,-0.404945,0.187888
G000006745,-0.965386,0.334472


In [14]:
dfg = dfg.join(cb)

### Reference phylogeny

In [15]:
tree = TreeNode.read('tree.nwk')

In [16]:
tree.count(tips=True)

10575

Calculate median node depths

In [17]:
# Annotate every node (children are visited before their parent) with:
#   taxa   - set of tip names at or below the node
#   depths - branch-length distances from the node down to each of those tips
#   median - median of `depths`
for node in tree.postorder(include_self=True):
    # A missing branch length is treated as zero
    if node.length is None:
        node.length = 0.0
    if not node.is_tip():
        offspring = node.children
        node.taxa = set().union(*(child.taxa for child in offspring))
        # Extend each child's tip distances by the child's own branch length
        node.depths = [
            depth + child.length
            for child in offspring
            for depth in child.depths
        ]
        node.median = np.median(node.depths)
        continue
    # A tip is its own only descendant, at distance zero
    node.taxa = {node.name}
    node.depths = [0.0]
    node.median = 0.0

Calculate node properties, i.e., match tips and internal nodes

In [18]:
keys = ['genome', 'gc', 'proteins', 'coding', 'rrnas', 'MILC', 'ENCprime']

In [19]:
meta_ = []

In [20]:
# Build one record per node: [name, <one value per key>].
# Tips take their values straight from the genome table; internal nodes pool
# the values of all descendant tips and report the median of the pool.
# NOTE(review): records are appended to `meta_`, which is created in a
# separate cell, so re-running this cell alone duplicates every record.
for node in tree.postorder(include_self=True):
    record = [node.name]
    if node.is_tip():
        genome_row = dfg.loc[node.name]
        for key in keys:
            value = genome_row[key]
            # Stored as a one-element list so parents can concatenate lists
            setattr(node, key, [value])
            record.append(value)
    else:
        for key in keys:
            pooled = [
                value
                for child in node.children
                for value in getattr(child, key)
            ]
            setattr(node, key, pooled)
            record.append(np.median(pooled))
    meta_.append(record)

In [21]:
# Create new data frame
dfm = pd.DataFrame(meta_, columns = ['ID'] + keys).set_index('ID')

In [22]:
dfm.head()

Unnamed: 0_level_0,genome,gc,proteins,coding,rrnas,MILC,ENCprime
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G000830275,1241428.0,43.06,1341.0,90.880744,1.0,-0.05478,0.016435
G000402355,1098042.0,47.39,1206.0,90.408017,1.0,-0.138561,0.05531
N8,1169735.0,45.225,1273.5,90.64438,1.0,-0.096671,0.035872
G000830295,1157790.0,42.49,1309.0,92.968155,1.0,-0.17663,0.082367
G001871415,1534662.0,57.09,1517.0,88.962065,1.0,-0.10585,0.04784


In [23]:
dfm.shape

(20603, 7)

In [24]:
dfm = pd.concat([dfg[keys], dfm], axis=0)

In [25]:
dfm.head()

Unnamed: 0,genome,gc,proteins,coding,rrnas,MILC,ENCprime
G000005825,4249248.0,39.86,4260.0,85.144124,7.0,-0.80048,0.294172
G000006175,1936387.0,28.59,1727.0,80.167033,2.0,-0.536149,0.224257
G000006605,2476822.0,61.35,2137.0,89.378688,3.0,-0.622788,0.203396
G000006725,2731750.0,52.62,2664.0,82.59299,2.0,-0.404945,0.187888
G000006745,4033464.0,47.49,3594.0,86.533164,8.0,-0.965386,0.334472


In [26]:
dfm.shape

(31178, 7)

### Analysis - fine resolution (species)

In [27]:
ranks = dft.columns[::-1].tolist()[:-1]
ranks

['species', 'genus', 'family', 'order', 'class', 'phylum']

In [28]:
df['rank'] = None
df['node'] = None

Get lowest common ancestor

In [29]:
def lca2(tree, taxa):
    """Return the lowest node under `tree` whose tip set contains `taxa`.

    Starting at `tree`, repeatedly step into the first child whose `.taxa`
    set is a superset of `taxa`; stop at the node where no child qualifies.
    The descent is iterative, so very deep (ladder-like) trees do not hit
    Python's recursion limit.

    Parameters
    ----------
    tree : node object
        Starting node. It and every node below it must expose `.children`
        (iterable of nodes) and `.taxa` (set of tip names).
    taxa : set
        Tip names to cover. Callers are expected to pass a non-empty set:
        an empty set is a subset of every child's taxa, so the walk simply
        follows first children down to a tip.

    Returns
    -------
    node object
        The deepest node reached. If `taxa` is not contained in any child
        of `tree`, `tree` itself is returned.
    """
    node = tree
    while True:
        for child in node.children:
            if taxa.issubset(child.taxa):
                # Exactly one level down; keep descending from this child
                node = child
                break
        else:
            # No child covers all taxa: this node is the LCA
            return node

Determine placements in tree

In [30]:
# Place each cell-size entry on the reference tree at the lowest rank whose
# reference genomes form an exact clade (the LCA's tips equal the matched set).
# Many entries share the same genus/family/..., so the outcome for each
# (rank, taxon name) pair is computed once and reused.
placement_cache = {}  # (rank, name) -> (placed?, node name)

for idx, row in df.iterrows():
    # Match by ranks, from the lowest rank upward
    for rank in ranks:
        name = row[rank]
        # A missing rank value can never equal a reference name
        if pd.isna(name):
            continue
        cache_key = (rank, name)
        if cache_key not in placement_cache:
            # Genome IDs of reference taxa carrying this name at this rank
            taxa = set(dft.index[dft[rank] == name])
            outcome = (False, None)
            if taxa:
                node = lca2(tree, taxa)
                # Accept only if the clade holds no genome outside the taxon
                if not node.taxa.difference(taxa):
                    outcome = (True, node.name)
            placement_cache[cache_key] = outcome
        is_placed, node_name = placement_cache[cache_key]
        if not is_placed:
            continue
        df.at[idx, 'node'] = node_name
        df.at[idx, 'rank'] = rank
        # If placement found, exit
        break

Check results

In [31]:
df.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,kingdom,rank,node
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
taxid11,2.371708,1.06066,1.783187,7.902917,rod-shaped,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria,species,G000218545
taxid14,10.0,0.489898,1.854174,15.390598,rod-shaped,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,Dictyoglomia,Dictyoglomi,Bacteria,species,G000020965
taxid23,1.5,0.7,0.48747,3.298672,rod-shaped,Shewanella colwelliana,Shewanella,Shewanellaceae,Alteromonadales,Gammaproteobacteria,Proteobacteria,Bacteria,species,G000518705
taxid104,1.03923,0.34641,0.087062,1.130973,ring-shaped,Cyclobacterium marinum,Cyclobacterium,Cyclobacteriaceae,Cytophagales,Cytophagia,Bacteroidetes,Bacteria,species,G000222485
taxid114,2.04939,2.04939,4.506844,13.194689,spherical,Gemmata obscuriglobus,Gemmata,Gemmataceae,Gemmatales,Planctomycetia,Planctomycetes,Bacteria,species,G000171775


In [32]:
df.shape[0]

5702

In [33]:
df.dropna(subset=['node']).shape[0]

5566

In [34]:
# df.dropna(subset=['node'], inplace = True)
# df.shape

Check statistics of placements

In [35]:
for rank in ranks:
    print(rank, df.query(f'rank == "{rank}"').shape[0])

species 1491
genus 3085
family 804
order 61
class 111
phylum 14


### Last three ranks

Valid ranks are species, genus and family, i.e., the three lowest ranks.

In [36]:
valid_ranks = ('species', 'genus', 'family')

Make a copy of the tree.

In [37]:
placed = tree.copy()

Insert leaves into the tree, keeping only placements at the valid ranks (placements at higher levels are skipped).

In [38]:
inserted = []

In [39]:
# For every entry placed at one of the valid ranks, attach a new tip next to
# its matched node (as a child of that node's parent). The new branch length
# is the node's own branch length plus its median tip depth.
for taxid, entry in df[df['rank'].isin(valid_ranks)].iterrows():
    target = placed.find(entry['node'])
    sibling = TreeNode(name=taxid, length=target.length + target.median)
    target.parent.append(sibling)
    inserted.append(taxid)

Prune the tree to contain only insertions.

In [40]:
placed = placed.shear(inserted)

In [41]:
placed.prune()

In [42]:
placed.count(tips = True)

5380

Export tree

In [43]:
placed.write('place/fine_all.nwk')

'place/fine_all.nwk'

Filter data.

In [44]:
df_ = df.loc[inserted].copy()

Add metadata

In [45]:
for key in keys:
    df_[key] = df_['node'].map(dfm[key].to_dict())

In [46]:
df_.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,...,kingdom,rank,node,genome,gc,proteins,coding,rrnas,MILC,ENCprime
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
taxid11,2.371708,1.06066,1.783187,7.902917,rod-shaped,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,Actinomycetia,...,Bacteria,species,G000218545,3526441.0,73.81,3206.0,91.772782,2.0,-0.262005,0.100836
taxid14,10.0,0.489898,1.854174,15.390598,rod-shaped,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,Dictyoglomia,...,Bacteria,species,G000020965,1959987.0,33.74,1890.0,93.777255,2.0,-0.064429,0.030205
taxid23,1.5,0.7,0.48747,3.298672,rod-shaped,Shewanella colwelliana,Shewanella,Shewanellaceae,Alteromonadales,Gammaproteobacteria,...,Bacteria,species,G000518705,4575622.0,45.39,4094.0,87.383136,0.0,-0.653363,0.248987
taxid104,1.03923,0.34641,0.087062,1.130973,ring-shaped,Cyclobacterium marinum,Cyclobacterium,Cyclobacteriaceae,Cytophagales,Cytophagia,...,Bacteria,species,G000222485,6221273.0,38.15,5114.0,87.121317,3.0,-0.200638,0.102722
taxid114,2.04939,2.04939,4.506844,13.194689,spherical,Gemmata obscuriglobus,Gemmata,Gemmataceae,Gemmatales,Planctomycetia,...,Bacteria,species,G000171775,9161841.0,67.18,8067.0,84.083188,3.0,-0.154188,0.056476


In [47]:
df_.shape

(5380, 21)

In [48]:
#Statistics of placements
for rank in ranks:
    print(rank, df_.query(f'rank == "{rank}"').shape[0])

species 1491
genus 3085
family 804
order 0
class 0
phylum 0


Add a dumb hash column to calculate a phylogenetic signal

In [49]:
def dumbHash(name):
    """Map a name to a reproducible pseudo-random value in [0, 2.56].

    The built-in ``hash()`` of a string is salted per interpreter process
    (unless PYTHONHASHSEED is fixed), so values derived from it change on
    every kernel restart. This version derives the value from a SHA-256
    digest instead, so the same name always yields the same number.

    Parameters
    ----------
    name : str
        Label to hash. Non-string input is converted with ``str()`` first.

    Returns
    -------
    float
        Non-negative multiple of 0.01 between 0.0 and 2.56 inclusive.
    """
    import hashlib  # local import keeps this cell self-contained

    digest = hashlib.sha256(str(name).encode('utf-8')).digest()
    # Read the first 8 bytes as a signed 64-bit integer, the same value range
    # the original built-in hash() produced, then keep the top 9 bits.
    h = (int.from_bytes(digest[:8], 'big', signed=True) >> 55) / 100
    if h < 0:
        return -h
    else:
        return h

In [50]:
df_['hash'] = df_['species'].apply(dumbHash)

Export data

In [51]:
df_.to_csv('place/fine_all.tsv', sep = '\t')

Percentage of entries inferred by collapsing and averaging

In [52]:
(df_[df_['node'].str.contains('N')].shape[0] / df_.shape[0]) * 100

63.048327137546465

### Just species level

In [65]:
# Repeat the placement workflow using only entries matched at species rank.
placed = tree.copy()
inserted = []
species_rows = df[df['rank'] == 'species']
for taxid, entry in species_rows.iterrows():
    target = placed.find(entry['node'])
    # New tip hangs off the matched node's parent; its branch length is the
    # node's branch length plus the node's median tip depth
    sibling = TreeNode(name=taxid, length=target.length + target.median)
    target.parent.append(sibling)
    inserted.append(taxid)

# Keep only the inserted tips, collapse leftover single-child nodes, export
placed = placed.shear(inserted)
placed.prune()
placed.write('place/species_all.nwk')

# Attach node-level metadata to the placed entries and export the table
df_ = df.loc[inserted].copy()
for key in keys:
    node_values = dfm[key].to_dict()
    df_[key] = df_['node'].map(node_values)
df_.to_csv('place/species_all.tsv', sep='\t')

In [66]:
# Percentage of entries placed at internal nodes (values inferred as medians over descendant tips)
(df_[df_['node'].str.contains('N')].shape[0] / df_.shape[0]) * 100

8.58484238765929

In [67]:
df_.shape

(1491, 21)

In [68]:
df_[df_['node'].str.contains('N')].shape[0]

128

### Higher ranks

In [69]:
# For each higher rank: read that rank's cell size table, place every taxon
# at the LCA of its reference genomes (only when that clade contains nothing
# else), insert the placed taxa as tips, then export the sheared tree and the
# annotated table.
# NOTE(review): this loop rebinds `df`, `placed` and `inserted`; the
# species-level `df` from the earlier sections is no longer available after
# this cell, so re-run from the top before revisiting those sections.
# NOTE(review): the index prefix here is 'txid', whereas the species table
# uses 'taxid' - confirm whether the difference is intentional.

# Node metadata lookups do not depend on the rank, so build them once.
meta_maps = {key: dfm[key].to_dict() for key in keys}

for rank in 'genus', 'family', 'order', 'class', 'phylum':
    df = pd.read_table(f'../preprocess/annot/{rank}_all.tsv', index_col=0)
    df.index = 'txid' + df.index.astype(str)
    df['rank'] = None
    df['node'] = None

    # Match each taxon to the reference genomes carrying the same name
    for idx, row in df.iterrows():
        taxa = set(dft[dft[rank] == row[rank]].index)
        if not taxa:
            continue
        node = lca2(tree, taxa)
        # Skip if the clade also holds genomes outside this taxon
        if node.taxa.difference(taxa):
            continue
        df.at[idx, 'node'] = node.name
        df.at[idx, 'rank'] = rank
    df = df.dropna(subset=['node'])

    # Insert each placed taxon as a new tip beside its matched node
    placed = tree.copy()
    inserted = []
    for idx, row in df.iterrows():
        node = placed.find(row['node'])
        leaf = TreeNode(name=idx, length=node.length + node.median)
        node.parent.append(leaf)
        inserted.append(idx)

    # Keep only the inserted tips and export the tree
    placed = placed.shear(inserted)
    placed.prune()
    placed.write(f'place/{rank}_all.nwk')

    # Explicit copy (as in the species-level cells) before adding columns
    df = df.loc[inserted].copy()
    for key in keys:
        df[key] = df['node'].map(meta_maps[key])
    df.to_csv(f'place/{rank}_all.tsv', sep='\t')
    print(f'{rank}: {df.shape[0]}')

genus: 1143
family: 342
order: 141
class: 60
phylum: 27
