# Match cell size data with phylogeny

In [1]:
import numpy as np
import pandas as pd

In [2]:
from skbio import TreeNode

Load data

In [3]:
# Load the species annotation table (first column becomes the index) and
# expose the 'superkingdom' column under the name 'kingdom'.
df = (
    pd.read_table('preprocess/annot/species_all.tsv', index_col=0)
    .rename(columns={'superkingdom': 'kingdom'})
)
df.shape

(5702, 19)

In [4]:
# Turn every index label into a string prefixed with 'taxid'.
# NOTE(review): not idempotent — re-running this cell prepends 'taxid' a second time.
df.index = 'taxid' + df.index.astype(str)

In [5]:
# Each of these columns holds a string; drop its first and last character
# (presumably the brackets of a stringified list — confirm against the input
# file), trim surrounding whitespace and remove all single quotes.
for column in ('length_ranges', 'width_ranges', 'sources'):
    df[column] = df[column].map(lambda text: text[1:-1].strip().replace("'", ''))

### Reference taxa

Taxonomy (tax2tree-curated)

In [6]:
dft = pd.read_table('../phylogeny/tax2tree/filled_ranks.tsv', index_col = 0)
dft.shape

(10575, 7)

Genome metadata

In [7]:
dfg = pd.read_table('../phylogeny/genome.tsv', index_col=0)
dfg.shape

(10575, 11)

In [8]:
# Keep only the five genome metadata columns that are used further down.
dfg = dfg[['total_length', 'gc', 'proteins', 'coding_density', '16s_copies']]

In [9]:
# Assign short column names by position. This relies on dfg holding exactly
# five columns in the order total_length, gc, proteins, coding_density,
# 16s_copies, i.e. total_length -> genome, coding_density -> coding,
# 16s_copies -> rrnas.
dfg.columns = ['genome', 'gc', 'proteins', 'coding', 'rrnas']

Add codon bias values

In [10]:
cb = pd.read_table('../codon_bias/cbias_corrected.tsv', header = None)
cb.shape

(10575, 3)

In [11]:
# The file was read without a header: name its three columns, then use the
# '#genome' column as the index.
cb.columns = ['#genome', 'MILC', 'ENCprime']
cb = cb.set_index('#genome')

In [12]:
# Add the codon bias columns of cb to the genome metadata, matching rows on
# the index (DataFrame.join defaults to a left join).
# NOTE(review): re-running this cell is expected to fail because dfg would
# already contain the columns of cb (overlapping names without a suffix).
dfg = dfg.join(cb)

### Reference phylogeny

In [13]:
tree = TreeNode.read('../phylogeny/tree.nwk')

In [14]:
tree.count(tips=True)

10575

Calculate median node depths, i.e., for every node the median branch-length distance to its descendant tips (0 for tips)

In [15]:
# Post-order traversal: every child is visited before its parent, so the
# attributes of the children are available when a parent is processed.
for node in tree.postorder(include_self=True):
    # Treat a missing branch length as zero
    if node.length is None:
        node.length = 0.0
    if not node.is_tip():
        kids = node.children
        # Names of all tips below this node
        node.taxa = set().union(*(kid.taxa for kid in kids))
        # Distance from this node to every tip below it
        node.depths = [depth + kid.length for kid in kids for depth in kid.depths]
        node.median = np.median(node.depths)
        continue
    # A tip contains only itself, at distance zero
    node.taxa = {node.name}
    node.depths = [0.0]
    node.median = 0.0

Calculate node properties: tips take the values of their genome, and internal nodes take the median over all of their descendant tips

In [16]:
keys = ['genome', 'gc', 'proteins', 'coding', 'rrnas', 'MILC', 'ENCprime']

In [17]:
meta_ = []

In [18]:
# Collect one record per node: [node name, one value per entry of `keys`].
# The list is created here (not only in a separate cell) so that re-running
# this cell does not append a second copy of every node to an existing list.
meta_ = []
for node in tree.postorder(include_self=True):
    record = [node.name]
    if node.is_tip():
        # Tip: use the values of the genome with the same name. Each value is
        # stored on the node as a one-element list so that parents can pool
        # the lists of their children.
        row = dfg.loc[node.name]
        for key in keys:
            val = row[key]
            setattr(node, key, [val])
            record.append(val)
    else:
        # Internal node: pool the values stored on the children (already
        # processed thanks to the post-order traversal), which amounts to the
        # values of all descendant tips, and report their median.
        for key in keys:
            vals = [y for x in node.children for y in getattr(x, key)]
            setattr(node, key, vals)
            record.append(np.median(vals))
    meta_.append(record)

In [19]:
# Create new data frame
dfm = pd.DataFrame(meta_, columns = ['ID'] + keys).set_index('ID')

In [20]:
dfm.shape

(20603, 7)

In [21]:
# Stack the per-genome rows of dfg on top of the per-node rows of dfm.
# NOTE(review): the per-node table was filled from a traversal that records
# tips as well as internal nodes, so tip IDs presumably occur twice in the
# resulting index (10575 + 20603 = 31178 rows) — confirm this is intended.
# NOTE(review): not idempotent — re-running this cell stacks dfg once more.
dfm = pd.concat([dfg[keys], dfm], axis=0)

In [22]:
dfm.shape

(31178, 7)

### Analysis - fine resolution (species)

In [23]:
ranks = dft.columns[::-1].tolist()[:-1]
ranks

['species', 'genus', 'family', 'order', 'class', 'phylum']

In [24]:
df['rank'] = None
df['node'] = None

Get lowest common ancestor

In [25]:
def lca2(tree, taxa):
    """Find the lowest node whose descendant tips include all of `taxa`.

    Starting at `tree`, repeatedly step into the first child whose `taxa`
    attribute is a superset of `taxa`, and stop at the node where no child
    qualifies.

    The descent is iterative rather than recursive so that very deep trees
    cannot exceed Python's recursion limit.

    Parameters
    ----------
    tree : node
        Node to start from. It and its descendants must expose `children`
        (iterable of nodes) and `taxa` (set of tip names).
    taxa : set
        Tip names to locate.

    Returns
    -------
    node
        The deepest node reached. `tree` itself is returned when none of its
        children contains every name in `taxa`; this function does not check
        that `tree.taxa` contains them.
    """
    node = tree
    while True:
        for child in node.children:
            if taxa.issubset(child.taxa):
                # All requested taxa sit below this child: keep descending
                node = child
                break
        else:
            # No single child covers every requested taxon: this is the LCA
            return node

Determine placements in tree

In [26]:
for idx, row in df.iterrows():
    # Walk through the ranks in the order of `ranks` and stop at the first
    # rank that gives a clean placement
    for rank in ranks:
        # Reference genomes carrying the same name as this row at this rank
        genomes = set(dft.loc[dft[rank] == row[rank]].index)
        if not genomes:
            continue
        # Lowest node that contains all of these genomes
        ancestor = lca2(tree, genomes)
        # Accept the node only if it contains no genome outside the group
        if ancestor.taxa <= genomes:
            df.at[idx, 'node'] = ancestor.name
            df.at[idx, 'rank'] = rank
            break

Check results

In [27]:
df.head()

Unnamed: 0_level_0,length_gmean,width_gmean,volume_gmean,surface_gmean,length_amean,width_amean,volume_amean,surface_amean,shape,length_ranges,...,sources,species,genus,family,order,class,phylum,kingdom,rank,node
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
taxid11,2.371708,1.06066,1.783187,7.902917,2.625,1.125,2.236544,9.277516,rod-shaped,"1.5, 3.75",...,bacdive,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria,species,G000218545
taxid14,10.0,0.489898,1.854174,15.390598,12.5,0.5,2.421644,19.634954,rod-shaped,"5.0, 20.0",...,bacdive,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,Dictyoglomia,Dictyoglomi,Bacteria,species,G000020965
taxid23,1.5,0.7,0.48747,3.298672,1.5,0.7,0.48747,3.298672,rod-shaped,"1.5, 1.5",...,bacdive,Shewanella colwelliana,Shewanella,Shewanellaceae,Alteromonadales,Gammaproteobacteria,Proteobacteria,Bacteria,species,G000518705
taxid104,1.03923,0.34641,0.087062,1.130973,1.05,0.35,0.089797,1.154535,ring-shaped,"0.9, 1.2",...,bacdive,Cyclobacterium marinum,Cyclobacterium,Cyclobacteriaceae,Cytophagales,Cytophagia,Bacteroidetes,Bacteria,species,G000222485
taxid114,2.04939,2.04939,4.506844,13.194689,2.2,2.2,5.57528,15.205308,spherical,"1.4, 3.0",...,bm4,Gemmata obscuriglobus,Gemmata,Gemmataceae,Gemmatales,Planctomycetia,Planctomycetes,Bacteria,species,G000171775


In [28]:
df.shape

(5702, 21)

In [29]:
df.dropna(subset = ['node']).shape

(5566, 21)

Check statistics of placements

In [30]:
for rank in ranks:
    print(rank, df.query(f'rank == "{rank}"').shape[0])

species 1491
genus 3085
family 804
order 61
class 111
phylum 14


### Lowest three ranks

Valid ranks are species, genus and family; placements at order level or above are discarded.

In [31]:
valid_ranks = ('species', 'genus', 'family')

Make a copy of the tree.

In [32]:
placed = tree.copy()

Insert leaves into the tree. Only placements at the valid ranks are inserted; placements at higher levels are left out.

In [33]:
inserted = []

In [34]:
# Start from an empty list so that re-running this cell does not accumulate
# duplicate entries from a previous run.
# NOTE(review): `placed` is modified below, so it should be a fresh copy of
# `tree` whenever this cell is executed again.
inserted = []
for idx, row in df.iterrows():
    # Skip rows without a placement or with a placement above the valid ranks
    if row['rank'] not in valid_ranks:
        continue
    node = placed.find(row['node'])
    # The new leaf becomes a sibling of the matched node; its branch length
    # is the node's own branch length plus the node's `median` attribute
    leaf = TreeNode(name = idx, length = node.length + node.median)
    node.parent.append(leaf)
    inserted.append(idx)

Prune the tree to contain only insertions.

In [35]:
placed = placed.shear(inserted)

In [36]:
placed.prune()

In [37]:
placed.count(tips = True)

5380

Export tree

In [38]:
placed.write('place/fine_all.nwk')

'place/fine_all.nwk'

Filter data.

In [39]:
df_ = df.loc[inserted].copy()

Add metadata

In [40]:
# Look up each property by the name of the node a row was placed at.
# NOTE(review): if dfm's index contains repeated IDs, to_dict() presumably
# keeps the last value for each ID — confirm the repeated rows are identical.
for key in keys:
    df_[key] = df_['node'].map(dfm[key].to_dict())

In [41]:
df_.shape

(5380, 28)

In [42]:
#Statistics of placements
for rank in ranks:
    print(rank, df_.query(f'rank == "{rank}"').shape[0])

species 1491
genus 3085
family 804
order 0
class 0
phylum 0


Save dataframe

In [43]:
df_.to_csv('place/fine_all.tsv', sep = '\t')

# Only BacDive

In [47]:
# Keep the rows whose 'sources' string is exactly 'bacdive'.
# NOTE(review): rows that list 'bacdive' together with other sources would
# presumably not match this equality test — confirm this is the intent.
df_bacd = df[df['sources'] == 'bacdive'].copy()

In [48]:
df_bacd.shape

(4842, 21)

In [49]:
df_bacd.dropna(subset = ['node']).shape

(4713, 21)

Check statistics of placements

In [50]:
for rank in ranks:
    print(rank, df_bacd.query(f'rank == "{rank}"').shape[0])

species 1130
genus 2700
family 716
order 58
class 100
phylum 9


### Lowest three ranks

Valid ranks are species, genus and family; placements at order level or above are discarded.

In [53]:
valid_ranks = ('species', 'genus', 'family')

In [52]:
placed_bacd = tree.copy()

Insert leaves into the tree. Only placements at the valid ranks are inserted; placements at higher levels are left out.

In [55]:
inserted = []

In [56]:
# Start from an empty list so that this cell neither depends on a separate
# initialisation cell nor picks up entries left in `inserted` by an earlier
# run or section.
# NOTE(review): `placed_bacd` is modified below, so it should be a fresh copy
# of `tree` whenever this cell is executed again.
inserted = []
for idx, row in df_bacd.iterrows():
    # Skip rows without a placement or with a placement above the valid ranks
    if row['rank'] not in valid_ranks:
        continue
    node = placed_bacd.find(row['node'])
    # The new leaf becomes a sibling of the matched node; its branch length
    # is the node's own branch length plus the node's `median` attribute
    leaf = TreeNode(name = idx, length = node.length + node.median)
    node.parent.append(leaf)
    inserted.append(idx)

Prune the tree to contain only insertions.

In [57]:
placed_bacd = placed_bacd.shear(inserted)

In [58]:
placed_bacd.prune()

In [59]:
placed_bacd.count(tips = True)

4546

Export tree

In [62]:
placed_bacd.write('place/fine_bacd_all.nwk')

'place/fine_bacd_all.nwk'

Filter data

In [63]:
df_ = df_bacd.loc[inserted].copy()

Add metadata

In [65]:
# Look up each property by the name of the node a row was placed at.
# NOTE(review): if dfm's index contains repeated IDs, to_dict() presumably
# keeps the last value for each ID — confirm the repeated rows are identical.
for key in keys:
    df_[key] = df_['node'].map(dfm[key].to_dict())

In [66]:
df_.shape

(4546, 28)

In [67]:
#Statistics of placements
for rank in ranks:
    print(rank, df_.query(f'rank == "{rank}"').shape[0])

species 1130
genus 2700
family 716
order 0
class 0
phylum 0


Save dataframe

In [68]:
df_.to_csv('place/fine_bacd_all.tsv', sep = '\t')