# Match cell size data with phylogeny

### Preparation

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
from skbio import TreeNode

In [3]:
# Create the output directory that later cells write into ('place/fine.nwk',
# 'place/fine.tsv', ...). exist_ok=True keeps the cell safe to re-run.
os.makedirs('place', exist_ok=True)

### Cell size data

In [4]:
# Species-level cell size table (tab-separated); the first column (taxid)
# becomes the index.
df = pd.read_csv('../preprocess/annot/species.tsv', sep='\t', index_col=0)
df.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11,2.371708,1.06066,1.783187,7.902917,rod-shaped,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria
14,10.0,0.489898,1.854174,15.390598,rod-shaped,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,Dictyoglomia,Dictyoglomi,Bacteria
23,1.5,0.7,0.48747,3.298672,rod-shaped,Shewanella colwelliana,Shewanella,Shewanellaceae,Alteromonadales,Gammaproteobacteria,Proteobacteria,Bacteria
104,1.03923,0.34641,0.087062,1.130973,ring-shaped,Cyclobacterium marinum,Cyclobacterium,Cyclobacteriaceae,Cytophagales,Cytophagia,Bacteroidetes,Bacteria
148,67.082039,0.244949,3.157319,51.621635,,Sediminispirochaeta bajacaliforniensis,Sediminispirochaeta,Spirochaetaceae,Spirochaetales,Spirochaetia,Spirochaetes,Bacteria


In [5]:
# Number of species-level records.
len(df)

4875

In [6]:
# Turn the numeric taxonomy IDs into string labels of the form 'txid<N>'
# (these labels are later used as leaf names in the placement tree).
# The guard makes the cell idempotent: re-running it must not stack the
# prefix ('txidtxid11').
taxid_labels = df.index.astype(str)
if not taxid_labels.str.startswith('txid').all():
    df.index = 'txid' + taxid_labels

### Reference taxa

Taxonomy (tax2tree-curated)

In [7]:
# Curated rank assignments of the reference genomes (tab-separated),
# indexed by genome ID.
dft = pd.read_csv('tax2tree/filled_ranks.tsv', sep='\t', index_col=0)
dft.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species
G000005825,Bacteria,Firmicutes_1,Bacilli_1,Bacillales_1,Bacillaceae_3,Alkalihalobacillus,Alkalihalobacillus pseudofirmus
G000006175,Archaea,Euryarchaeota_2,Methanococci,Methanococcales,Methanococcaceae,Methanococcus,Methanococcus voltae
G000006605,Bacteria,Actinobacteria,Actinomycetia,Corynebacteriales,Corynebacteriaceae,Corynebacterium,Corynebacterium falsenii
G000006725,Bacteria,Proteobacteria_1,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xylella,Xylella fastidiosa
G000006745,Bacteria,Proteobacteria_1,Gammaproteobacteria,Vibrionales,Vibrionaceae,,Vibrio cholerae


In [8]:
# Number of reference genomes with a taxonomy record.
len(dft)

10575

Genome metadata

In [9]:
# Per-genome metadata (tab-separated), indexed by genome ID.
dfg = pd.read_csv('genome.tsv', sep='\t', index_col=0)
dfg.head()

Unnamed: 0_level_0,scope,assembly_level,total_length,gc,proteins,protein_length,coding_density,completeness,contamination,strain_heterogeneity,16s_copies
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G000005825,Monoisolate,Complete Genome,4249248,39.86,4260,1228222,85.144124,98.68,1.32,0.0,7
G000006175,Multispecies,Complete Genome,1936387,28.59,1727,526927,80.167033,99.05,0.0,0.0,2
G000006605,Monoisolate,Complete Genome,2476822,61.35,2137,751284,89.378688,100.0,0.68,0.0,3
G000006725,Monoisolate,Complete Genome,2731750,52.62,2664,765931,82.59299,99.59,0.18,0.0,2
G000006745,Monoisolate,Complete Genome,4033464,47.49,3594,1184587,86.533164,99.86,0.03,0.0,8


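Correct genome length by completeness / contamination. (This correction is currently disabled: the cell below is commented out, so the uncorrected `total_length` is used downstream.)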

In [10]:
# dfg['effective_length'] = dfg['total_length'] / (dfg['completeness'] - dfg['contamination']) * 100

In [11]:
# Keep only the genome properties that are propagated onto the tree.
dfg = dfg.loc[:, ['total_length', 'gc', 'proteins', 'coding_density', '16s_copies']]

In [12]:
# Short column names, in the same order as the columns selected above:
# total_length -> genome, coding_density -> coding, 16s_copies -> rrnas.
dfg.columns = pd.Index(['genome', 'gc', 'proteins', 'coding', 'rrnas'])

### Reference phylogeny

In [13]:
# Load the reference phylogeny; its tip names are genome IDs (they are used
# below to look up rows of `dfg`).
tree = TreeNode.read('tree.nwk')

In [14]:
# Number of tips in the reference tree; expected to match the number of
# rows in the taxonomy table `dft`.
tree.count(tips=True)

10575

Calculate the median depth of each node, i.e. the median of the branch-length distances from the node to its descendant tips.

In [15]:
# Annotate every node with its descendant tip names (`taxa`), the distances
# to each of those tips (`depths`) and the median of these distances.
for node in tree.postorder(include_self=True):
    # A missing branch length is treated as zero.
    if node.length is None:
        node.length = 0.0
    if not node.is_tip():
        # Post-order traversal guarantees the children are annotated already.
        kids = node.children
        node.taxa = set().union(*(kid.taxa for kid in kids))
        node.depths = [depth + kid.length for kid in kids for depth in kid.depths]
        node.median = np.median(node.depths)
        continue
    # A tip contains only itself, at distance zero.
    node.taxa = {node.name}
    node.depths = [0.0]
    node.median = 0.0

Calculate node properties

In [16]:
# Genome properties propagated onto the tree (column names of `dfg`).
keys = [
    'genome',    # from 'total_length'
    'gc',
    'proteins',
    'coding',    # from 'coding_density'
    'rrnas',     # from '16s_copies'
]

In [17]:
# Accumulator for one record per tree node: [node name, *property values].
meta_ = list()

In [18]:
# Attach the genome properties in `keys` to every node and record one row
# per node in `meta_`:
#   - a tip carries the value observed for its own genome (row of `dfg`);
#   - an internal node carries the values of all its descendant tips, and
#     its recorded value is their mean.
# `meta_` is reset here so that re-running this cell does not append a
# second copy of every row.
meta_ = []
for node in tree.postorder(include_self=True):
    meta_.append([node.name])
    if node.is_tip():
        row = dfg.loc[node.name]
        for key in keys:
            val = row[key]
            setattr(node, key, [val])
            meta_[-1].append(val)
    else:
        for key in keys:
            # Pool the children's values unchanged. Branch lengths must NOT
            # be added here (unlike in the depth calculation): they are
            # distances, not genome size / GC / gene counts, and adding them
            # would bias every internal-node mean.
            vals = [y for x in node.children for y in getattr(x, key)]
            setattr(node, key, vals)
            meta_[-1].append(np.mean(vals))

In [19]:
# Tabulate the per-node records, indexed by node name.
meta_columns = ['ID', *keys]
dfm = pd.DataFrame(meta_, columns=meta_columns)
dfm = dfm.set_index('ID')

In [20]:
# Stack the per-genome table on top of the per-node table. The per-node
# table already contains a row for every tip, so drop the repeated labels
# (keeping the first occurrence) to leave a unique index; the tip values are
# identical in both sources.
dfm = pd.concat([dfg[keys], dfm], axis=0)
dfm = dfm[~dfm.index.duplicated(keep='first')]

In [21]:
# Preview the combined property table (tips come first, in `dfg` order).
dfm.head()

Unnamed: 0,genome,gc,proteins,coding,rrnas
G000005825,4249248.0,39.86,4260.0,85.144124,7.0
G000006175,1936387.0,28.59,1727.0,80.167033,2.0
G000006605,2476822.0,61.35,2137.0,89.378688,3.0
G000006725,2731750.0,52.62,2664.0,82.59299,2.0
G000006745,4033464.0,47.49,3594.0,86.533164,8.0


### Analysis - fine resolution (species)

In [22]:
# Taxonomic ranks from lowest to highest: the columns of `dft` walked
# backwards, stopping before the first column (kingdom).
ranks = dft.columns.tolist()[:0:-1]
ranks

['species', 'genus', 'family', 'order', 'class', 'phylum']

In [23]:
# Placeholders for the placement result: the rank at which a taxon was
# matched and the name of the tree node it was matched to.
for column in ('rank', 'node'):
    df[column] = None

Get lowest common ancestor

In [24]:
def lca2(tree, taxa):
    """Return the lowest node at or below `tree` whose tip set contains `taxa`.

    Every node must carry a `taxa` attribute holding the set of its
    descendant tip names. Starting at `tree`, the search keeps stepping into
    the first child whose `taxa` is a superset of the query and stops at the
    node where no child qualifies.
    """
    node = tree
    while True:
        for child in node.children:
            if taxa.issubset(child.taxa):
                node = child
                break
        else:
            # no child holds all queried taxa: this is the deepest match
            return node

Determine placements

In [25]:
# For every taxon, walk the ranks from lowest to highest and stop at the
# first rank whose reference genomes form exactly one clade of the tree
# (the clade's tips are the genomes of that group and nothing else).
#
# Many rows share the same genus/family/..., so the result for each
# (rank, taxon name) pair is memoised instead of re-scanning `dft` and
# re-searching the tree for every row.
_placement_cache = {}


def _find_clade(rank, name):
    """Return the node whose tips are exactly the genomes labelled `name`
    at `rank` in `dft`, or None when there is no such node."""
    cache_key = (rank, name)
    if cache_key not in _placement_cache:
        taxa = set(dft.index[dft[rank] == name])
        clade = lca2(tree, taxa) if taxa else None
        # reject a clade that also contains genomes from outside the group
        if clade is not None and clade.taxa.difference(taxa):
            clade = None
        _placement_cache[cache_key] = clade
    return _placement_cache[cache_key]


for idx, row in df.iterrows():
    for rank in ranks:
        node = _find_clade(rank, row[rank])
        if node is None:
            continue
        df.at[idx, 'node'] = node.name
        df.at[idx, 'rank'] = rank
        break

Check results

In [26]:
# Preview the table with the new 'rank' and 'node' placement columns.
df.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom,rank,node
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
txid11,2.371708,1.06066,1.783187,7.902917,rod-shaped,Cellulomonas gilvus,Cellulomonas,Cellulomonadaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria,species,G000218545
txid14,10.0,0.489898,1.854174,15.390598,rod-shaped,Dictyoglomus thermophilum,Dictyoglomus,Dictyoglomaceae,Dictyoglomales,Dictyoglomia,Dictyoglomi,Bacteria,species,G000020965
txid23,1.5,0.7,0.48747,3.298672,rod-shaped,Shewanella colwelliana,Shewanella,Shewanellaceae,Alteromonadales,Gammaproteobacteria,Proteobacteria,Bacteria,species,G000518705
txid104,1.03923,0.34641,0.087062,1.130973,ring-shaped,Cyclobacterium marinum,Cyclobacterium,Cyclobacteriaceae,Cytophagales,Cytophagia,Bacteroidetes,Bacteria,species,G000222485
txid148,67.082039,0.244949,3.157319,51.621635,,Sediminispirochaeta bajacaliforniensis,Sediminispirochaeta,Spirochaetaceae,Spirochaetales,Spirochaetia,Spirochaetes,Bacteria,genus,G000143985


In [27]:
# Total number of records (placed or not).
len(df)

4875

In [28]:
# Number of records that received a placement.
len(df.dropna(subset=['node']))

4746

In [29]:
# Number of records placed at each rank.
for rank in ranks:
    placed_at_rank = df[df['rank'] == rank]
    print(rank, len(placed_at_rank))

species 1140
genus 2718
family 719
order 58
class 102
phylum 9


### Last three ranks

Only placements at the three lowest ranks (species, genus and family) are considered valid here.

In [30]:
# Ranks at which a placement is accepted for the fine-resolution tree.
valid_ranks = 'species', 'genus', 'family'

Make a copy of the tree.

In [31]:
# Work on a copy so that the leaf insertions below do not modify the
# reference tree.
placed = tree.copy()

Insert leaves into the tree.

In [32]:
# Labels of the taxa that get inserted into the tree, in insertion order.
inserted = list()

In [33]:
# Insert every taxon placed at a valid rank as a new leaf that is a sibling
# of its matched node. The leaf's branch length is the matched node's branch
# length plus its median depth.
#
# All target nodes are resolved BEFORE the tree is modified. Looking nodes up
# by name while appending leaves in the same loop interleaves lookups with
# mutations (in scikit-bio, adding a child presumably invalidates the name
# lookup cache, forcing a rebuild on every iteration). The inserted leaf
# names ('txid...') are never lookup targets themselves, so resolving first
# yields the same tree.
targets = [(idx, placed.find(row['node']))
           for idx, row in df.iterrows()
           if row['rank'] in valid_ranks]
for idx, node in targets:
    leaf = TreeNode(name=idx, length=node.length + node.median)
    node.parent.append(leaf)
    inserted.append(idx)

Prune the tree to contain only insertions.

In [34]:
# Restrict the tree to the inserted leaves; shear() returns the reduced
# tree, which replaces `placed`.
placed = placed.shear(inserted)

In [35]:
# Tidy up the topology after shearing (presumably collapses nodes left with
# a single child; modifies `placed` in place, the result is not reassigned).
placed.prune()

Export tree.

In [36]:
# Save the placement tree; the 'place' directory must already exist.
placed.write('place/fine.nwk')

'place/fine.nwk'

In [37]:
# Number of leaves in the exported tree.
placed.count(tips=True)

4577

Filter data.

In [38]:
# Keep only the taxa that were inserted into the tree, in insertion order.
# The copy lets columns be added below without touching `df`.
df_ = df.loc[inserted].copy()

In [39]:
# Attach the genome properties of the matched node to every placed taxon.
for key in keys:
    # node name -> property value
    by_node = dfm[key].to_dict()
    df_[key] = df_['node'].map(by_node)

In [40]:
# (rows, columns); the row count should equal the number of tree leaves.
df_.shape

(4577, 19)

Export data.

In [41]:
# Save the placed taxa with their genome properties as a tab-separated file.
df_.to_csv('place/fine.tsv', sep='\t')

### Small tree

For testing purposes: a random subset of 100 of the placed taxa.

In [42]:
# Draw a reproducible random subset of 100 placed taxa and export both the
# subset table and the matching subtree.
df_ = df_.sample(n=100, random_state=42)
df_.to_csv('place/100.tsv', sep='\t')
subset_tips = df_.index.tolist()
placed_ = placed.shear(subset_tips)
placed_.prune()
placed_.write('place/100.nwk')

'place/100.nwk'

### Just species level

In [43]:
# Same procedure as the fine-resolution tree, restricted to taxa that were
# placed at species level.
placed = tree.copy()

# Resolve all target nodes before modifying the tree, so that name lookups
# are not interleaved with leaf insertions (adding a child presumably
# invalidates scikit-bio's name lookup cache). The inserted leaf names are
# never lookup targets, so the resulting tree is the same.
species_rows = df.query('rank == "species"')
targets = [(idx, placed.find(row['node'])) for idx, row in species_rows.iterrows()]

inserted = []
for idx, node in targets:
    # new leaf becomes a sibling of the matched node
    leaf = TreeNode(name=idx, length=node.length + node.median)
    node.parent.append(leaf)
    inserted.append(idx)

# keep only the inserted leaves and export the tree
placed = placed.shear(inserted)
placed.prune()
placed.write('place/species.nwk')

# export the matching table with the genome properties of the matched nodes
df_ = df.loc[inserted].copy()
for key in keys:
    df_[key] = df_['node'].map(dfm[key].to_dict())
df_.to_csv('place/species.tsv', sep='\t')

### Higher ranks

In [45]:
# Repeat the placement for the cell size tables aggregated at higher ranks.
# Each table is matched at its own rank only (no fallback to other ranks).
# NOTE: `df`, `placed` and `inserted` are rebound inside the loop, so the
# species-level objects of the same names are replaced once this cell runs.

# node name -> property value, built once instead of once per rank
property_maps = {key: dfm[key].to_dict() for key in keys}

for rank in 'genus', 'family', 'order', 'class', 'phylum':
    df = pd.read_table(f'../preprocess/annot/{rank}.tsv', index_col=0)
    df.index = 'txid' + df.index.astype(str)
    df['rank'] = None
    df['node'] = None

    # A taxon is placed only if its reference genomes form exactly one clade.
    for idx, row in df.iterrows():
        taxa = set(dft[dft[rank] == row[rank]].index)
        if not taxa:
            continue
        node = lca2(tree, taxa)
        if node.taxa.difference(taxa):
            continue
        df.at[idx, 'node'] = node.name
        df.at[idx, 'rank'] = rank
    df = df.dropna(subset=['node'])

    # Resolve all target nodes before modifying the tree, so that name
    # lookups are not interleaved with leaf insertions.
    placed = tree.copy()
    targets = [(idx, placed.find(row['node'])) for idx, row in df.iterrows()]
    inserted = []
    for idx, node in targets:
        leaf = TreeNode(name=idx, length=node.length + node.median)
        node.parent.append(leaf)
        inserted.append(idx)

    placed = placed.shear(inserted)
    placed.prune()
    placed.write(f'place/{rank}.nwk')

    # Copy before adding columns (as done for the species-level tables), so
    # the assignments below act on an independent frame rather than on a
    # selection of another one.
    df = df.loc[inserted].copy()
    for key in keys:
        df[key] = df['node'].map(property_maps[key])
    df.to_csv(f'place/{rank}.tsv', sep='\t')
    print(f'{rank}: {df.shape[0]}')

genus: 1027
family: 300
order: 122
class: 55
phylum: 25
