# Match cell size data (cyanobacteria) with phylogeny

### Preparation

In [1]:
import numpy as np
import pandas as pd

In [2]:
from skbio import TreeNode

### Cell size data

In [3]:
df = pd.read_table('../preprocess/annot/species_cyanobacteria.tsv', index_col=0)
df.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1126,3.24037,3.24037,17.814866,32.986723,sphere/spheroid,Microcystis aeruginosa,Microcystis,Microcystaceae,Chroococcales,,Cyanobacteria,Bacteria
1129,1.309547,1.309547,1.175878,5.387561,sphere/spheroid,,Synechococcus,Synechococcaceae,Synechococcales,,Cyanobacteria,Bacteria
1142,2.959481,2.959481,13.572041,27.515722,sphere/spheroid,,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria
1144,3.741657,3.741657,27.427781,43.982297,sphere/spheroid,Synechocystis sp. PCC 6701,Geminocystis,Chroococcaceae,Chroococcales,,Cyanobacteria,Bacteria
1148,2.397916,2.397916,7.219388,18.064158,sphere/spheroid,Synechocystis sp. PCC 6803,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria


In [4]:
df.shape

(98, 12)

In [5]:
df.index = 'taxid' + df.index.astype(str)

### Reference taxa

Taxonomy (tax2tree-curated)

In [6]:
dft = pd.read_table('tax2tree/filled_ranks.tsv', index_col=0)
dft.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species
G000005825,Bacteria,Firmicutes_1,Bacilli_1,Bacillales_1,Bacillaceae_3,Alkalihalobacillus,Alkalihalobacillus pseudofirmus
G000006175,Archaea,Euryarchaeota_2,Methanococci,Methanococcales,Methanococcaceae,Methanococcus,Methanococcus voltae
G000006605,Bacteria,Actinobacteria,Actinomycetia,Corynebacteriales,Corynebacteriaceae,Corynebacterium,Corynebacterium falsenii
G000006725,Bacteria,Proteobacteria_1,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xylella,Xylella fastidiosa
G000006745,Bacteria,Proteobacteria_1,Gammaproteobacteria,Vibrionales,Vibrionaceae,,Vibrio cholerae


In [7]:
dft.shape[0]

10575

Genome metadata

In [8]:
dfg = pd.read_table('genome.tsv', index_col=0)
dfg.head()

Unnamed: 0_level_0,scope,assembly_level,total_length,gc,proteins,protein_length,coding_density,completeness,contamination,strain_heterogeneity,16s_copies
#genome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G000005825,Monoisolate,Complete Genome,4249248,39.86,4260,1228222,85.144124,98.68,1.32,0.0,7
G000006175,Multispecies,Complete Genome,1936387,28.59,1727,526927,80.167033,99.05,0.0,0.0,2
G000006605,Monoisolate,Complete Genome,2476822,61.35,2137,751284,89.378688,100.0,0.68,0.0,3
G000006725,Monoisolate,Complete Genome,2731750,52.62,2664,765931,82.59299,99.59,0.18,0.0,2
G000006745,Monoisolate,Complete Genome,4033464,47.49,3594,1184587,86.533164,99.86,0.03,0.0,8


In [9]:
dfg = dfg[['total_length', 'gc', 'proteins', 'coding_density', '16s_copies']]

In [10]:
dfg.columns = ['genome', 'gc', 'proteins', 'coding', 'rrnas']

### Reference phylogeny

In [11]:
tree = TreeNode.read('tree.nwk')

In [12]:
tree.count(tips=True)

10575

Calculate median node depths

In [13]:
for node in tree.postorder(include_self=True):
    if node.length is None:
        node.length = 0.0
    if node.is_tip():
        node.taxa = set([node.name])
        node.depths = [0.0]
        node.median = 0.0
    else:
        node.taxa = set().union(*[x.taxa for x in node.children])
        node.depths = [y + x.length for x in node.children for y in x.depths]
        node.median = np.median(node.depths)

Calculate node properties, i.e., match tips and internal nodes

In [14]:
keys = ['genome', 'gc', 'proteins', 'coding', 'rrnas']

In [15]:
meta_ = []

In [16]:
for node in tree.postorder(include_self=True):
    meta_.append([node.name])
    if node.is_tip():
        row = dfg.loc[node.name]
        # Keys of row
        for key in keys:
            val = row[key]
            setattr(node, key, [val])
            meta_[-1].append(val)
    else:
        for key in keys:
            vals = [y + x.length for x in node.children for y in getattr(x, key)] # why val is times x.length?
            setattr(node, key, vals)
            meta_[-1].append(np.mean(vals))

In [17]:
# Create new data frame
dfm = pd.DataFrame(meta_, columns = ['ID'] + keys).set_index('ID')

In [18]:
dfm.shape

(20603, 5)

In [19]:
dfm = pd.concat([dfg[keys], dfm], axis=0)

In [20]:
dfm.head()

Unnamed: 0,genome,gc,proteins,coding,rrnas
G000005825,4249248.0,39.86,4260.0,85.144124,7.0
G000006175,1936387.0,28.59,1727.0,80.167033,2.0
G000006605,2476822.0,61.35,2137.0,89.378688,3.0
G000006725,2731750.0,52.62,2664.0,82.59299,2.0
G000006745,4033464.0,47.49,3594.0,86.533164,8.0


In [21]:
dfm.shape

(31178, 5)

### Analysis - fine resolution (species)

In [22]:
ranks = dft.columns[::-1].tolist()[:-1]
ranks

['species', 'genus', 'family', 'order', 'class', 'phylum']

In [23]:
df['rank'] = None
df['node'] = None

Get lowest common ancestor

In [24]:
def lca2(tree, taxa):
    for child in tree.children:
        if taxa.issubset(child.taxa):
            return lca2(child, taxa)
    return tree

Determine placements in tree

In [25]:
for idx, row in df.iterrows():
    # Match by ranks
    for rank in ranks:
        # Match taxa and get genome ID
        taxa = set(dft[dft[rank] == row[rank]].index)
        if not taxa:
            continue
        # Get LCA
        node = lca2(tree, taxa)
        if node.taxa.difference(taxa):
            continue
        df.at[idx, 'node'] = node.name
        df.at[idx, 'rank'] = rank
        # If placement found, exit
        break

Check results

In [26]:
df.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom,rank,node
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
taxid1126,3.24037,3.24037,17.814866,32.986723,sphere/spheroid,Microcystis aeruginosa,Microcystis,Microcystaceae,Chroococcales,,Cyanobacteria,Bacteria,species,N6752
taxid1129,1.309547,1.309547,1.175878,5.387561,sphere/spheroid,,Synechococcus,Synechococcaceae,Synechococcales,,Cyanobacteria,Bacteria,phylum,N224
taxid1142,2.959481,2.959481,13.572041,27.515722,sphere/spheroid,,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria,genus,N5451
taxid1144,3.741657,3.741657,27.427781,43.982297,sphere/spheroid,Synechocystis sp. PCC 6701,Geminocystis,Chroococcaceae,Chroococcales,,Cyanobacteria,Bacteria,genus,N5887
taxid1148,2.397916,2.397916,7.219388,18.064158,sphere/spheroid,Synechocystis sp. PCC 6803,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria,species,N5888


In [27]:
df.shape

(98, 14)

In [28]:
df.dropna(subset=['node']).shape[0]

98

Check statistics of placements

In [29]:
for rank in ranks:
    print(rank, df.query(f'rank == "{rank}"').shape[0])

species 50
genus 34
family 10
order 0
class 0
phylum 4


### Last three ranks

Valid ranks would be species, genus and family.

In [30]:
valid_ranks = ('species', 'genus', 'family')

Make a copy of the tree.

In [31]:
placed = tree.copy()

Insert leafs into the tree, i.e., remove placements at higher levels.

In [32]:
inserted = []

In [33]:
for idx, row in df.iterrows():
    if row['rank'] in valid_ranks:
        node = placed.find(row['node'])
        leaf = TreeNode(name = idx, length = node.length + node.median)
        # Set leaf
        node.parent.append(leaf)
        inserted.append(idx)

Prune the tree to contain only insertions.

In [34]:
placed = placed.shear(inserted)

In [35]:
placed.prune()

Export tree

In [36]:
placed.write('place/fine_cyanobacteria.nwk')

'place/fine_cyanobacteria.nwk'

Filter data.

In [37]:
df_ = df.loc[inserted].copy()

Add metadata

In [38]:
for key in keys:
    df_[key] = df_['node'].map(dfm[key].to_dict())

In [39]:
df_.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom,rank,node,genome,gc,proteins,coding,rrnas
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
taxid1126,3.24037,3.24037,17.814866,32.986723,sphere/spheroid,Microcystis aeruginosa,Microcystis,Microcystaceae,Chroococcales,,Cyanobacteria,Bacteria,species,N6752,4940384.0,42.69248,4826.633909,81.566261,1.348195
taxid1142,2.959481,2.959481,13.572041,27.515722,sphere/spheroid,,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria,genus,N5451,3685717.0,47.639318,3429.768068,87.139429,2.018068
taxid1144,3.741657,3.741657,27.427781,43.982297,sphere/spheroid,Synechocystis sp. PCC 6701,Geminocystis,Chroococcaceae,Chroococcales,,Cyanobacteria,Bacteria,genus,N5887,4344739.0,33.848436,3989.528436,84.265219,2.028436
taxid1148,2.397916,2.397916,7.219388,18.064158,sphere/spheroid,Synechocystis sp. PCC 6803,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria,species,N5888,3678121.0,47.631437,3418.285722,87.160151,2.000008
taxid1152,2.029875,2.029875,4.379319,12.944595,sphere/spheroid,,Pseudanabaena,Pseudanabaenaceae,Pseudanabaenales,,Cyanobacteria,Bacteria,genus,N911,4756803.0,44.395336,4371.145336,83.924587,1.478669


In [40]:
df_.shape

(94, 19)

Export data

In [41]:
df_.to_csv('place/fine_cyanobacteria.tsv', sep = '\t')

### Just species level

In [42]:
placed = tree.copy()
inserted = []
for idx, row in df.query('rank == "species"').iterrows():
    node = placed.find(row['node'])
    leaf = TreeNode(name=idx, length=node.length + node.median)
    node.parent.append(leaf)
    inserted.append(idx)
placed = placed.shear(inserted)
placed.prune()
placed.write('place/species_cyanobacteria.nwk')
df_ = df.loc[inserted].copy()
for key in keys:
    df_[key] = df_['node'].map(dfm[key].to_dict())
df_.to_csv('place/species_cyanobacteria.tsv', sep='\t')

### Higher ranks

In [43]:
for rank in 'genus', 'family', 'order', 'class', 'phylum':
    df = pd.read_table(f'../preprocess/annot/{rank}_cyanobacteria.tsv', index_col=0)
    df.index = 'txid' + df.index.astype(str)
    df['rank'] = None
    df['node'] = None
    for idx, row in df.iterrows():
        taxa = set(dft[dft[rank] == row[rank]].index)
        if not taxa:
            continue
        node = lca2(tree, taxa)
        if node.taxa.difference(taxa):
            continue
        df.at[idx, 'node'] = node.name
        df.at[idx, 'rank'] = rank
    df = df.dropna(subset=['node'])
    placed = tree.copy()
    inserted = []
    for idx, row in df.iterrows():
        node = placed.find(row['node'])
        leaf = TreeNode(name=idx, length=node.length + node.median)
        node.parent.append(leaf)
        inserted.append(idx)
    placed = placed.shear(inserted)
    placed.prune()
    placed.write(f'place/{rank}.nwk')
    df = df.loc[inserted]
    for key in keys:
        df[key] = df['node'].map(dfm[key].to_dict())
    df.to_csv(f'place/{rank}_cyanobacteria.tsv', sep='\t')
    print(f'{rank}: {df.shape[0]}')

genus: 27
family: 17
order: 5
class: 1
phylum: 1
