In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gmean
from Bio import Entrez
import time
import re

In [2]:
# Interactive matplotlib plot
%matplotlib notebook

In [3]:
import lzma

### Reference taxonomy

Read NCBI taxonomy database.

In [4]:
dump = {}
with lzma.open('nodes.dmp.xz', 'rb') as f:
    for line in f.read().decode().splitlines():
        x = line.rstrip('\r\n').replace('\t|', '').split('\t')
        dump[x[0]] = {'parent': x[1], 'rank': x[2], 'name': '', 'children': set()}
with lzma.open('names.dmp.xz', 'rb') as f:
    for line in f.read().decode().splitlines():
        x = line.rstrip('\r\n').replace('\t|', '').split('\t')
        if x[3] == 'scientific name':
            dump[x[0]]['name'] = x[1]
for tid in dump:
    pid = dump[tid]['parent']
    if tid != pid:
        dump[pid]['children'].add(tid)

In [5]:
len(dump)

2375861

### Data table with NCBI taxID

In [6]:
df = pd.read_table('data_cyanobacteria.tsv', index_col = 0)
df.head()

Unnamed: 0,length,taxID,spname,width,shape,volume,surface
5,1.095445,269084,Synechococcus elongatus PCC 6301,1.095445,sphere/spheroid,0.688288,3.769911
8,4.472136,118323,Oscillatoria acuminata,4.472136,sphere/spheroid,46.832098,62.831853
10,2.034699,1184,Leptolyngbya boryana,2.034699,sphere/spheroid,4.410615,13.006194
11,0.894427,59930,Cyanobium gracile,0.894427,sphere/spheroid,0.374657,2.513274
12,4.090623,113355,Geminocystis herdmanii PCC 6308,4.090623,sphere/spheroid,35.839929,52.5689


In [7]:
df.shape

(225, 7)

In [8]:
spnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'species'}

In [9]:
df['spname'].astype(str).isin(spnames).value_counts()

False    169
True      56
Name: spname, dtype: int64

In [10]:
tids = df['taxID'].astype(str)

In [11]:
np.unique(tids[tids.notna()].index.astype(str).isin(dump), return_counts=True)

(array([False,  True]), array([ 56, 169]))

It is better to match by taxID

Check if these TaxIDs are in dump.

In [12]:
df['taxID_rank'] = df['taxID'].astype(str).apply(lambda x: dump[x]['rank'] if x in dump.keys() else np.NaN)
df.head()

Unnamed: 0,length,taxID,spname,width,shape,volume,surface,taxID_rank
5,1.095445,269084,Synechococcus elongatus PCC 6301,1.095445,sphere/spheroid,0.688288,3.769911,strain
8,4.472136,118323,Oscillatoria acuminata,4.472136,sphere/spheroid,46.832098,62.831853,species
10,2.034699,1184,Leptolyngbya boryana,2.034699,sphere/spheroid,4.410615,13.006194,species
11,0.894427,59930,Cyanobium gracile,0.894427,sphere/spheroid,0.374657,2.513274,species
12,4.090623,113355,Geminocystis herdmanii PCC 6308,4.090623,sphere/spheroid,35.839929,52.5689,strain


Check entries that could not be assigned

In [13]:
df[df['taxID_rank'].isna()]

Unnamed: 0,length,taxID,spname,width,shape,volume,surface,taxID_rank
52,2.397916,2174229,Synechocystis sp.,2.397916,sphere/spheroid,7.219388,18.064158,


Assign taxID manually by looking at [NCBI taxonomy browser.](https://www.ncbi.nlm.nih.gov/taxonomy/) and check if new taxID exist in `dump`

In [14]:
dump['1148']

{'parent': '2640012',
 'rank': 'species',
 'name': 'Synechocystis sp. PCC 6803',
 'children': {'1080228', '1080229', '1080230', '1111707', '1111708'}}

In [15]:
df.at[52, 'taxID'] = 1148
df.at[52, 'taxID_rank'] = 'species'

Elevate subspecies to species

In [16]:
def get_species(tid):
    if dump[tid]['rank'] == 'species':
        return tid
    elif dump[tid]['rank'] == 'genus':
        return tid
    return get_species(dump[tid]['parent'])

In [17]:
def get_species_name(tid):
    if dump[tid]['rank'] == 'species':
        return dump[tid]['name']
    elif dump[tid]['rank'] == 'genus':
        return dump[tid]['name']
    return get_species_name(dump[tid]['parent'])

In [18]:
df['sptid'] = df['taxID'].astype(str).apply(get_species)

In [19]:
df['species'] = df['taxID'].astype(str).apply(get_species_name)

In [20]:
df.drop(columns = ['spname', 'taxID', 'taxID_rank'], inplace = True)

In [21]:
df.shape

(225, 7)

In [22]:
df.head()

Unnamed: 0,length,width,shape,volume,surface,sptid,species
5,1.095445,1.095445,sphere/spheroid,0.688288,3.769911,32046,Synechococcus elongatus
8,4.472136,4.472136,sphere/spheroid,46.832098,62.831853,118323,Oscillatoria acuminata
10,2.034699,2.034699,sphere/spheroid,4.410615,13.006194,1184,Leptolyngbya boryana
11,0.894427,0.894427,sphere/spheroid,0.374657,2.513274,59930,Cyanobium gracile
12,4.090623,4.090623,sphere/spheroid,35.839929,52.5689,669359,Geminocystis herdmanii


### Mean cell size

Check duplicate species/genus

In [23]:
df['sptid'].value_counts().value_counts()

1     74
2      7
5      5
6      3
4      2
3      2
33     1
15     1
13     1
10     1
9      1
Name: sptid, dtype: int64

Group organisms by species TaxID and calculate geometric mean per species/genus

In [24]:
cols = ['length', 'width', 'volume', 'surface']

In [25]:
dfr = df.groupby('sptid')[cols].agg(gmean)

Add shape and species name.

In [26]:
shapes = dict(df[['sptid', 'shape']].drop_duplicates('sptid').values)
dfr['shape'] = dfr.index.map(shapes)

In [27]:
dfr['species'] = dfr.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [28]:
dfr = dfr.loc[sorted(dfr.index, key=int)]

In [29]:
dfr.index.names = ['taxid']

In [30]:
dfr.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1126,3.24037,3.24037,17.814866,32.986723,sphere/spheroid,Microcystis aeruginosa
1129,1.309547,1.309547,1.175878,5.387561,sphere/spheroid,Synechococcus
1142,2.959481,2.959481,13.572041,27.515722,sphere/spheroid,Synechocystis
1144,3.741657,3.741657,27.427781,43.982297,sphere/spheroid,Synechocystis sp. PCC 6701
1148,2.397916,2.397916,7.219388,18.064158,sphere/spheroid,Synechocystis sp. PCC 6803


In [31]:
dfr.shape

(98, 6)

Fill more ranks

In [32]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [33]:
res = {x: {} for x in ranks[1:]}

In [34]:
rankset = set(ranks[1:])

In [35]:
rankset

{'class', 'family', 'genus', 'order', 'phylum', 'superkingdom'}

In [36]:
for tid in dfr.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [37]:
for rank in ranks[1:]:
    dfr[rank] = dfr.index.map(res[rank])

In [38]:
# Correct entries with no species
dfr['species'] = dfr['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

In [39]:
dfr.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1126,3.24037,3.24037,17.814866,32.986723,sphere/spheroid,Microcystis aeruginosa,Microcystis,Microcystaceae,Chroococcales,,Cyanobacteria,Bacteria
1129,1.309547,1.309547,1.175878,5.387561,sphere/spheroid,,Synechococcus,Synechococcaceae,Synechococcales,,Cyanobacteria,Bacteria
1142,2.959481,2.959481,13.572041,27.515722,sphere/spheroid,,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria
1144,3.741657,3.741657,27.427781,43.982297,sphere/spheroid,Synechocystis sp. PCC 6701,Geminocystis,Chroococcaceae,Chroococcales,,Cyanobacteria,Bacteria
1148,2.397916,2.397916,7.219388,18.064158,sphere/spheroid,Synechocystis sp. PCC 6803,Synechocystis,Merismopediaceae,Synechococcales,,Cyanobacteria,Bacteria


In [40]:
dfr.shape

(98, 12)

Output

In [41]:
dfr.to_csv('annot/species_cyanobacteria.tsv', sep = '\t')

Statistics

In [42]:
for rank in ranks:
    print(rank, dfr[rank].nunique())

species 75
genus 39
family 24
order 10
class 1
phylum 1
superkingdom 1


### Collapse to higher ranks

In [43]:
df.head()

Unnamed: 0,length,width,shape,volume,surface,sptid,species
5,1.095445,1.095445,sphere/spheroid,0.688288,3.769911,32046,Synechococcus elongatus
8,4.472136,4.472136,sphere/spheroid,46.832098,62.831853,118323,Oscillatoria acuminata
10,2.034699,2.034699,sphere/spheroid,4.410615,13.006194,1184,Leptolyngbya boryana
11,0.894427,0.894427,sphere/spheroid,0.374657,2.513274,59930,Cyanobium gracile
12,4.090623,4.090623,sphere/spheroid,35.839929,52.5689,669359,Geminocystis herdmanii


In [44]:
def get_rank(tid, rank):
    if tid == '1':
        return None
    if dump[tid]['rank'] == rank:
        return tid
    return get_rank(dump[tid]['parent'], rank)

In [45]:
for rank in ranks[1:]:
    #dfr = df.copy().drop(columns=['taxid'])
    dfr = df.copy()
    dfr['taxid'] = dfr['sptid'].apply(lambda x: get_rank(x, rank))
    dfr = dfr.dropna(subset=['taxid'])
    dfr = dfr.groupby('taxid')[cols].agg(gmean)
    dfr = dfr.loc[sorted(dfr.index, key=int)]
    ranks_ = ranks[ranks.index(rank):]
    res = {x: {} for x in ranks_}
    rankset = set(ranks_)
    for tid in dfr.index.astype(str):
        this = tid
        while True:
            rank_ = dump[this]['rank']
            if rank_ in rankset:
                res[rank_][tid] = dump[this]['name']
            parent = dump[this]['parent']
            if this == parent:
                break
            this = parent
    for rank_ in ranks_:
        dfr[rank_] = dfr.index.map(res[rank_])
    dfr.to_csv(f'annot/{rank}_cyanobacteria.tsv', sep='\t')
    print(f'{rank}: {dfr.shape[0]}')

genus: 39
family: 24
order: 10
class: 1
phylum: 1
superkingdom: 1
