# Match cell size with NCBI taxonomy

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gmean
import time
import re
from bs4 import BeautifulSoup
import requests

In [2]:
# Interactive matplotlib plot
%matplotlib notebook

In [3]:
import lzma

### Cell size data

In [4]:
# Load the tab-separated cell-size table; the first column becomes the row index.
df = pd.read_csv(
    'data_BMvol4.tsv',
    sep='\t',
    index_col=0,
)
# Show (rows, columns) as the cell output.
df.shape

(346, 6)

### Reference taxonomy

Read NCBI taxonomy database

In [5]:
def _iter_dmp_rows(path):
    """Yield the list of fields for each row of an xz-compressed ``.dmp`` file.

    The file is opened in UTF-8 text mode and read line by line, so the
    decompressed dump is never held in memory as a whole.  For every line the
    trailing newline is removed, each tab-followed-by-pipe delimiter is
    dropped, and the remainder is split on tabs.
    """
    with lzma.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            yield line.rstrip('\r\n').replace('\t|', '').split('\t')


# taxID -> node record; 'name' and 'children' are filled in by the loops below.
dump = {}

# nodes.dmp: fields 0, 1 and 2 are stored as taxID (key), parent taxID and rank.
for x in _iter_dmp_rows('nodes.dmp.xz'):
    dump[x[0]] = {'parent': x[1], 'rank': x[2], 'name': '', 'children': set()}

# names.dmp: field 3 is the name class; only the 'scientific name' row is kept.
for x in _iter_dmp_rows('names.dmp.xz'):
    if x[3] == 'scientific name':
        dump[x[0]]['name'] = x[1]

# Register every node in its parent's 'children' set.
# A node listed as its own parent is skipped so it never becomes its own child.
for tid in dump:
    pid = dump[tid]['parent']
    if tid != pid:
        dump[pid]['children'].add(tid)

In [6]:
# Number of taxonomy nodes loaded into `dump`.
len(dump)

2375861

### Match by taxon name

Match species name

In [7]:
# Scientific name -> taxID lookup, restricted to nodes whose rank is 'species'.
# If two species nodes carry the same name, the one seen last in `dump` wins.
spnames = dict(
    (node['name'], taxid)
    for taxid, node in dump.items()
    if node['rank'] == 'species'
)

In [8]:
# Count how many names in the table exactly match a key of `spnames` (True) or do not (False).
df['name'].astype(str).isin(spnames).value_counts()

True     261
False     85
Name: name, dtype: int64

Add species taxID

In [9]:
# Look up each name in `spnames`; names that are not a key are left missing (NaN) in 'sptid'.
df['sptid'] = df['name'].map(spnames)

In [10]:
# Rows that still have no species taxID after the exact-name lookup.
unmatched = df['sptid'].isna()
tmp = df.loc[unmatched]
tmp.shape

(85, 7)

Match subspecies name

In [11]:
# Scientific name -> taxID lookup, restricted to nodes whose rank is 'subspecies'.
# If two subspecies nodes carry the same name, the one seen last in `dump` wins.
sspnames = dict(
    (node['name'], taxid)
    for taxid, node in dump.items()
    if node['rank'] == 'subspecies'
)

In [12]:
# Count how many of the still-unmatched names exactly match a key of `sspnames`.
tmp['name'].astype(str).isin(sspnames).value_counts()

False    85
Name: name, dtype: int64

Look up the remaining unmatched names in the [NCBI taxonomy browser](https://www.ncbi.nlm.nih.gov/taxonomy/).

In [13]:
# NOTE(review): this cell is commented out, presumably to avoid re-querying the NCBI
# website (one HTTP request per unmatched row) on every run.
# new_spnames = {}  # row index -> [name in the table, name returned by NCBI]
# for i, row in tmp.iterrows():
#     term = row['name']
#     # Let requests build and encode the query string instead of concatenating it by hand.
#     resp = requests.get(
#         'https://www.ncbi.nlm.nih.gov/taxonomy/',
#         params={'term': term, 'report': 'taxon', 'format': 'text'},
#         timeout=30,
#     )
#     resp.raise_for_status()
#     soup = BeautifulSoup(resp.content, "html.parser")
#     # The name is read from the first <pre> element; it may be absent from the response.
#     pre = soup.find('pre')
#     name = pre.text.strip() if pre is not None else None
#     if name in spnames:
#         df.at[i, 'sptid'] = spnames[name]
#         new_spnames[i] = [term, name]
#     else:
#         # No usable answer, or the returned name is not a species name in the local dump.
#         print(f'\tIndex: {i} not in dump taxID')

	Index: 28 not in dump taxID
	Index: 33 not in dump taxID
	Index: 38 not in dump taxID
	Index: 60 not in dump taxID
	Index: 63 not in dump taxID
	Index: 72 not in dump taxID
	Index: 84 not in dump taxID
	Index: 90 not in dump taxID
	Index: 113 not in dump taxID
	Index: 117 not in dump taxID
	Index: 132 not in dump taxID
	Index: 134 not in dump taxID
	Index: 139 not in dump taxID
	Index: 140 not in dump taxID
	Index: 141 not in dump taxID
	Index: 146 not in dump taxID
	Index: 155 not in dump taxID
	Index: 158 not in dump taxID
	Index: 163 not in dump taxID
	Index: 178 not in dump taxID
	Index: 179 not in dump taxID
	Index: 197 not in dump taxID
	Index: 199 not in dump taxID
	Index: 210 not in dump taxID
	Index: 221 not in dump taxID
	Index: 223 not in dump taxID
	Index: 337 not in dump taxID
	Index: 340 not in dump taxID


Replace the original names with the names returned by NCBI

In [14]:
# for k, v in new_spnames.items():
#     df.at[k, 'name'] = v[1]

Check the remaining

In [15]:
# df[df['sptid'].isna()]

Unnamed: 0,name,shape,length,width,volume,surface,sptid
28,Spirochaeta caldaria,helical,25.980762,0.244949,1.220467,19.992973,
33,Spirochaeta stenostrepta,helical,25.980762,0.244949,1.220467,19.992973,
38,Borrelia baltazardii,helical,12.0,0.424264,1.676467,15.994379,
60,Borrelia spielmanii,helical,9.486833,0.316228,0.736815,9.424778,
63,Borrelia turdi,helical,9.486833,0.316228,0.736815,9.424778,
72,Treponema carateum,helical,10.954451,0.18,0.27723,6.194596,
84,Treponema scoliodontus,helical,9.486833,0.316228,0.736815,9.424778,
90,Treponema orale,helical,9.797959,0.158114,0.191348,4.866934,
113,Leptospira genomospecies,rod,8.485281,0.1,0.066381,2.66573,
117,Mycoplasma agalactiae,spherical,0.489898,0.158114,0.008584,0.243347,


Manually assign the above entries by looking them up in the [NCBI taxonomy browser](https://www.ncbi.nlm.nih.gov/taxonomy/). (I made sure each new `sptid` is present in `dump`.)

Could not find the following:

* `Borrelia baltazardii`
* `Treponema carateum`
* `Treponema scoliodontus`
* `Treponema orale`
* `Mycoplasma haemotarandirangiferis`
* `Planctomyces guttaeformis`
* `Planctomyces stranskae`

<!-- <font color = 'red'> Beware! The index may change if you re-run the code after some time! </font> -->

In [16]:
# Manually curated assignments: row index -> (species taxID, current NCBI name).
# Keeping them in one table means each row index is written once, so the taxID and the
# name can no longer be sent to different rows (the original line for row 141 wrote the
# name into row 140).
# manual_assignments = {
#     28: ('215591', 'Gracilinema caldarium'),
#     33: ('152', 'Zuelzera stenostrepta'),
#     60: ('88916', 'Borreliella spielmanii'),
#     63: ('57863', 'Borreliella turdi'),
#     113: ('100053', 'Leptospira alexanderi'),
#     117: ('2110', 'Mycoplasmopsis agalactiae'),
#     132: ('2113', 'Mycoplasmopsis californica'),
#     134: ('29555', 'Mycoplasmopsis canis'),
#     139: ('114880', 'Mycoplasmopsis columbinasalis'),
#     140: ('114881', 'Mycoplasmopsis columbina'),
#     141: ('171282', 'Mycoplasmopsis columboralis'),
#     146: ('171284', 'Mycoplasmopsis cynos'),
#     # FIXME(review): '114881' is also assigned to row 140 above; two different species
#     # cannot share a taxID, so one of the two is presumably a copy-paste error.
#     # Verify against the NCBI taxonomy browser before relying on the per-species means.
#     155: ('114881', 'Mycoplasmopsis felis'),
#     158: ('29556', 'Mycoplasmopsis gallinacea'),
#     163: ('171285', 'Mycoplasmopsis glycophila'),
#     178: ('114885', 'Mycoplasmopsis maculosa'),
#     179: ('29561', 'Mycoplasmopsis meleagridis'),
#     197: ('55604', 'Mycoplasmopsis primatum'),
#     199: ('2107', 'Mycoplasmopsis pulmonis'),
#     210: ('171291', 'Mycoplasmopsis verecunda'),
#     223: ('219290', 'Mycoplasma vulturii'),
# }
# for idx, (taxid, name) in manual_assignments.items():
#     # Fail early if a hand-entered taxID is missing from the local dump.
#     assert taxid in dump, f'taxID {taxid} (row {idx}) not found in dump'
#     df.at[idx, 'sptid'] = taxid
#     df.at[idx, 'name'] = name

Check the remaining

In [17]:
# df[df['sptid'].isna()].shape

(7, 7)

In [18]:
# df[df['sptid'].isna()]

Unnamed: 0,name,shape,length,width,volume,surface,sptid
38,Borrelia baltazardii,helical,12.0,0.424264,1.676467,15.994379,
72,Treponema carateum,helical,10.954451,0.18,0.27723,6.194596,
84,Treponema scoliodontus,helical,9.486833,0.316228,0.736815,9.424778,
90,Treponema orale,helical,9.797959,0.158114,0.191348,4.866934,
221,Mycoplasma haemotarandirangiferis,rod,0.489898,0.158114,0.008584,0.243347,
337,Planctomyces guttaeformis,oval,2.949576,1.222702,2.984751,11.330005,
340,Planctomyces stranskae,oval,3.24037,1.596872,5.423648,16.256035,


In [19]:
# df[df['sptid'].isna()].index

Int64Index([38, 72, 84, 90, 221, 337, 340], dtype='int64')

Drop entries that could not be assigned

In [20]:
# Drop every row that still has no species taxID. Selecting by the missing value itself,
# rather than by hard-coded row labels, keeps this cell correct if the labels change.
# df = df.dropna(subset=['sptid'])

## Mean cell size

Check duplicate species

In [21]:
# df['sptid'].value_counts().value_counts()

1    327
2      6
Name: sptid, dtype: int64

Group organisms by species taxID and calculate geometric mean per species

In [22]:
# cols = ['length', 'width', 'volume', 'surface']

In [23]:
# dfr = df.groupby('sptid')[cols].agg(gmean)

Add shape and species name

In [24]:
# shapes = dict(df[['sptid', 'shape']].drop_duplicates('sptid').values)
# dfr['shape'] = dfr.index.map(shapes)

This step may change the names of some entries, because the species name is taken from `dump` rather than from the input table.

In [25]:
# dfr['species'] = dfr.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [26]:
# dfr = dfr.loc[sorted(dfr.index, key=int)]

In [27]:
# dfr.index.names = ['taxid']

In [28]:
# dfr.shape

(333, 6)

Fill more ranks

In [29]:
# ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [30]:
# res = {x: {} for x in ranks[1:]}

In [31]:
# rankset = set(ranks[1:])

In [32]:
# for tid in dfr.index.astype(str):
#     this = tid
#     while True:
#         rank = dump[this]['rank']
#         if rank in rankset:
#             res[rank][tid] = dump[this]['name']
#         parent = dump[this]['parent']
#         if this == parent:
#             break
#         this = parent

In [33]:
# for rank in ranks[1:]:
#     dfr[rank] = dfr.index.map(res[rank])

In [34]:
# # Correct entries with no species: a name without a space is replaced by NaN.
# # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# dfr['species'] = dfr['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.nan)

In [35]:
# dfr.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
114,2.04939,2.04939,4.506844,13.194689,spherical,Gemmata obscuriglobus,Gemmata,Gemmataceae,Gemmatales,Planctomycetia,Planctomycetes,Bacteria
119,1.122497,1.122497,0.74055,3.958407,spherical,Rubinisphaera brasiliensis,Rubinisphaera,Planctomycetaceae,Planctomycetales,Planctomycetia,Planctomycetes,Bacteria
120,1.284523,1.284523,1.109748,5.183628,spherical,Planctopirus limnophila,Planctopirus,Planctomycetaceae,Planctomycetales,Planctomycetia,Planctomycetes,Bacteria
122,1.224745,1.224745,0.961912,4.712389,spherical,Gimesia maris,Gimesia,Planctomycetaceae,Planctomycetales,Planctomycetia,Planctomycetes,Bacteria
124,1.414214,1.024695,0.884579,4.5526,oval,Blastopirellula marina,Blastopirellula,Pirellulaceae,Pirellulales,Planctomycetia,Planctomycetes,Bacteria


Output

In [36]:
# dfr.to_csv('annot/BM_vol4.tsv', sep = '\t')

Statistics

In [37]:
# for rank in ranks:
#     print(rank, dfr[rank].nunique())

species 333
genus 59
family 32
order 23
class 14
phylum 11
superkingdom 1


## Collapse to higher ranks

In [38]:
# def get_rank(tid, rank):
#     if tid == '1':
#         return None
#     if dump[tid]['rank'] == rank:
#         return tid
#     return get_rank(dump[tid]['parent'], rank)

In [39]:
# for rank in ranks[1:]:
#     #dfr = df.copy().drop(columns=['taxid'])
#     dfr = df.copy()
#     dfr['taxid'] = dfr['sptid'].apply(lambda x: get_rank(x, rank))
#     dfr = dfr.dropna(subset=['taxid'])
#     dfr = dfr.groupby('taxid')[cols].agg(gmean)
#     dfr = dfr.loc[sorted(dfr.index, key=int)]
#     ranks_ = ranks[ranks.index(rank):]
#     res = {x: {} for x in ranks_}
#     rankset = set(ranks_)
#     for tid in dfr.index.astype(str):
#         this = tid
#         while True:
#             rank_ = dump[this]['rank']
#             if rank_ in rankset:
#                 res[rank_][tid] = dump[this]['name']
#             parent = dump[this]['parent']
#             if this == parent:
#                 break
#             this = parent
#     for rank_ in ranks_:
#         dfr[rank_] = dfr.index.map(res[rank_])
#     dfr.to_csv(f'annot/{rank}_BM_vol4.tsv', sep='\t')
#     print(f'{rank}: {dfr.shape[0]}')

genus: 59
family: 32
order: 23
class: 14
phylum: 11
superkingdom: 1
