# Match cell size with NCBI taxonomy

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gmean
import time
import re
from bs4 import BeautifulSoup
import requests

In [2]:
# Interactive matplotlib plot
%matplotlib notebook

In [3]:
import lzma

### Cell size data

In [4]:
# Load the tab-separated cell-size table; the first column becomes the row index.
df = pd.read_csv('data_BMvol1.tsv', sep='\t', index_col=0)
df.shape

(103, 6)

### Reference taxonomy

Read NCBI taxonomy database

In [5]:
def _iter_dmp_rows(path):
    """Yield the list of fields for each line of an xz-compressed ``.dmp`` file.

    The file is opened in text mode and read line by line, so the
    decompressed content is never materialised in memory as a whole.
    Each line has its trailing newline removed, every ``'\\t|'`` marker
    dropped, and is then split on the remaining tab characters.
    """
    with lzma.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            yield line.rstrip('\r\n').replace('\t|', '').split('\t')


# taxID -> {'parent': parent taxID, 'rank': rank, 'name': scientific name,
#           'children': set of child taxIDs}; all taxIDs are kept as strings.
dump = {}
for x in _iter_dmp_rows('nodes.dmp.xz'):
    dump[x[0]] = {'parent': x[1], 'rank': x[2], 'name': '', 'children': set()}
for x in _iter_dmp_rows('names.dmp.xz'):
    # Only rows whose 4th field is 'scientific name' set the node's name.
    if x[3] == 'scientific name':
        dump[x[0]]['name'] = x[1]
# Register every node with its parent; a node that is its own parent
# (e.g. the root) is skipped so it does not become its own child.
for tid, node in dump.items():
    pid = node['parent']
    if tid != pid:
        dump[pid]['children'].add(tid)

In [6]:
# Number of taxonomy nodes loaded (one `dump` entry per record of nodes.dmp.xz).
len(dump)

2375861

### Match by taxon name

Match species name

In [7]:
# Lookup table: scientific name -> taxID, restricted to nodes ranked 'species'.
spnames = {
    node['name']: taxid
    for taxid, node in dump.items()
    if node['rank'] == 'species'
}

In [8]:
# Count how many names in the table have an exact match among the species names.
(
    df['name']
    .astype(str)
    .isin(spnames)
    .value_counts()
)

True     80
False    23
Name: name, dtype: int64

Add species taxID

In [9]:
# Look up each name in `spnames`; names without an entry get NaN in `sptid`.
df['sptid'] = df['name'].map(spnames)

Check the remaining

In [10]:
# Rows that did not receive a species taxID from the name lookup.
tmp = df.loc[df['sptid'].isna()]
tmp.shape

(23, 7)

Match subspecies name

In [11]:
# Lookup table: scientific name -> taxID, restricted to nodes ranked 'subspecies'.
sspnames = {
    node['name']: taxid
    for taxid, node in dump.items()
    if node['rank'] == 'subspecies'
}

In [12]:
# Count how many of the still-unmatched names match a subspecies name exactly.
(
    tmp['name']
    .astype(str)
    .isin(sspnames)
    .value_counts()
)

False    23
Name: name, dtype: int64

Look up the remaining names in the [NCBI taxonomy browser](https://www.ncbi.nlm.nih.gov/taxonomy/).

In [13]:
# NOTE(review): this cell is commented out, but the saved outputs further down
# were produced while it was active. Points to address before re-enabling it:
#   - the bare `except:` reports every failure inside the `try` as
#     "not in dump taxID"; the expected failure is a missing key in `spnames`,
#     so catching KeyError would be narrower;
#   - `soup.find('pre')` sits outside the `try`; if the response has no <pre>
#     element the `.text` access raises instead of being reported;
#   - `term` is placed in the URL with only spaces replaced, although some names
#     in the table contain non-ASCII characters (e.g. 'ü') - TODO confirm
#     whether URL-quoting is needed.
# new_spnames = {}
# for i, row in tmp.iterrows():
#     term = row['name']
#     url = 'https://www.ncbi.nlm.nih.gov/taxonomy/?term=' + term.replace(' ', '+') + '&report=taxon&format=text'
#     resp = requests.get(url)
#     soup = BeautifulSoup(resp.content, "html.parser")
#     name = soup.find('pre').text.strip()
#     try:
#         df.at[i, 'sptid'] = spnames[name]
#         new_spnames[i] = [term, name]
#     except:
#         print(f'\tIndex: {i} not in dump taxID')

	Index: 14 not in dump taxID
	Index: 22 not in dump taxID
	Index: 23 not in dump taxID
	Index: 50 not in dump taxID
	Index: 69 not in dump taxID
	Index: 86 not in dump taxID


Apply the newly matched names

In [14]:
# for k, v in new_spnames.items():
#     df.at[k, 'name'] = v[1]

Check the remaining

In [15]:
# df[df['sptid'].isna()]

Unnamed: 0,name,shape,length,width,volume,surface,sptid
14,Pyrodietium broekü,disk,0.866025,0.2,0.025113,0.54414,
22,Sulfurococcus mirabilis,spherical,1.414214,1.414214,1.480961,6.283185,
23,Sulfurococcus yellowstonensis,spherical,0.894427,0.894427,0.374657,2.513274,
50,Methanocaldococcus jannaschü,spherical,1.5,1.5,1.767146,7.068583,
69,Methanopianus funicola,disk,1.870829,1.414214,2.19821,8.311873,
86,Methanohalophilus mahü,spherical,1.0,1.0,0.523599,3.141593,


Manually assign the above entries by looking them up in the [NCBI taxonomy browser](https://www.ncbi.nlm.nih.gov/taxonomy/). (I made sure the new `sptid` is present in `dump`.)

<font color = 'red'> Beware! The indices reported above may change if you re-run the code after some time, because the NCBI taxonomy is updated frequently. Ran on 12/8/2023. </font>

Could not find:

* `Sulfurococcus mirabilis`
* `Sulfurococcus yellowstonensis`
* `Methanopianus funicola`

In [16]:
# Manual taxID / corrected-name assignments for entries the lookup could not match.
# NOTE(review): index 86 (Methanohalophilus mahii) is given taxID '2190', the same
# value assigned to index 50 (Methanocaldococcus jannaschii) on the line above.
# These are different species, so this looks like a copy-paste error that would
# merge both rows in the per-taxID aggregation. Verify the taxID for
# Methanohalophilus mahii in the NCBI taxonomy browser (believed to be 2176 -
# TODO confirm) before re-running.
# df.at[14, 'sptid'], df.at[14, 'name'] = '35616', 'Pyrodictium brockii'
# df.at[50, 'sptid'], df.at[50, 'name'] = '2190', 'Methanocaldococcus jannaschii'
# df.at[86, 'sptid'], df.at[86, 'name'] = '2190', 'Methanohalophilus mahii'

Check the remaining

In [17]:
# Rows still lacking a species taxID.
# NOTE(review): the saved 3-row output was produced while the lookup and
# manual-assignment cells above were active. With those cells commented out,
# a fresh run lists every row left unmatched by the species-name lookup.
df.loc[df['sptid'].isna()]

Unnamed: 0,name,shape,length,width,volume,surface,sptid
22,Sulfurococcus mirabilis,spherical,1.414214,1.414214,1.480961,6.283185,
23,Sulfurococcus yellowstonensis,spherical,0.894427,0.894427,0.374657,2.513274,
69,Methanopianus funicola,disk,1.870829,1.414214,2.19821,8.311873,


Drop entries that could not be assigned

In [18]:
# df.drop([22, 23, 69], inplace = True)

## Mean cell size

Check duplicate species

In [19]:
# df['sptid'].value_counts().value_counts()

1    89
2     4
3     1
Name: sptid, dtype: int64

Group organisms by species taxID and calculate geometric mean per species

In [20]:
# cols = ['length', 'width', 'volume', 'surface']

In [21]:
# dfr = df.groupby('sptid')[cols].agg(gmean)

Add shape and species name

In [22]:
# shapes = dict(df[['sptid', 'shape']].drop_duplicates('sptid').values)
# dfr['shape'] = dfr.index.map(shapes)

This step may change the name of some entries to match the scientific name stored in `dump`.

In [23]:
# dfr['species'] = dfr.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [24]:
# dfr = dfr.loc[sorted(dfr.index, key=int)]

In [25]:
# dfr.index.names = ['taxid']

In [26]:
# dfr.shape

(94, 6)

Fill more ranks

In [27]:
# ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [28]:
# res = {x: {} for x in ranks[1:]}

In [29]:
# rankset = set(ranks[1:])

In [30]:
# for tid in dfr.index.astype(str):
#     this = tid
#     while True:
#         rank = dump[this]['rank']
#         if rank in rankset:
#             res[rank][tid] = dump[this]['name']
#         parent = dump[this]['parent']
#         if this == parent:
#             break
#         this = parent

In [31]:
# for rank in ranks[1:]:
#     dfr[rank] = dfr.index.map(res[rank])

In [32]:
# # Correct entries with no species
# dfr['species'] = dfr['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

In [33]:
# dfr.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2161,12.247449,0.707107,4.717002,27.20699,rod,Methanobacterium bryantii,Methanobacterium,Methanobacteriaceae,Methanobacteriales,Methanobacteria,Euryarchaeota,Archaea
2162,5.477226,0.565685,1.329186,9.733869,rod,Methanobacterium formicicum,Methanobacterium,Methanobacteriaceae,Methanobacteriales,Methanobacteria,Euryarchaeota,Archaea
2163,3.872983,0.632456,1.150503,7.695299,rod,Methanobacterium ivanovii,Methanobacterium,Methanobacteriaceae,Methanobacteriales,Methanobacteria,Euryarchaeota,Archaea
2171,3.535534,0.5,0.661476,5.553604,rod,Methanobacterium palustre,Methanobacterium,Methanobacteriaceae,Methanobacteriales,Methanobacteria,Euryarchaeota,Archaea
2173,1.0,0.648074,0.258608,2.035985,rod,Methanobrevibacter smithii,Methanobrevibacter,Methanobacteriaceae,Methanobacteriales,Methanobacteria,Euryarchaeota,Archaea


Output

In [34]:
# dfr.to_csv('annot/BM_vol1.tsv', sep = '\t')

Statistics

In [35]:
# for rank in ranks:
#     print(rank, dfr[rank].nunique())

species 94
genus 43
family 17
order 9
class 5
phylum 2
superkingdom 1


## Collapse to higher ranks

In [36]:
# def get_rank(tid, rank):
#     if tid == '1':
#         return None
#     if dump[tid]['rank'] == rank:
#         return tid
#     return get_rank(dump[tid]['parent'], rank)

In [37]:
# For each rank above species: map every row's species taxID to its ancestor at
# that rank, take the geometric mean of the size columns per ancestor, attach
# the lineage names from that rank upward, and write one TSV per rank.
# NOTE(review): before re-enabling, consider that
#   - the inner `while True` lineage walk repeats the loop of cell In [30];
#     a shared helper would remove the duplication;
#   - `dfr` is rebound on every iteration, overwriting the species-level table
#     built earlier;
#   - `get_rank` indexes `dump` with each `sptid`, so rows whose `sptid` is NaN
#     must have been dropped beforehand.
# for rank in ranks[1:]:
#     #dfr = df.copy().drop(columns=['taxid'])
#     dfr = df.copy()
#     dfr['taxid'] = dfr['sptid'].apply(lambda x: get_rank(x, rank))
#     dfr = dfr.dropna(subset=['taxid'])
#     dfr = dfr.groupby('taxid')[cols].agg(gmean)
#     dfr = dfr.loc[sorted(dfr.index, key=int)]
#     ranks_ = ranks[ranks.index(rank):]
#     res = {x: {} for x in ranks_}
#     rankset = set(ranks_)
#     for tid in dfr.index.astype(str):
#         this = tid
#         while True:
#             rank_ = dump[this]['rank']
#             if rank_ in rankset:
#                 res[rank_][tid] = dump[this]['name']
#             parent = dump[this]['parent']
#             if this == parent:
#                 break
#             this = parent
#     for rank_ in ranks_:
#         dfr[rank_] = dfr.index.map(res[rank_])
#     dfr.to_csv(f'annot/{rank}_BM_vol1.tsv', sep='\t')
#     print(f'{rank}: {dfr.shape[0]}')

genus: 43
family: 17
order: 9
class: 5
phylum: 2
superkingdom: 1
