# Match cell size with NCBI taxonomy

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gmean
import time
import re
from bs4 import BeautifulSoup
import requests

In [2]:
# Interactive matplotlib plot
%matplotlib notebook

In [3]:
import lzma

### Cell size data

In [4]:
# Load the cell-size table: tab-separated, first column used as the row index.
df = pd.read_csv('data_BMvol5.tsv', sep='\t', index_col=0)
df.shape

(290, 6)

### Reference taxonomy

Read NCBI taxonomy database

In [5]:
def _iter_dmp_rows(path):
    """Yield the list of fields of every record in an xz-compressed ``.dmp`` file.

    The file is decoded as UTF-8 and streamed line by line, so the fully
    decompressed dump is never held in memory at once.
    """
    with lzma.open(path, 'rt', encoding='utf-8') as fh:
        for line in fh:
            # Every field is followed by "\t|"; dropping those markers leaves
            # plain tab-separated fields.
            yield line.rstrip('\r\n').replace('\t|', '').split('\t')


# taxID -> {'parent': parent taxID, 'rank': rank, 'name': scientific name,
#           'children': set of child taxIDs}; all taxIDs are kept as strings.
dump = {}

# Pass 1: one node per record of nodes.dmp (taxID, parent taxID, rank).
for x in _iter_dmp_rows('nodes.dmp.xz'):
    dump[x[0]] = {'parent': x[1], 'rank': x[2], 'name': '', 'children': set()}

# Pass 2: keep only the 'scientific name' entry of each taxon from names.dmp.
for x in _iter_dmp_rows('names.dmp.xz'):
    if x[3] == 'scientific name':
        dump[x[0]]['name'] = x[1]

# Pass 3: link children to parents; a node that is its own parent (the root)
# is skipped so it is not registered as its own child.
for tid, node in dump.items():
    pid = node['parent']
    if tid != pid:
        dump[pid]['children'].add(tid)

In [6]:
# Number of taxonomy nodes loaded into `dump`
len(dump)

2375861

### Match by taxon name

Match species name

In [7]:
# Scientific name -> taxID lookup, restricted to nodes of rank 'species'.
spnames = {
    info['name']: taxid
    for taxid, info in dump.items()
    if info['rank'] == 'species'
}

In [8]:
# How many organism names exactly match a species-rank scientific name?
is_species_name = df['name'].astype(str).isin(spnames)
is_species_name.value_counts()

True     211
False     79
Name: name, dtype: int64

Add species taxID

In [9]:
# Look up each organism name in `spnames`; names without an exact match
# end up as missing values in the new 'sptid' column.
sptid_by_row = df['name'].map(spnames)
df['sptid'] = sptid_by_row

Check the remaining entries (names with no exact species match)

In [10]:
# Rows that did not receive a species taxID from the exact-name match.
unmatched_mask = df['sptid'].isna()
tmp = df[unmatched_mask]
tmp.shape

(79, 7)

Match subspecies name

In [11]:
# Scientific name -> taxID lookup, restricted to nodes of rank 'subspecies'.
sspnames = {
    info['name']: taxid
    for taxid, info in dump.items()
    if info['rank'] == 'subspecies'
}

In [12]:
# Do any of the still-unmatched names match a subspecies-rank scientific name?
is_subspecies_name = tmp['name'].astype(str).isin(sspnames)
is_subspecies_name.value_counts()

False    79
Name: name, dtype: int64

Look up the remaining names with the [NCBI taxonomy browser](https://www.ncbi.nlm.nih.gov/taxonomy/).

In [13]:
# new_spnames = {}
# for i, row in tmp.iterrows():
#     term = row['name']
#     url = 'https://www.ncbi.nlm.nih.gov/taxonomy/?term=' + term.replace(' ', '+') + '&report=taxon&format=text'
#     resp = requests.get(url)
#     soup = BeautifulSoup(resp.content, "html.parser")
#     name = soup.find('pre').text.strip()
#     try:
#         df.at[i, 'sptid'] = spnames[name]
#         new_spnames[i] = [term, name]
#     except:
#         print(f'\tIndex: {i} not in dump taxID')

	Index: 12 not in dump taxID
	Index: 66 not in dump taxID
	Index: 90 not in dump taxID
	Index: 236 not in dump taxID
	Index: 247 not in dump taxID


Replace the original names with the newly assigned names

In [14]:
# for k, v in new_spnames.items():
#     df.at[k, 'name'] = v[1]

Check the remaining entries that still have no species taxID

In [15]:
# Display the rows that still have no species taxID.
df.loc[df['sptid'].isna()]

Unnamed: 0,name,shape,length,width,volume,surface,sptid
12,Bifidobacterium gallinarum,rods,1.581139,0.707107,0.528352,3.512407,
66,Mycobacterium chelonae,rods,2.44949,0.316228,0.184104,2.433467,
90,Rsukamurella spongiae,rods,2.5,1.0,1.701696,7.853982,
236,Yonghaparkia alkaliphilia,rods,0.916515,0.282843,0.051663,0.814394,
247,Propionibacterium propionicum,rods,3.872983,0.244949,0.178662,2.980376,


Manually assign the above entries by looking them up in the [NCBI taxonomy browser](https://www.ncbi.nlm.nih.gov/taxonomy/). (I made sure each new `sptid` is present in `dump`.)

<font color = 'red'> Beware! The index may change if you re-run the code after some time! </font>

In [16]:
# df.at[12, 'sptid'], df.at[12, 'name'] = '78448', 'Bifidobacterium pullorum'
# df.at[66, 'sptid'], df.at[66, 'name'] = '1774', 'Mycobacteroides chelonae'
# df.at[90, 'sptid'], df.at[90, 'name'] = '47312', 'Tsukamurella pulmonis'
# df.at[236, 'sptid'], df.at[236, 'name'] = '355930', 'Yonghaparkia alkaliphila'
# df.at[247, 'sptid'], df.at[247, 'name'] = '1750', 'Arachnia propionica'

Check the remaining

In [17]:
# df[df['sptid'].isna()].shape

(0, 7)

In [18]:
# df[df['sptid'].isna()]

Unnamed: 0,name,shape,length,width,volume,surface,sptid


### Mean cell size

Check duplicate species

In [19]:
# df['sptid'].value_counts().value_counts()

1    288
2      1
Name: sptid, dtype: int64

Group organisms by species TaxID and calculate geometric mean per species

In [20]:
# cols = ['length', 'width', 'volume', 'surface']

In [21]:
# dfr = df.groupby('sptid')[cols].agg(gmean)

Add shape and species name.

In [22]:
# shapes = dict(df[['sptid', 'shape']].drop_duplicates('sptid').values)
# dfr['shape'] = dfr.index.map(shapes)

This step may change the names of some entries according to the information available in `dump`.

In [23]:
# dfr['species'] = dfr.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [24]:
# dfr = dfr.loc[sorted(dfr.index, key=int)]

In [25]:
# dfr.index.names = ['taxid']

In [26]:
# dfr.shape

(289, 6)

Fill more ranks

In [27]:
# ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [28]:
# res = {x: {} for x in ranks[1:]}

In [29]:
# rankset = set(ranks[1:])

In [30]:
# for tid in dfr.index.astype(str):
#     this = tid
#     while True:
#         rank = dump[this]['rank']
#         if rank in rankset:
#             res[rank][tid] = dump[this]['name']
#         parent = dump[this]['parent']
#         if this == parent:
#             break
#         this = parent

In [31]:
# for rank in ranks[1:]:
#     dfr[rank] = dfr.index.map(res[rank])

In [32]:
# # Correct entries with no species
# dfr['species'] = dfr['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

In [33]:
# dfr.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1272,1.161895,1.161895,0.821295,4.24115,sphere,Kocuria varians,Kocuria,Micrococcaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria
1274,1.2,1.2,0.904779,4.523893,sphere,Dermacoccus nishinomiyaensis,Dermacoccus,Dermacoccaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria
1275,1.224745,1.224745,0.961912,4.712389,sphere,Kocuria rosea,Kocuria,Micrococcaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria
1276,0.938083,0.938083,0.432238,2.764602,sphere,Kytococcus sedentarius,Kytococcus,Kytococcaceae,Micrococcales,Actinomycetia,Actinobacteria,Bacteria
1381,1.549193,0.774597,0.608367,3.769911,rods,Atopobium minutum,Atopobium,Atopobiaceae,Coriobacteriales,Coriobacteriia,Actinobacteria,Bacteria


Output

In [34]:
# dfr.to_csv('annot/BM_vol5_actinobacteria.tsv', sep = '\t')

Statistics

In [35]:
# for rank in ranks:
#     print(rank, dfr[rank].nunique())

species 289
genus 98
family 38
order 16
class 5
phylum 1
superkingdom 1


## Collapse to higher ranks

In [36]:
# def get_rank(tid, rank):
#     if tid == '1':
#         return None
#     if dump[tid]['rank'] == rank:
#         return tid
#     return get_rank(dump[tid]['parent'], rank)

In [37]:
# for rank in ranks[1:]:
#     #dfr = df.copy().drop(columns=['taxid'])
#     dfr = df.copy()
#     dfr['taxid'] = dfr['sptid'].apply(lambda x: get_rank(x, rank))
#     dfr = dfr.dropna(subset=['taxid'])
#     dfr = dfr.groupby('taxid')[cols].agg(gmean)
#     dfr = dfr.loc[sorted(dfr.index, key=int)]
#     ranks_ = ranks[ranks.index(rank):]
#     res = {x: {} for x in ranks_}
#     rankset = set(ranks_)
#     for tid in dfr.index.astype(str):
#         this = tid
#         while True:
#             rank_ = dump[this]['rank']
#             if rank_ in rankset:
#                 res[rank_][tid] = dump[this]['name']
#             parent = dump[this]['parent']
#             if this == parent:
#                 break
#             this = parent
#     for rank_ in ranks_:
#         dfr[rank_] = dfr.index.map(res[rank_])
#     dfr.to_csv(f'annot/{rank}_BM_vol5_actinobacteria.tsv', sep='\t')
#     print(f'{rank}: {dfr.shape[0]}')

genus: 98
family: 38
order: 16
class: 5
phylum: 1
superkingdom: 1
