# Match cell size with NCBI taxonomy

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gmean
from Bio import Entrez
import time
import re

In [2]:
# Interactive matplotlib plot
%matplotlib notebook

In [3]:
import lzma

### Cell size data

In [4]:
# Load the manually curated cell-size table (tab-separated)
df = pd.read_csv('data_manually.tsv', sep='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,species,length,width,Shape,reference,volume,surface
0,0,Campylobacter jejuni,1.118034,1.118034,sphere/spiral,"Rodrigues, R.C., Pocheron, AL., Hernould, M. e...",0.731752,3.926991
1,1,Listeria monocytogenes,1.414214,1.0,rod,"Abdollah Jamshidi, Tayebeh Zeinali, ""Significa...",0.848921,4.442883
2,2,Staphylococcus aureus,1.0,1.0,sphere,"Monteiro, J. M., Fernandes, P. B., Vaz, F., Pe...",0.523599,3.141593
3,3,Streptococcus pneumoniae,0.790569,0.790569,sphere,"Patterson MJ. Streptococcus. In: Baron S, edit...",0.258713,1.963495
4,4,Campylobacter coli,0.316228,0.316228,sphere/rod,"Lansing M. Prescott, John P. Harley, and Donal...",0.016558,0.314159


In [5]:
# (rows, columns) of the table as loaded
df.shape

(54, 8)

In [6]:
# Reformat dataframe: drop the leftover 'Unnamed: 0' column and standardise
# the column names. No temporary 'key' column is created: the result of
# set_index('key') was never assigned, so that column had no effect before
# being dropped again.
df = (
    df
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'species': 'name', 'Shape': 'shape'})
)
df.head()

Unnamed: 0,name,length,width,shape,reference,volume,surface
0,Campylobacter jejuni,1.118034,1.118034,sphere/spiral,"Rodrigues, R.C., Pocheron, AL., Hernould, M. e...",0.731752,3.926991
1,Listeria monocytogenes,1.414214,1.0,rod,"Abdollah Jamshidi, Tayebeh Zeinali, ""Significa...",0.848921,4.442883
2,Staphylococcus aureus,1.0,1.0,sphere,"Monteiro, J. M., Fernandes, P. B., Vaz, F., Pe...",0.523599,3.141593
3,Streptococcus pneumoniae,0.790569,0.790569,sphere,"Patterson MJ. Streptococcus. In: Baron S, edit...",0.258713,1.963495
4,Campylobacter coli,0.316228,0.316228,sphere/rod,"Lansing M. Prescott, John P. Harley, and Donal...",0.016558,0.314159


### Reference taxonomy

Read NCBI taxonomy database.

In [7]:
def _split_dmp_record(record):
    """Split one line of a .dmp file into its fields.

    The line ending is stripped, every tab-pipe marker is removed, and the
    remainder is split on tabs.
    """
    return record.rstrip('\r\n').replace('\t|', '').split('\t')


# TaxID -> node record, built from the compressed taxonomy dump files
dump = {}

# Nodes file: field 0 is the key, field 1 the parent, field 2 the rank
with lzma.open('nodes.dmp.xz', 'rb') as nodes_file:
    for record in nodes_file.read().decode().splitlines():
        fields = _split_dmp_record(record)
        dump[fields[0]] = {
            'parent': fields[1],
            'rank': fields[2],
            'name': '',
            'children': set(),
        }

# Names file: only rows whose field 3 is 'scientific name' set the node name
with lzma.open('names.dmp.xz', 'rb') as names_file:
    for record in names_file.read().decode().splitlines():
        fields = _split_dmp_record(record)
        if fields[3] != 'scientific name':
            continue
        dump[fields[0]]['name'] = fields[1]

# Register every node with its parent; a node that is its own parent is skipped
for child_id, node in dump.items():
    parent_id = node['parent']
    if child_id != parent_id:
        dump[parent_id]['children'].add(child_id)

In [8]:
# Number of taxonomy nodes loaded
len(dump)

2375861

### Match by taxon name

Match species name

In [9]:
# Scientific name -> TaxID for every node of rank 'species'.
# If two species nodes share a name, the one seen later wins.
spnames = {}
for taxid, node in dump.items():
    if node['rank'] == 'species':
        spnames[node['name']] = taxid

In [10]:
# How many names have an exact match among the species names?
name_is_species = df['name'].astype(str).isin(spnames)
name_is_species.value_counts()

True     49
False     5
Name: name, dtype: int64

Add species TaxID.

In [11]:
# Look up each name in the species-name -> TaxID map; names without an
# exact match are left as NaN
df['sptid'] = df['name'].map(spnames)

Check the remaining unmatched entries.

In [12]:
# Rows that did not receive a TaxID
df.loc[df['sptid'].isna()]

Unnamed: 0,name,length,width,shape,reference,volume,surface,sptid
40,Pelagibacter ubique,0.573847,0.154919,rod,"Rappé, M., Connon, S., Vergin, K. et al. Culti...",0.009843368,0.279287,
44,Prochlorococcus,0.591608,0.591608,sphere,"Partensky, F., Hess, W. R., & Vaulot, D. (1999...",0.1084178,1.099557,
46,Cristispira anodontae,62.225397,0.979796,spiral,"Spirochaeta anodontae Keysselitz, Arb. a. d. k...",46.67059,191.537213,
47,Cristispira pinnae,24.494897,1.224745,spiral,"Spirochaete pinnae Gonder, Cent. f. Bakt., I A...",28.37641,94.24778,
53,Thiomargarita magnifica,9720.0,30.0,rod,https://doi.org/10.1101/2022.02.16.480423,6863595.0,916088.417787,


Manually assign the above entries by looking them up in the [NCBI taxonomy browser](https://www.ncbi.nlm.nih.gov/taxonomy/).

In [13]:
# TaxIDs looked up by hand for the names without an exact species match.
# Keyed by name rather than by row label, so the assignment cannot silently
# land on the wrong organism if the input table is reordered or extended.
manual_taxids = {
    'Pelagibacter ubique': '198252',
    'Prochlorococcus': '1218',
    'Cristispira anodontae': '44757',
    'Cristispira pinnae': '44757',
    # For Thiomargarita magnifica, assign genus taxID
    'Thiomargarita magnifica': '90372',
}

# Only fill rows that are still missing a TaxID; surrounding whitespace in
# the name is ignored for the lookup
manual_match = df['name'].astype(str).str.strip().map(manual_taxids)
df['sptid'] = df['sptid'].fillna(manual_match)

# Stop with a readable message if any organism is still without a TaxID
unmatched = df.loc[df['sptid'].isna(), 'name']
if not unmatched.empty:
    raise ValueError(f'No TaxID assigned for: {sorted(unmatched.astype(str))}')

In [14]:
def get_species_name(tid):
    """Return the name of the nearest species- or genus-rank node for a TaxID.

    Starting at ``tid``, follow the ``parent`` links in ``dump`` and return the
    ``name`` of the first node whose rank is ``'species'`` or ``'genus'``.

    Parameters
    ----------
    tid : str
        Key into ``dump``.

    Returns
    -------
    str or None
        The matching node's name, or ``None`` when a node that is its own
        parent is reached without finding one. The previous recursive version
        never terminated in that case and ended in ``RecursionError``.

    Raises
    ------
    KeyError
        If ``tid`` or one of its ancestors is not a key of ``dump``.
    """
    while True:
        node = dump[tid]
        if node['rank'] in ('species', 'genus'):
            return node['name']
        parent = node['parent']
        if parent == tid:
            # Self-parented node: nothing above it to inspect
            return None
        tid = parent

In [15]:
# Resolve every TaxID (as a string key) to its species/genus name
sptid_keys = df['sptid'].astype(str)
df['species'] = sptid_keys.map(get_species_name)

In [16]:
# The original name and the literature reference are not needed from here on
df = df.drop(columns=['name', 'reference'])

In [17]:
# Preview after adding 'sptid' and 'species' and dropping 'name' and 'reference'
df.head()

Unnamed: 0,length,width,shape,volume,surface,sptid,species
0,1.118034,1.118034,sphere/spiral,0.731752,3.926991,197,Campylobacter jejuni
1,1.414214,1.0,rod,0.848921,4.442883,1639,Listeria monocytogenes
2,1.0,1.0,sphere,0.523599,3.141593,1280,Staphylococcus aureus
3,0.790569,0.790569,sphere,0.258713,1.963495,1313,Streptococcus pneumoniae
4,0.316228,0.316228,sphere/rod,0.016558,0.314159,195,Campylobacter coli


### Mean cell size

Check duplicate species/genus

In [18]:
# Rows per TaxID, then how many TaxIDs have each row count
rows_per_taxid = df['sptid'].value_counts()
rows_per_taxid.value_counts()

1    52
2     1
Name: sptid, dtype: int64

Group organisms by species TaxID and calculate geometric mean per species/genus

In [19]:
# Numeric size columns that are averaged per group
cols = ['length', 'width', 'volume', 'surface']

In [20]:
# Geometric mean of each size column within every TaxID group
dfr = df.groupby('sptid')[cols].agg(gmean)

Add shape and species name.

In [21]:
# Shape of the first row seen for each TaxID
first_per_taxid = df.drop_duplicates('sptid')
shapes = dict(zip(first_per_taxid['sptid'], first_per_taxid['shape']))
dfr['shape'] = dfr.index.map(shapes)

In [22]:
# Name stored in the taxonomy for each TaxID in the index
dfr['species'] = [dump[str(taxid)]['name'] for taxid in dfr.index]

Organize

In [23]:
# Order rows by the integer value of their TaxID label (key=int)
dfr = dfr.loc[sorted(dfr.index, key=int)]

In [24]:
# Label the index column
dfr.index.name = 'taxid'

In [25]:
# (TaxID groups, columns) after aggregation
dfr.shape

(53, 6)

In [26]:
# Preview the per-TaxID table
dfr.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
160,10.954451,0.134164,0.154233,4.617179,spiral,Treponema pallidum
173,12.649111,0.1,0.099084,3.973835,spiral,Leptospira interrogans
195,0.316228,0.316228,0.016558,0.314159,sphere/rod,Campylobacter coli
197,1.118034,1.118034,0.731752,3.926991,sphere/spiral,Campylobacter jejuni
199,4.0,0.707107,1.478236,8.885766,spiral,Campylobacter concisus


Fill in the higher taxonomic ranks.

In [27]:
# Rank names used as column names, listed from 'species' up to 'superkingdom'
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [28]:
# One empty dict per rank after 'species' (ranks[1:])
res = {x: {} for x in ranks[1:]}

In [29]:
# Same ranks as a set, for membership tests
rankset = set(ranks[1:])

In [30]:
# For every TaxID, walk up the parent links and record the name found at
# each rank of interest; the walk ends at a node that is its own parent
for tid in dfr.index.astype(str):
    node_id = tid
    while True:
        node = dump[node_id]
        if node['rank'] in rankset:
            res[node['rank']][tid] = node['name']
        if node['parent'] == node_id:
            break
        node_id = node['parent']

In [31]:
# Add one column per rank after 'species', looked up by TaxID in res
for rank in ranks[1:]:
    dfr[rank] = dfr.index.map(res[rank])

In [32]:
# Correct entries with no species: a name without a space is treated as not
# being a species name and is replaced by NaN. Uses np.nan because the
# np.NaN alias was removed in NumPy 2.0.
dfr['species'] = dfr['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.nan)

In [33]:
# Preview with the rank columns added
dfr.head()

Unnamed: 0_level_0,length,width,volume,surface,shape,species,genus,family,order,class,phylum,superkingdom
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
160,10.954451,0.134164,0.154233,4.617179,spiral,Treponema pallidum,Treponema,Treponemataceae,Spirochaetales,Spirochaetia,Spirochaetes,Bacteria
173,12.649111,0.1,0.099084,3.973835,spiral,Leptospira interrogans,Leptospira,Leptospiraceae,Leptospirales,Spirochaetia,Spirochaetes,Bacteria
195,0.316228,0.316228,0.016558,0.314159,sphere/rod,Campylobacter coli,Campylobacter,Campylobacteraceae,Campylobacterales,Epsilonproteobacteria,Proteobacteria,Bacteria
197,1.118034,1.118034,0.731752,3.926991,sphere/spiral,Campylobacter jejuni,Campylobacter,Campylobacteraceae,Campylobacterales,Epsilonproteobacteria,Proteobacteria,Bacteria
199,4.0,0.707107,1.478236,8.885766,spiral,Campylobacter concisus,Campylobacter,Campylobacteraceae,Campylobacterales,Epsilonproteobacteria,Proteobacteria,Bacteria


Output

In [34]:
# Write the per-TaxID table as tab-separated text
dfr.to_csv('annot/species_common_extremes.tsv', sep = '\t')

Statistics

In [35]:
# Number of distinct values in each rank column
for rank in ranks:
    print(rank, dfr[rank].nunique())

species 50
genus 33
family 30
order 22
class 11
phylum 8
superkingdom 2


### Collapse to higher ranks

In [36]:
# Per-organism table that the rank-level collapse starts from
df.head()

Unnamed: 0,length,width,shape,volume,surface,sptid,species
0,1.118034,1.118034,sphere/spiral,0.731752,3.926991,197,Campylobacter jejuni
1,1.414214,1.0,rod,0.848921,4.442883,1639,Listeria monocytogenes
2,1.0,1.0,sphere,0.523599,3.141593,1280,Staphylococcus aureus
3,0.790569,0.790569,sphere,0.258713,1.963495,1313,Streptococcus pneumoniae
4,0.316228,0.316228,sphere/rod,0.016558,0.314159,195,Campylobacter coli


In [37]:
def get_rank(tid, rank):
    """Return the TaxID of the nearest node at ``rank`` on the lineage of ``tid``.

    The search starts at ``tid`` itself and follows ``parent`` links in
    ``dump``. It gives up and returns ``None`` as soon as the TaxID ``'1'``
    is reached; that node's own rank is never inspected.
    """
    current = tid
    while current != '1':
        node = dump[current]
        if node['rank'] == rank:
            return current
        current = node['parent']
    return None

In [38]:
def _lineage_names(taxid, wanted_ranks):
    """Collect names along the lineage of ``taxid``.

    Walks from ``taxid`` up the ``parent`` links in ``dump`` until a node that
    is its own parent has been processed, and returns ``{rank: name}`` for
    every visited node whose rank is in ``wanted_ranks``. If a rank occurs
    more than once, the node visited last wins.
    """
    found = {}
    this = taxid
    while True:
        node = dump[this]
        if node['rank'] in wanted_ranks:
            found[node['rank']] = node['name']
        if node['parent'] == this:
            return found
        this = node['parent']


for rank in ranks[1:]:
    # Use a separate frame so the species-level `dfr`, `res` and `rankset`
    # built earlier are not overwritten by this loop
    collapsed = df.copy()
    # TaxID of each row's ancestor at this rank (None when there is none)
    collapsed['taxid'] = collapsed['sptid'].apply(lambda x: get_rank(x, rank))
    collapsed = collapsed.dropna(subset=['taxid'])
    collapsed = collapsed.groupby('taxid')[cols].agg(gmean)
    collapsed = collapsed.loc[sorted(collapsed.index, key=int)]

    # Annotate with this rank and every rank listed after it
    ranks_ = ranks[ranks.index(rank):]
    wanted = set(ranks_)
    lineages = {tid: _lineage_names(tid, wanted) for tid in collapsed.index.astype(str)}
    for rank_ in ranks_:
        # TaxIDs with no node at rank_ are left out, so they map to NaN
        names_at_rank = {tid: names[rank_] for tid, names in lineages.items() if rank_ in names}
        collapsed[rank_] = collapsed.index.map(names_at_rank)

    collapsed.to_csv(f'annot/{rank}_common_extremes.tsv', sep='\t')
    print(f'{rank}: {collapsed.shape[0]}')

genus: 33
family: 30
order: 22
class: 11
phylum: 8
superkingdom: 2
