# Taxonomy of all collected data (BacDive, PCC, PubMed, Bergey's manual) with different means

In [1]:
import lzma

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from scipy.stats.mstats import gmean

## Reference taxonomy

NCBI taxonomy database

In [4]:
%%time
dump = {}
with lzma.open('../preprocess/nodes.dmp.xz', 'rb') as f:
    for line in f.read().decode().splitlines():
        x = line.rstrip('\r\n').replace('\t|', '').split('\t')
        dump[x[0]] = {'parent': x[1], 'rank': x[2], 'name': '', 'children': set()}
with lzma.open('../preprocess/names.dmp.xz', 'rb') as f:
    for line in f.read().decode().splitlines():
        x = line.rstrip('\r\n').replace('\t|', '').split('\t')
        if x[3] == 'scientific name':
            dump[x[0]]['name'] = x[1]
for tid in dump:
    pid = dump[tid]['parent']
    if tid != pid:
        dump[pid]['children'].add(tid)

CPU times: user 12.2 s, sys: 727 ms, total: 13 s
Wall time: 13 s


Load data tables with taxonomic annotations

# BacDive

In [5]:
df_bacd = pd.read_table('preprocess/data_bacdive.tsv', sep = '\t')
df_bacd.shape

(4908, 23)

Drop useless columns

In [6]:
columns = ['Unnamed: 0', 'designation_header', 'strain_number_header', 'is_type_strain_header', 'Reference', 'Length', 'Width',
          'length_ranges', 'width_ranges']
df_bacd = df_bacd.drop(columns, axis = 1)

In [7]:
df_bacd.set_index('ID', inplace = True)

Rename columns

In [8]:
df_bacd.rename(columns = {'Name': 'species', 'Cell Shape': 'shape', 'TaxID': 'taxid', }, inplace = True)

### Match species to NCBI taxonomy

In [9]:
tids = df_bacd['taxid']

In [10]:
np.unique(tids[tids.notna()].index.astype(str).isin(dump), return_counts=True)

(array([False,  True]), array([1328, 2884]))

Match species name

In [11]:
spnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'species'}

In [12]:
df_bacd['species'].isin(spnames).value_counts()

species
True     4737
False     171
Name: count, dtype: int64

Add species taxid

In [13]:
df_bacd['sptid'] = df_bacd['species'].map(spnames)

Match subspecies name

In [14]:
sspnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'subspecies'}

In [15]:
df_bacd[df_bacd['sptid'].isna()]['species'].isin(sspnames).value_counts()

species
False    160
True      11
Name: count, dtype: int64

Elevate subspecies to species

In [16]:
def get_species(tid):
    if dump[tid]['rank'] == 'species':
        return tid
    return get_species(dump[tid]['parent'])

Add species taxid to subspecies

In [17]:
# for idx, val in df_bacd[df_bacd['sptid'].isna()]['species'].iteritems():
for idx, val in df_bacd[df_bacd['sptid'].isna()]['species'].items():
    if val in sspnames:
        df_bacd.at[idx, 'sptid'] = get_species(sspnames[val])

Check the remaining

In [18]:
df_bacd_ = df_bacd[df_bacd['sptid'].isna()].dropna(subset=['taxid'])[['species', 'taxid']].copy()
df_bacd_['taxid'] = df_bacd_['taxid'].astype(int).astype(str)
df_bacd_.head()

Unnamed: 0_level_0,species,taxid
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
140290,Actinomyces liubingyangii,1921764
158436,Actinomyces tangfeifanii,1912795
22982,Adlercreutzia caecicola,747645
13647,Agrobacterium skierniewicense,1368417
134060,Algibacter wandonensis,1343161


In [19]:
df_bacd_.shape

(142, 2)

Check if these TaxIDs exist in taxdump.

In [20]:
df_bacd_ = df_bacd_[df_bacd_['taxid'].isin(dump)]
df_bacd_.shape[0]

138

Check if these taxids are species

In [21]:
dfx_bacd = df_bacd_[df_bacd_['taxid'].apply(lambda x: dump[x]['rank'] == 'species')]
dfx_bacd.shape[0]

119

Add species taxids

In [22]:
# for idx, val in dfx_bacd['taxid'].iteritems():
for idx, val in dfx_bacd['taxid'].items():
    df_bacd.at[idx, 'sptid'] = val

If not species, check if they are lower than species.

In [23]:
dfx_bacd = df_bacd_[df_bacd_['taxid'].apply(lambda x: dump[x]['rank'] != 'species')]
dfx_bacd.shape[0]

19

In [24]:
dfx_bacd['taxid'].apply(lambda x: dump[x]['rank']).head()

ID
13647    strain
2095     strain
2099     strain
1225     strain
1254     strain
Name: taxid, dtype: object

Elevate ranks to species

In [25]:
tids_ = dfx_bacd['taxid'].apply(get_species)

In [26]:
# for idx, val in tids_.iteritems():
for idx, val in tids_.items():
    df_bacd.at[idx, 'sptid'] = val
    df_bacd.at[idx, 'sptid'] = val

Check remaining

In [27]:
df_bacd_ = df_bacd[df_bacd['sptid'].isna()]
df_bacd_.head()

Unnamed: 0_level_0,species,shape,taxid,length_gmean,width_gmean,length_amean,width_amean,volume_gmean,surface_gmean,volume_amean,surface_amean,length_ranges_fix,width_ranges_fix,sptid
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
134060,Algibacter wandonensis,rod-shaped,1343161.0,3.3,0.3,3.3,0.3,0.226195,3.110177,0.226195,3.110177,"[3.3, 3.3]","[0.3, 0.3]",
140729,Altererythrobacter aquaemixtae,rod-shaped,,0.387298,0.282843,0.4,0.3,0.018411,0.344144,0.021206,0.376991,"[0.3, 0.5]","[0.2, 0.4]",
140987,Altererythrobacter halimionae,rod-shaped,,2.37916,0.69282,2.575,0.73,0.80986,5.178382,0.975893,5.905409,"[1.59, 3.56]","[0.5, 0.96]",
158561,Altererythrobacter mangrovi,rod-shaped,,1.232883,0.424264,1.35,0.45,0.154302,1.643266,0.190852,1.908518,"[0.8, 1.9]","[0.3, 0.6]",
140815,Altererythrobacter rigui,rod-shaped,,0.604483,0.493559,0.605,0.5,0.084175,0.937287,0.086067,0.950332,"[0.58, 0.63]","[0.42, 0.58]",


In [28]:
df_bacd_.shape[0]

22

Manually curate these entries

In [29]:
tidmap = {'134060': '221126',
          '140729': '1958940',
          '140987': '1926630',
          '158561': '1982042',
          '140815': '1708790',
          '140743': '1476466',
          '158655': '1912871',
          '140702': '1904463',
          '158725': '1983720',
          '158610': '1707093',
          '141013': '2036016',
          '140739': '1505588',
          '158534': '1967665',
          '141131': '1462440',
          '140758': '1850444',
          '133146': '1542729',
          '134303': '81932',
          '140782': '1807134',
          '134174': '323273',
          '24597':  '1276936',
          '1496':   '698769'}

In [30]:
for key, val in tidmap.items():
    df_bacd.at[int(key), 'sptid'] = val

In [31]:
df_bacd = df_bacd.dropna(subset=['sptid'])
df_bacd.shape[0]

4907

Curate species

In [32]:
df_bacd.at[3073, 'shape'] = 'rod-shaped'

In [33]:
shapes = dict(df_bacd[['sptid', 'shape']].drop_duplicates('sptid').values)

In [34]:
ranges_length = dict(df_bacd[['sptid', 'length_ranges_fix']].drop_duplicates('sptid').values)
ranges_length = dict(df_bacd[['sptid', 'width_ranges_fix']].drop_duplicates('sptid').values)

Check duplicate species

In [35]:
df_bacd['sptid'].value_counts().value_counts()

count
1    4847
2      24
3       4
Name: count, dtype: int64

Group organisms by species taxid and calculate geometric/arithmetic mean per species.

In [36]:
cols_gmean = ['length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean']
cols_amean = ['length_amean', 'width_amean', 'volume_amean', 'surface_amean']

In [37]:
tmp_gmean = df_bacd.groupby('sptid')[cols_gmean].agg(gmean)
tmp_amean = df_bacd.groupby('sptid')[cols_amean].agg('mean')

In [38]:
dfr_bacd = pd.concat([tmp_gmean, tmp_amean], axis = 1)

Add shape, ranges, and species name.

In [39]:
dfr_bacd['shape'] = dfr_bacd.index.map(shapes)

In [40]:
def map_ranges(df, metric):
    ranges = {}
    for idx, row in df.iterrows():
        tid = row['sptid']
        rng = row[f'{metric}_ranges_fix'][1:-1].split(',')
        rng = [rng[0], rng[1]]
        if tid not in ranges.keys():
            ranges[tid] = rng
        else:
            ranges[tid].append(rng[0])
            ranges[tid].append(rng[1])
    return ranges

In [41]:
ranges_length = map_ranges(df_bacd, 'length')
ranges_width = map_ranges(df_bacd, 'width')

In [42]:
dfr_bacd['length_ranges'] = dfr_bacd.index.map(ranges_length)
dfr_bacd['width_ranges'] = dfr_bacd.index.map(ranges_width)

In [43]:
dfr_bacd['species'] = dfr_bacd.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [44]:
dfr_bacd = dfr_bacd.loc[sorted(dfr_bacd.index, key = int)]
dfr_bacd.index.names = ['taxid']

In [45]:
dfr_bacd.shape

(4875, 12)

Fill more ranks

In [46]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [47]:
res = {x: {} for x in ranks[1:]}

In [48]:
rankset = set(ranks[1:])

In [49]:
for tid in dfr_bacd.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [50]:
for rank in ranks[1:]:
    dfr_bacd[rank] = dfr_bacd.index.map(res[rank])

In [51]:
dfr_bacd['source'] = 'bacdive'

Statistics

In [52]:
for rank in ranks:
    print(rank, dfr_bacd[rank].nunique())

species 4875
genus 1665
family 384
order 160
class 68
phylum 32
superkingdom 2


Save dataframe

In [53]:
dfr_bacd.to_csv('preprocess/annot/species_bacdive.tsv', sep = '\t')

# PCC

In [54]:
df_pcc = pd.read_table('preprocess/data_pcc.tsv', index_col = 0)
df_pcc.shape

(225, 17)

Drop useless columns

In [55]:
df_pcc.columns

Index(['Length', 'taxID', 'Name', 'Width', 'shape', 'length_gmean',
       'width_gmean', 'length_amean', 'width_amean', 'volume_gmean',
       'surface_gmean', 'volume_amean', 'surface_amean', 'length_ranges',
       'width_ranges', 'length_ranges_fix', 'width_ranges_fix'],
      dtype='object')

In [56]:
columns = ['Length', 'Width',
          'length_ranges', 'width_ranges']
df_pcc = df_pcc.drop(columns, axis = 1)

Rename columns

In [57]:
df_pcc.rename(columns = {'Name': 'species', 'taxID': 'taxid', }, inplace = True)

### Match species to NCBI taxonomy

In [58]:
spnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'species'}

In [59]:
df_pcc['species'].astype(str).isin(spnames).value_counts()

species
False    169
True      56
Name: count, dtype: int64

In [60]:
tids = df_pcc['taxid'].astype(str)

In [61]:
np.unique(tids[tids.notna()].index.astype(str).isin(dump), return_counts = True)

(array([False,  True]), array([ 56, 169]))

Check if taxids are un dump

In [62]:
df_pcc['taxid_rank'] = df_pcc['taxid'].astype(str).apply(lambda x: dump[x]['rank'] if x in dump.keys() else np.NaN)
df_pcc.head()

Unnamed: 0,taxid,species,shape,length_gmean,width_gmean,length_amean,width_amean,volume_gmean,surface_gmean,volume_amean,surface_amean,length_ranges_fix,width_ranges_fix,taxid_rank
5,269084,Synechococcus elongatus PCC 6301,sphere/spheroid,1.095445,1.095445,1.1,1.1,0.688288,3.769911,0.69691,3.801327,"[1.0, 1.2]","[1.0, 1.2]",strain
8,118323,Oscillatoria acuminata,sphere/spheroid,4.472136,4.472136,4.5,4.5,46.832098,62.831853,47.712938,63.617251,"[4.0, 5.0]","[4.0, 5.0]",species
10,1184,Leptolyngbya boryana,sphere/spheroid,2.034699,2.034699,2.05,2.05,4.410615,13.006194,4.510869,13.202543,"[1.8, 2.3]","[1.8, 2.3]",species
11,59930,Cyanobium gracile,sphere/spheroid,0.894427,0.894427,0.9,0.9,0.374657,2.513274,0.381704,2.54469,"[0.8, 1.0]","[0.8, 1.0]",species
12,113355,Geminocystis herdmanii PCC 6308,sphere/spheroid,4.090623,4.090623,4.125,4.125,35.839929,52.5689,36.751112,53.456162,"[3.5, 5.0]","[3.5, 5.0]",strain


Check entries that could not be assigned

In [63]:
df_pcc[df_pcc['taxid_rank'].isna()]

Unnamed: 0,taxid,species,shape,length_gmean,width_gmean,length_amean,width_amean,volume_gmean,surface_gmean,volume_amean,surface_amean,length_ranges_fix,width_ranges_fix,taxid_rank
52,2174229,Synechocystis sp.,sphere/spheroid,2.397916,2.397916,2.4,2.4,7.219388,18.064158,7.238229,18.095574,"[2.3, 2.5]","[2.3, 2.5]",


Assign manually

In [64]:
df_pcc.at[52, 'taxid'] = 1148
df_pcc.at[52, 'taxID_rank'] = 'species'

Elevate subspecies to species

In [65]:
def get_species(tid):
    if dump[tid]['rank'] == 'species':
        return tid
    elif dump[tid]['rank'] == 'genus':
        return tid
    return get_species(dump[tid]['parent'])

In [66]:
def get_species_name(tid):
    if dump[tid]['rank'] == 'species':
        return dump[tid]['name']
    elif dump[tid]['rank'] == 'genus':
        return dump[tid]['name']
    return get_species_name(dump[tid]['parent'])

In [67]:
df_pcc['sptid'] = df_pcc['taxid'].astype(str).apply(get_species)

In [68]:
df_pcc['species'] = df_pcc['taxid'].astype(str).apply(get_species_name)

In [69]:
df_pcc.drop(columns = ['taxid', 'taxID_rank'], inplace = True)

In [70]:
df_pcc.shape

(225, 14)

Check duplicate species

In [71]:
df_pcc['sptid'].value_counts().value_counts()

count
1     74
2      7
5      5
6      3
4      2
3      2
33     1
15     1
13     1
10     1
9      1
Name: count, dtype: int64

Group organisms by species taxid and calculate geometric/arithmetic mean per species.

In [72]:
cols_gmean = ['length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean']
cols_amean = ['length_amean', 'width_amean', 'volume_amean', 'surface_amean']

In [73]:
tmp_gmean = df_pcc.groupby('sptid')[cols_gmean].agg(gmean)
tmp_amean = df_pcc.groupby('sptid')[cols_amean].agg('mean')

In [74]:
dfr_pcc = pd.concat([tmp_gmean, tmp_amean], axis = 1)

Add shape, ranges, and species name.

In [75]:
shapes = dict(df_pcc[['sptid', 'shape']].drop_duplicates('sptid').values)
dfr_pcc['shape'] = dfr_pcc.index.map(shapes)

In [76]:
ranges_length = map_ranges(df_pcc, 'length')
ranges_width = map_ranges(df_pcc, 'width')

In [77]:
dfr_pcc['length_ranges'] = dfr_pcc.index.map(ranges_length)
dfr_pcc['width_ranges'] = dfr_pcc.index.map(ranges_width)

In [78]:
dfr_pcc['species'] = dfr_pcc.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [79]:
dfr_pcc = dfr_pcc.loc[sorted(dfr_pcc.index, key = int)]

In [80]:
dfr_pcc.index.names = ['taxid']

In [81]:
dfr_pcc.shape

(98, 12)

Fill more ranks

In [82]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [83]:
res = {x: {} for x in ranks[1:]}

In [84]:
rankset = set(ranks[1:])

In [85]:
for tid in dfr_pcc.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [86]:
for rank in ranks[1:]:
    dfr_pcc[rank] = dfr_pcc.index.map(res[rank])

In [87]:
# Correct entries with no species
dfr_pcc['species'] = dfr_pcc['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

In [88]:
dfr_pcc.shape

(98, 18)

Statistics

In [89]:
for rank in ranks:
    print(rank, dfr_pcc[rank].nunique())

species 75
genus 39
family 24
order 10
class 1
phylum 1
superkingdom 1


Output

In [90]:
dfr_pcc['source'] = 'pcc'

In [91]:
dfr_pcc.to_csv('preprocess/annot/species_pcc.tsv', sep = '\t')

# PubMed

In [92]:
df_pubmed = pd.read_table('preprocess/data_pubmed.tsv', index_col = 0)
df_pubmed.shape

(54, 17)

Drop useless columns

In [93]:
df_pubmed.columns

Index(['Name', 'Length', 'Width', 'Shape', 'reference', 'length_gmean',
       'width_gmean', 'length_amean', 'width_amean', 'volume_gmean',
       'surface_gmean', 'volume_amean', 'surface_amean', 'length_ranges',
       'width_ranges', 'length_ranges_fix', 'width_ranges_fix'],
      dtype='object')

In [94]:
columns = ['Length', 'Width',
          'length_ranges', 'width_ranges']
df_pubmed = df_pubmed.drop(columns, axis = 1)

Rename columns

In [95]:
df_pubmed.rename(columns = {'Name': 'species', 'Shape': 'shape'}, inplace = True)

### Match species to NCBI taxonomy


In [96]:
spnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'species'}

In [97]:
df_pubmed['species'].astype(str).isin(spnames).value_counts()

species
True     49
False     5
Name: count, dtype: int64

Add species taxid

In [98]:
df_pubmed['sptid'] = df_pubmed['species'].map(spnames)

Check remaining

In [99]:
df_pubmed[df_pubmed['sptid'].isna()]

Unnamed: 0,species,shape,reference,length_gmean,width_gmean,length_amean,width_amean,volume_gmean,surface_gmean,volume_amean,surface_amean,length_ranges_fix,width_ranges_fix,sptid
40,Pelagibacter ubique,rod,"Rappé, M., Connon, S., Vergin, K. et al. Culti...",0.573847,0.154919,0.63,0.16,0.009843368,0.279287,0.01159457,0.316673,"[0.37, 0.89]","[0.12, 0.2]",
44,Prochlorococcus,sphere,"Partensky, F., Hess, W. R., & Vaulot, D. (1999...",0.591608,0.591608,0.6,0.6,0.1084178,1.099557,0.1130973,1.130973,"[0.5, 0.7]","[0.5, 0.7]",
46,Cristispira anodontae,spiral,"Spirochaeta anodontae Keysselitz, Arb. a. d. k...",62.225397,0.979796,66.0,1.0,46.67059,191.537213,51.57448,207.345115,"[44.0, 88.0]","[0.8, 1.2]",
47,Cristispira pinnae,spiral,"Spirochaete pinnae Gonder, Cent. f. Bakt., I A...",24.494897,1.224745,35.0,1.75,28.37641,94.24778,82.78178,192.42255,"[10.0, 60.0]","[0.5, 3.0]",
53,Thiomargarita magnifica,rod,https://doi.org/10.1101/2022.02.16.480423,9720.0,30.0,9720.0,30.0,6863595.0,916088.417787,6863595.0,916088.417787,"[9720.0, 9720.0]","[30.0, 30.0]",


Manually assign taxid

In [100]:
df_pubmed.at[40, 'sptid'] = '198252'
df_pubmed.at[44, 'sptid'] = '1218'
df_pubmed.at[46, 'sptid'] = '44757'
df_pubmed.at[47, 'sptid'] = '44757'
# For Thiomargarita magnifica, assign genus taxID
df_pubmed.at[53, 'sptid'] = '90372'

In [101]:
def get_species_name(tid):
    if dump[tid]['rank'] == 'species':
        return dump[tid]['name']
    elif dump[tid]['rank'] == 'genus':
        return dump[tid]['name']
    return get_species_name(dump[tid]['parent'])

In [102]:
df_pubmed['species'] = df_pubmed['sptid'].astype(str).apply(get_species_name)

Check duplicate species

In [103]:
df_pubmed['sptid'].value_counts().value_counts()

count
1    52
2     1
Name: count, dtype: int64

Group organisms by species taxid and calculate geometric/arithmetic mean per species.

In [104]:
cols_gmean = ['length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean']
cols_amean = ['length_amean', 'width_amean', 'volume_amean', 'surface_amean']

In [105]:
tmp_gmean = df_pubmed.groupby('sptid')[cols_gmean].agg(gmean)
tmp_amean = df_pubmed.groupby('sptid')[cols_amean].agg('mean')

In [106]:
dfr_pubmed = pd.concat([tmp_gmean, tmp_amean], axis = 1)

Add shape, ranges, and species name.

In [107]:
shapes = dict(df_pubmed[['sptid', 'shape']].drop_duplicates('sptid').values)
dfr_pubmed['shape'] = dfr_pubmed.index.map(shapes)

In [108]:
ranges_length = map_ranges(df_pubmed, 'length')
ranges_width = map_ranges(df_pubmed, 'width')

In [109]:
dfr_pubmed['length_ranges'] = dfr_pubmed.index.map(ranges_length)
dfr_pubmed['width_ranges'] = dfr_pubmed.index.map(ranges_width)

In [110]:
dfr_pubmed['species'] = dfr_pubmed.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [111]:
dfr_pubmed = dfr_pubmed.loc[sorted(dfr_pubmed.index, key = int)]

In [112]:
dfr_pubmed.index.names = ['taxid']

In [113]:
dfr_pubmed.shape

(53, 12)

Fill more ranks

In [114]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [115]:
res = {x: {} for x in ranks[1:]}

In [116]:
rankset = set(ranks[1:])

In [117]:
for tid in dfr_pubmed.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [118]:
for rank in ranks[1:]:
    dfr_pubmed[rank] = dfr_pubmed.index.map(res[rank])

In [119]:
# Correct entries with no species
dfr_pubmed['species'] = dfr_pubmed['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

In [120]:
dfr_pubmed.shape

(53, 18)

Statistics

In [121]:
for rank in ranks:
    print(rank, dfr_pubmed[rank].nunique())

species 50
genus 33
family 30
order 22
class 11
phylum 8
superkingdom 2


Save dataframe

In [122]:
dfr_pubmed['source'] = 'pubmed'

In [123]:
dfr_pubmed.to_csv('preprocess/annot/species_pcc.tsv', sep = '\t')

# Bergey's manual volume 1

In [124]:
df_bm1 = pd.read_table('preprocess/data_bm1.tsv', sep = '\t', index_col = 0)
df_bm1.shape

(103, 20)

Drop useless columns

In [125]:
df_bm1.columns

Index(['Name', 'Shape', 'Length', 'Width', 'Diameter', 'Subspecies', 'File #',
       'Notes', 'length_gmean', 'width_gmean', 'length_amean', 'width_amean',
       'volume_gmean', 'surface_gmean', 'volume_amean', 'surface_amean',
       'length_ranges', 'width_ranges', 'length_ranges_fix',
       'width_ranges_fix'],
      dtype='object')

In [126]:
columns = ['Length', 'Width', 'Diameter', 'Subspecies', 'File #', 'Notes', 'length_ranges', 'width_ranges']
df_bm1 = df_bm1.drop(columns, axis = 1)

Rename columns

In [127]:
df_bm1.rename(columns = {'Name': 'species', 'Shape': 'shape'}, inplace = True)

### Match species to NCBI taxonomy

In [128]:
spnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'species'}

In [129]:
df_bm1['species'].astype(str).isin(spnames).value_counts()

species
True     80
False    23
Name: count, dtype: int64

Add species taxid

In [130]:
df_bm1['sptid'] = df_bm1['species'].map(spnames)

Check remaining

In [131]:
tmp = df_bm1[df_bm1['sptid'].isna()]
tmp.shape

(23, 13)

Match subspecies name

In [132]:
sspnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'subspecies'}

In [133]:
tmp['species'].astype(str).isin(sspnames).value_counts()

species
False    23
Name: count, dtype: int64

Manually add taxid according to NCBI

In [134]:
df_bm1.drop([22, 23, 69], inplace = True)

In [135]:
df_bm1.at[14, 'sptid'], df_bm1.at[14, 'species'] = '35616', 'Pyrodictium brockii'
df_bm1.at[50, 'sptid'], df_bm1.at[50, 'species'] = '2190', 'Methanocaldococcus jannaschii'
df_bm1.at[86, 'sptid'], df_bm1.at[86, 'species'] = '2190', 'Methanohalophilus mahii'

In [136]:
df_bm1.at[1, 'sptid'], df_bm1.at[1, 'species'] = '70771', 'Pyrobaculum neutrophilum'
df_bm1.at[18, 'sptid'], df_bm1.at[18, 'species'] = '43687', 'Metallosphaera sedula'
df_bm1.at[27, 'sptid'], df_bm1.at[27, 'species'] = '83565', 'Methanobacterium espanolae'
df_bm1.at[34, 'sptid'], df_bm1.at[34, 'species'] = '49547', 'Methanobrevibacter curvatus'
df_bm1.at[36, 'sptid'], df_bm1.at[36, 'species'] = '2173', 'Methanobrevibacter smithii'
df_bm1.at[43, 'sptid'], df_bm1.at[43, 'species'] = '145261', 'Methanothermobacter wolfeii'
df_bm1.at[46, 'sptid'], df_bm1.at[46, 'species'] = '2187', 'Methanococcus vannielii'
df_bm1.at[58, 'sptid'], df_bm1.at[58, 'species'] = '83986', 'Methanoculleus bourgensis'
df_bm1.at[59, 'sptid'], df_bm1.at[59, 'species'] = '83986', 'Methanoculleus bourgensis'
df_bm1.at[61, 'sptid'], df_bm1.at[61, 'species'] = '2200', 'Methanoculleus thermophilus'
df_bm1.at[66, 'sptid'], df_bm1.at[66, 'species'] = '2200', 'Methanoculleus thermophilus'
df_bm1.at[70, 'sptid'], df_bm1.at[70, 'species'] = '33865', 'Methanoplanus endosymbiosus'
df_bm1.at[71, 'sptid'], df_bm1.at[71, 'species'] = '54120', 'Methanolacinia petrolearia'
df_bm1.at[76, 'sptid'], df_bm1.at[76, 'species'] = '2203', 'Methanospirillum hungatei'
df_bm1.at[95, 'sptid'], df_bm1.at[95, 'species'] = '2223', 'Methanothrix soehngenii'
df_bm1.at[96, 'sptid'], df_bm1.at[96, 'species'] = '2224', 'Methanothrix thermoacetophila'
df_bm1.at[97, 'sptid'], df_bm1.at[97, 'species'] = '2242', 'Halobacterium salinarum'

Check duplicates

In [137]:
df_bm1['sptid'].value_counts().value_counts()

count
1    89
2     4
3     1
Name: count, dtype: int64

Group organisms by species taxid and calculate geometric/arithmetic mean per species.

In [138]:
cols_gmean = ['length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean']
cols_amean = ['length_amean', 'width_amean', 'volume_amean', 'surface_amean']

In [139]:
tmp_gmean = df_bm1.groupby('sptid')[cols_gmean].agg(gmean)
tmp_amean = df_bm1.groupby('sptid')[cols_amean].agg('mean')

In [140]:
dfr_bm1 = pd.concat([tmp_gmean, tmp_amean], axis = 1)

Add shape, ranges, and species name

In [141]:
shapes = dict(df_bm1[['sptid', 'shape']].drop_duplicates('sptid').values)
dfr_bm1['shape'] = dfr_bm1.index.map(shapes)

In [142]:
ranges_length = map_ranges(df_bm1, 'length')
ranges_width = map_ranges(df_bm1, 'width')

In [143]:
dfr_bm1['length_ranges'] = dfr_bm1.index.map(ranges_length)
dfr_bm1['width_ranges'] = dfr_bm1.index.map(ranges_width)

In [144]:
dfr_bm1['species'] = dfr_bm1.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [145]:
dfr_bm1 = dfr_bm1.loc[sorted(dfr_bm1.index, key = int)]

In [146]:
dfr_bm1.index.names = ['taxid']

In [147]:
dfr_bm1.shape

(94, 12)

Fill more ranks

In [148]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [149]:
res = {x: {} for x in ranks[1:]}

In [150]:
rankset = set(ranks[1:])

In [151]:
for tid in dfr_bm1.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [152]:
for rank in ranks[1:]:
    dfr_bm1[rank] = dfr_bm1.index.map(res[rank])

In [153]:
# Correct entries with no species
dfr_bm1['species'] = dfr_bm1['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

In [154]:
dfr_bm1.shape

(94, 18)

Statistics

In [155]:
for rank in ranks:
    print(rank, dfr_bm1[rank].nunique())

species 94
genus 43
family 17
order 9
class 5
phylum 2
superkingdom 1


Save dataframe

In [156]:
dfr_bm1['source'] = 'bm1'

In [157]:
dfr_bm1.to_csv('preprocess/annot/species_bm1.tsv', sep = '\t')

# Bergey's manual volume 4

In [158]:
df_bm4 = pd.read_table('preprocess/data_bm4.tsv', sep = '\t', index_col = 0)
df_bm4.shape

(346, 20)

Drop useless columns

In [159]:
df_bm4.columns

Index(['Name', 'Date Modified', 'Shape', 'Length', 'Width', 'Diameter',
       'Subspecies', 'File #', 'length_gmean', 'width_gmean', 'length_amean',
       'width_amean', 'volume_gmean', 'surface_gmean', 'volume_amean',
       'surface_amean', 'length_ranges', 'width_ranges', 'length_ranges_fix',
       'width_ranges_fix'],
      dtype='object')

In [160]:
columns = ['Length', 'Width', 'Diameter', 'Subspecies', 'File #', 'Date Modified', 'length_ranges', 'width_ranges']
df_bm4 = df_bm4.drop(columns, axis = 1)

Rename columns

In [161]:
df_bm4.rename(columns = {'Name': 'species', 'Shape': 'shape'}, inplace = True)

### Match species to NCBI taxonomy

In [162]:
spnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'species'}

In [163]:
df_bm4['species'].astype(str).isin(spnames).value_counts()

species
True     261
False     85
Name: count, dtype: int64

Add species taxid

In [164]:
df_bm4['sptid'] = df_bm4['species'].map(spnames)

Check remaining

In [165]:
tmp = df_bm4[df_bm4['sptid'].isna()]
tmp.shape

(85, 13)

Match subspecies name

In [166]:
sspnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'subspecies'}

In [167]:
tmp['species'].astype(str).isin(sspnames).value_counts()

species
False    85
Name: count, dtype: int64

Manually add taxid according to NCBI

In [168]:
df_bm4.drop([38, 72, 84, 90, 221, 337, 340], inplace = True)

In [169]:
df_bm4.at[28, 'sptid'], df_bm4.at[28, 'name'] = '215591', 'Gracilinema caldarium'
df_bm4.at[33, 'sptid'], df_bm4.at[33, 'name'] = '152', 'Zuelzera stenostrepta'
df_bm4.at[60, 'sptid'], df_bm4.at[60, 'name'] = '88916', 'Borreliella spielmanii'
df_bm4.at[63, 'sptid'], df_bm4.at[63, 'name'] = '57863', 'Borreliella turdi'
df_bm4.at[113, 'sptid'], df_bm4.at[113, 'name'] = '100053', 'Leptospira alexanderi'
df_bm4.at[117, 'sptid'], df_bm4.at[117, 'name'] = '2110', 'Mycoplasmopsis agalactiae'
df_bm4.at[132, 'sptid'], df_bm4.at[132, 'name'] = '2113', 'Mycoplasmopsis californica'
df_bm4.at[134, 'sptid'], df_bm4.at[134, 'name'] = '29555', 'Mycoplasmopsis canis'
df_bm4.at[139, 'sptid'], df_bm4.at[139, 'name'] = '114880', 'Mycoplasmopsis columbinasalis'
df_bm4.at[140, 'sptid'], df_bm4.at[140, 'name'] = '114881', 'Mycoplasmopsis columbina'
df_bm4.at[141, 'sptid'], df_bm4.at[140, 'name'] = '171282', 'Mycoplasmopsis columboralis'
df_bm4.at[146, 'sptid'], df_bm4.at[146, 'name'] = '171284', 'Mycoplasmopsis cynos'
df_bm4.at[155, 'sptid'], df_bm4.at[155, 'name'] = '114881', 'Mycoplasmopsis felis'
df_bm4.at[158, 'sptid'], df_bm4.at[158, 'name'] = '29556', 'Mycoplasmopsis gallinacea'
df_bm4.at[163, 'sptid'], df_bm4.at[163, 'name'] = '171285', 'Mycoplasmopsis glycophila'
df_bm4.at[178, 'sptid'], df_bm4.at[178, 'name'] = '114885', 'Mycoplasmopsis maculosa'
df_bm4.at[179, 'sptid'], df_bm4.at[179, 'name'] = '29561', 'Mycoplasmopsis meleagridis'
df_bm4.at[197, 'sptid'], df_bm4.at[197, 'name'] = '55604', 'Mycoplasmopsis primatum'
df_bm4.at[199, 'sptid'], df_bm4.at[199, 'name'] = '2107', 'Mycoplasmopsis pulmonis'
df_bm4.at[210, 'sptid'], df_bm4.at[210, 'name'] = '171291', 'Mycoplasmopsis verecunda'
df_bm4.at[223, 'sptid'], df_bm4.at[223, 'name'] = '219290', 'Mycoplasma vulturii'

In [170]:
df_bm4.at[7, 'sptid'], df_bm4.at[7, 'name'] = '860', 'Fusobacterium periodonticum'
df_bm4.at[18, 'sptid'], df_bm4.at[18, 'name'] = '157692', 'Pseudoleptotrichia goodfellowii'
df_bm4.at[23, 'sptid'], df_bm4.at[23, 'name'] = '46356', 'Alkalispirochaeta alkalica'
df_bm4.at[24, 'sptid'], df_bm4.at[24, 'name'] = '159291', 'Alkalispirochaeta americana'
df_bm4.at[27, 'sptid'], df_bm4.at[27, 'name'] = '148', 'Sediminispirochaeta bajacaliforniensis'
df_bm4.at[31, 'sptid'], df_bm4.at[31, 'name'] = '151', 'Oceanispirochaeta litoralis'
df_bm4.at[32, 'sptid'], df_bm4.at[32, 'name'] = '55206', 'Sediminispirochaeta smaragdinae'
df_bm4.at[35, 'sptid'], df_bm4.at[35, 'name'] = '156', 'Treponema zuelzerae'
df_bm4.at[37, 'sptid'], df_bm4.at[37, 'name'] = '29518', 'Borreliella afzelii'
df_bm4.at[40, 'sptid'], df_bm4.at[40, 'name'] = '139', 'Borreliella burgdorferi'
df_bm4.at[46, 'sptid'], df_bm4.at[46, 'name'] = '29519', 'Borreliella garinii'
df_bm4.at[51, 'sptid'], df_bm4.at[51, 'name'] = '34095', 'Borreliella japonica'
df_bm4.at[53, 'sptid'], df_bm4.at[53, 'name'] = '100177', 'Borreliella lusitaniae'
df_bm4.at[59, 'sptid'], df_bm4.at[59, 'name'] = '87162', 'Borreliella sinica'
df_bm4.at[61, 'sptid'], df_bm4.at[61, 'name'] = '56146', 'Borreliella tanukii'
df_bm4.at[65, 'sptid'], df_bm4.at[65, 'name'] = '62088', 'Borreliella valaisiana'
df_bm4.at[88, 'sptid'], df_bm4.at[88, 'name'] = '392334', 'Candidatus Treponema suis'
df_bm4.at[89, 'sptid'], df_bm4.at[89, 'name'] = '158', 'Treponema denticola'
df_bm4.at[95, 'sptid'], df_bm4.at[95, 'name'] = '84377', 'Brachyspira intermedia'
df_bm4.at[116, 'sptid'], df_bm4.at[116, 'name'] = '51362', 'Mycoplasmopsis adleri'
df_bm4.at[118, 'sptid'], df_bm4.at[118, 'name'] = '33922', 'Mycoplasmopsis agassizii'
df_bm4.at[120, 'sptid'], df_bm4.at[120, 'name'] = '47687', 'Mycoplasmopsis alligatoris'
df_bm4.at[123, 'sptid'], df_bm4.at[123, 'name'] = '171279', 'Mycoplasmopsis anatis'
df_bm4.at[125, 'sptid'], df_bm4.at[125, 'name'] = '2094', 'Mycoplasmopsis arginini'
df_bm4.at[128, 'sptid'], df_bm4.at[128, 'name'] = '28903', 'Mycoplasmopsis bovis'
df_bm4.at[144, 'sptid'], df_bm4.at[144, 'name'] = '171283', 'Mycoplasmopsis cricetuli'
df_bm4.at[148, 'sptid'], df_bm4.at[148, 'name'] = '53558', 'Mycoplasmopsis edwardii'
df_bm4.at[154, 'sptid'], df_bm4.at[154, 'name'] = '35768', 'Mycoplasmopsis felifaucium'
df_bm4.at[156, 'sptid'], df_bm4.at[156, 'name'] = '2115', 'Mycoplasmopsis fermentans'
df_bm4.at[159, 'sptid'], df_bm4.at[159, 'name'] = '29557', 'Mycoplasmopsis gallinarum'
df_bm4.at[161, 'sptid'], df_bm4.at[161, 'name'] = '76629', 'Mycoplasmopsis gallopavonis'
df_bm4.at[177, 'sptid'], df_bm4.at[177, 'name'] = '114884', 'Mycoplasmopsis lipofaciens'
df_bm4.at[184, 'sptid'], df_bm4.at[184, 'name'] = '458208', 'Mycoplasmopsis mucosicanis'
df_bm4.at[186, 'sptid'], df_bm4.at[186, 'name'] = '171289', 'Mycoplasmopsis mustelae'
df_bm4.at[194, 'sptid'], df_bm4.at[194, 'name'] = '142650', 'Mycoplasmopsis phocirhinis'
df_bm4.at[198, 'sptid'], df_bm4.at[198, 'name'] = '48003', 'Mycoplasmopsis pullorum'
df_bm4.at[204, 'sptid'], df_bm4.at[204, 'name'] = '39047', 'Mycoplasmopsis sturni'
df_bm4.at[208, 'sptid'], df_bm4.at[208, 'name'] = '2109', 'Mycoplasmopsis synoviae'
df_bm4.at[214, 'sptid'], df_bm4.at[214, 'name'] = '247278', 'Candidatus Mycoplasma haematoparvum'
df_bm4.at[215, 'sptid'], df_bm4.at[215, 'name'] = '432608', 'Candidatus Mycoplasma haematobovis'
df_bm4.at[216, 'sptid'], df_bm4.at[216, 'name'] = '112247', 'Candidatus Mycoplasma haemodidelphidis'
df_bm4.at[217, 'sptid'], df_bm4.at[217, 'name'] = '141391', 'Candidatus Mycoplasma haemolamae'
df_bm4.at[218, 'sptid'], df_bm4.at[218, 'name'] = '209446', 'Candidatus Mycoplasma haemominutum'
df_bm4.at[219, 'sptid'], df_bm4.at[219, 'name'] = '60452', 'Candidatus Mycoplasma ravipulmonis'
df_bm4.at[220, 'sptid'], df_bm4.at[220, 'name'] = '346879', 'Candidatus Mycoplasma turicensis'
df_bm4.at[280, 'sptid'], df_bm4.at[280, 'name'] = '2149', 'Mesoplasma entomophilum'
df_bm4.at[282, 'sptid'], df_bm4.at[282, 'name'] = '2151', 'Mesoplasma florum'
df_bm4.at[292, 'sptid'], df_bm4.at[292, 'name'] = '28224', 'Mesoplasma seiffertii'
df_bm4.at[301, 'sptid'], df_bm4.at[301, 'name'] = '44676', 'Geothrix fermentans'
df_bm4.at[328, 'sptid'], df_bm4.at[328, 'name'] = '362787', 'Candidatus Protochlamydia amoebophila'
df_bm4.at[329, 'sptid'], df_bm4.at[329, 'name'] = '324707', 'Candidatus Rhabdochlamydia crassificans'
df_bm4.at[330, 'sptid'], df_bm4.at[330, 'name'] = '225148', 'Candidatus Rhabdochlamydia porcellionis'
df_bm4.at[332, 'sptid'], df_bm4.at[332, 'name'] = '206681', 'Candidatus Fritschea bemisiae'
df_bm4.at[333, 'sptid'], df_bm4.at[333, 'name'] = '206690', 'Candidatus Fritschea eriococci'
df_bm4.at[336, 'sptid'], df_bm4.at[336, 'name'] = '119', 'Rubinisphaera brasiliensis'
df_bm4.at[338, 'sptid'], df_bm4.at[338, 'name'] = '120', 'Planctopirus limnophila'
df_bm4.at[339, 'sptid'], df_bm4.at[339, 'name'] = '122', 'Gimesia maris'

Check duplicates

In [171]:
df_bm4['sptid'].value_counts().value_counts()

count
1    327
2      6
Name: count, dtype: int64

Group organisms by species taxid and calculate geometric/arithmetic mean per species.

In [172]:
cols_gmean = ['length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean']
cols_amean = ['length_amean', 'width_amean', 'volume_amean', 'surface_amean']

In [173]:
tmp_gmean = df_bm4.groupby('sptid')[cols_gmean].agg(gmean)
tmp_amean = df_bm4.groupby('sptid')[cols_amean].agg('mean')

In [174]:
dfr_bm4 = pd.concat([tmp_gmean, tmp_amean], axis = 1)

Add shape, ranges, and species name

In [175]:
shapes = dict(df_bm4[['sptid', 'shape']].drop_duplicates('sptid').values)
dfr_bm4['shape'] = dfr_bm4.index.map(shapes)

In [176]:
ranges_length = map_ranges(df_bm4, 'length')
ranges_width = map_ranges(df_bm4, 'width')

In [177]:
dfr_bm4['length_ranges'] = dfr_bm4.index.map(ranges_length)
dfr_bm4['width_ranges'] = dfr_bm4.index.map(ranges_width)

In [178]:
dfr_bm4['species'] = dfr_bm4.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [179]:
dfr_bm4 = dfr_bm4.loc[sorted(dfr_bm4.index, key = int)]

In [180]:
dfr_bm4.index.names = ['taxid']

In [181]:
dfr_bm4.shape

(333, 12)

Fill more ranks

In [182]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [183]:
res = {x: {} for x in ranks[1:]}

In [184]:
rankset = set(ranks[1:])

In [185]:
for tid in dfr_bm4.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [186]:
for rank in ranks[1:]:
    dfr_bm4[rank] = dfr_bm4.index.map(res[rank])

In [187]:
# Correct entries with no species
dfr_bm4['species'] = dfr_bm4['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

Statistics

In [188]:
for rank in ranks:
    print(rank, dfr_bm4[rank].nunique())

species 333
genus 59
family 32
order 23
class 14
phylum 11
superkingdom 1


Save dataframe

In [189]:
dfr_bm4['source'] = 'bm4'

In [190]:
dfr_bm4.to_csv('preprocess/annot/species_bm4.tsv', sep = '\t')

# Vergey's manual volume 5

In [191]:
df_bm5 = pd.read_table('preprocess/data_bm5.tsv', sep = '\t', index_col = 0)
df_bm5.shape

(290, 19)

Drop useless columns

In [192]:
df_bm5.columns

Index(['rank', 'Name', 'shape', 'Length', 'Width', 'diameter', 'Unnamed: 6',
       'length_gmean', 'width_gmean', 'length_amean', 'width_amean',
       'volume_gmean', 'surface_gmean', 'volume_amean', 'surface_amean',
       'length_ranges', 'width_ranges', 'length_ranges_fix',
       'width_ranges_fix'],
      dtype='object')

In [193]:
columns = ['rank', 'Length', 'Width', 'diameter', 'Unnamed: 6', 'length_ranges', 'width_ranges']
df_bm5 = df_bm5.drop(columns, axis = 1)

Rename columns

In [194]:
df_bm5.rename(columns = {'Name': 'species'}, inplace = True)

### Match species to NCBI taxonomy

In [195]:
spnames = {v['name']: k for k, v in dump.items() if v['rank'] == 'species'}

In [196]:
df_bm5['species'].astype(str).isin(spnames).value_counts()

species
True     211
False     79
Name: count, dtype: int64

Add species taxid

In [197]:
df_bm5['sptid'] = df_bm5['species'].map(spnames)

Check the remaining

In [198]:
tmp = df_bm5[df_bm5['sptid'].isna()]
tmp.shape

(79, 13)

Manually add taxid according to NCBI

In [199]:
df_bm5.at[12, 'sptid'], df_bm5.at[12, 'name'] = '78448', 'Bifidobacterium pullorum'
df_bm5.at[66, 'sptid'], df_bm5.at[66, 'name'] = '1774', 'Mycobacteroides chelonae'
df_bm5.at[90, 'sptid'], df_bm5.at[90, 'name'] = '47312', 'Tsukamurella pulmonis'
df_bm5.at[236, 'sptid'], df_bm5.at[236, 'name'] = '355930', 'Yonghaparkia alkaliphila'
df_bm5.at[247, 'sptid'], df_bm5.at[247, 'name'] = '1750', 'Arachnia propionica'

In [200]:
df_bm5.at[1, 'sptid'], df_bm5.at[1, 'name'] = '52768', 'Schaalia georgiae'
df_bm5.at[3, 'sptid'], df_bm5.at[3, 'name'] = '52773', 'Schaalia meyeri'
df_bm5.at[4, 'sptid'], df_bm5.at[4, 'name'] = '131110', 'Schaalia radingae'
df_bm5.at[7, 'sptid'], df_bm5.at[7, 'name'] = '131111', 'Schaalia turicensis'
df_bm5.at[9, 'sptid'], df_bm5.at[9, 'name'] = '1661', 'Trueperella pyogenes'
df_bm5.at[19, 'sptid'], df_bm5.at[19, 'name'] = '1717', 'Corynebacterium diphtheriae'
df_bm5.at[36, 'sptid'], df_bm5.at[36, 'name'] = '1764', 'Mycobacterium avium'
df_bm5.at[38, 'sptid'], df_bm5.at[38, 'name'] = '265949', 'Mycobacterium pseudoshottsii'
df_bm5.at[43, 'sptid'], df_bm5.at[43, 'name'] = '28045', 'Mycobacterium celatum'
df_bm5.at[44, 'sptid'], df_bm5.at[44, 'name'] = '29314', 'Mycolicibacter hiberniae'
df_bm5.at[49, 'sptid'], df_bm5.at[49, 'name'] = '1793', 'Mycolicibacterium fallax'
df_bm5.at[50, 'sptid'], df_bm5.at[50, 'name'] = '182220', 'Mycolicibacterium murale'
df_bm5.at[51, 'sptid'], df_bm5.at[51, 'name'] = '39687', 'Mycolicibacterium austroafricanum'
df_bm5.at[52, 'sptid'], df_bm5.at[52, 'name'] = '110539', 'Mycolicibacterium vanbaalenii'
df_bm5.at[53, 'sptid'], df_bm5.at[53, 'name'] = '1810', 'Mycolicibacterium vaccae'
df_bm5.at[54, 'sptid'], df_bm5.at[54, 'name'] = '85968', 'Mycolicibacterium brumae'
df_bm5.at[55, 'sptid'], df_bm5.at[55, 'name'] = '28047', 'Mycolicibacterium confluentis'
df_bm5.at[56, 'sptid'], df_bm5.at[56, 'name'] = '81858', 'Mycolicibacterium elephantis'
df_bm5.at[57, 'sptid'], df_bm5.at[57, 'name'] = '36813', 'Mycolicibacterium pulveris'
df_bm5.at[58, 'sptid'], df_bm5.at[58, 'name'] = '43304', 'Mycolicibacterium peregrinum'
df_bm5.at[59, 'sptid'], df_bm5.at[59, 'name'] = '39694', 'Mycolicibacterium poriferae'
df_bm5.at[60, 'sptid'], df_bm5.at[60, 'name'] = '1766', 'Mycolicibacterium fortuitum'
df_bm5.at[61, 'sptid'], df_bm5.at[61, 'name'] = '67081', 'Mycolicibacterium alvei'
df_bm5.at[63, 'sptid'], df_bm5.at[63, 'name'] = '258505', 'Mycolicibacterium fluoranthenivorans'
df_bm5.at[65, 'sptid'], df_bm5.at[65, 'name'] = '258533', 'Mycolicibacterium cosmeticum'
df_bm5.at[67, 'sptid'], df_bm5.at[67, 'name'] = '36809', 'Mycobacteroides abscessus'
df_bm5.at[68, 'sptid'], df_bm5.at[68, 'name'] = '404941', 'Mycobacteroides salmoniphilum'
df_bm5.at[69, 'sptid'], df_bm5.at[69, 'name'] = '212765', 'Mycolicibacterium madagascariense'
df_bm5.at[70, 'sptid'], df_bm5.at[70, 'name'] = '39691', 'Mycolicibacterium moriokaense'
df_bm5.at[74, 'sptid'], df_bm5.at[74, 'name'] = '228599', 'Nocardia higoensis'
df_bm5.at[75, 'sptid'], df_bm5.at[75, 'name'] = '228601', 'Nocardia pneumoniae'
df_bm5.at[76, 'sptid'], df_bm5.at[76, 'name'] = '228596', 'Nocardia shimofusensis'
df_bm5.at[77, 'sptid'], df_bm5.at[77, 'name'] = '455432', 'Nocardia terpenica'
df_bm5.at[78, 'sptid'], df_bm5.at[78, 'name'] = '257275', 'Nocardia thailandica'
df_bm5.at[88, 'sptid'], df_bm5.at[88, 'name'] = '286804', 'Segniliparus rugosus'
df_bm5.at[96, 'sptid'], df_bm5.at[96, 'name'] = '363630', 'Nakamurella flavida'
df_bm5.at[103, 'sptid'], df_bm5.at[103, 'name'] = '494023', 'Paeniglutamicibacter antarcticus'
df_bm5.at[105, 'sptid'], df_bm5.at[105, 'name'] = '85085', 'Pseudarthrobacter chlorophenolicus'
df_bm5.at[106, 'sptid'], df_bm5.at[106, 'name'] = '162496', 'Glutamicibacter creatinolyticus'
df_bm5.at[107, 'sptid'], df_bm5.at[107, 'name'] = '410837', 'Pseudarthrobacter defluvii'
df_bm5.at[110, 'sptid'], df_bm5.at[110, 'name'] = '211146', 'Paenarthrobacter nitroguajacolicus'
df_bm5.at[113, 'sptid'], df_bm5.at[113, 'name'] = '37930', 'Glutamicibacter protophormiae'
df_bm5.at[116, 'sptid'], df_bm5.at[116, 'name'] = '453836', 'Glutamicibacter soli'
df_bm5.at[119, 'sptid'], df_bm5.at[119, 'name'] = '121292', 'Pseudarthrobacter sulfonivorans'
df_bm5.at[120, 'sptid'], df_bm5.at[120, 'name'] = '43666', 'Paeniglutamicibacter sulfureus'
df_bm5.at[123, 'sptid'], df_bm5.at[123, 'name'] = '43667', 'Glutamicibacter uratoxydans'
df_bm5.at[127, 'sptid'], df_bm5.at[127, 'name'] = '37923', 'Rothia kristinae'
df_bm5.at[135, 'sptid'], df_bm5.at[135, 'name'] = '2047', 'Rothia dentocariosa'
df_bm5.at[141, 'sptid'], df_bm5.at[141, 'name'] = '417948', 'Brevibacterium album'
df_bm5.at[142, 'sptid'], df_bm5.at[142, 'name'] = '479117', 'Brevibacterium ravenspurgense'
df_bm5.at[170, 'sptid'], df_bm5.at[170, 'name'] = '412687', 'Intrasporangium oryzae'
df_bm5.at[177, 'sptid'], df_bm5.at[177, 'name'] = '82346', 'Ornithinicoccus hortensis'
df_bm5.at[178, 'sptid'], df_bm5.at[178, 'name'] = '436356', 'Pedococcus aerophilus'
df_bm5.at[179, 'sptid'], df_bm5.at[179, 'name'] = '587636', 'Pedococcus cremeus'
df_bm5.at[180, 'sptid'], df_bm5.at[180, 'name'] = '443156', 'Pedococcus dokdonensis'
df_bm5.at[212, 'sptid'], df_bm5.at[212, 'name'] = '88374', 'Agromyces rhizosphaerae'
df_bm5.at[217, 'sptid'], df_bm5.at[217, 'name'] = '110933', 'Leifsonia poae'
df_bm5.at[218, 'sptid'], df_bm5.at[218, 'name'] = '150026', 'Leifsonia shinshuensis'
df_bm5.at[219, 'sptid'], df_bm5.at[219, 'name'] = '1575', 'Leifsonia xyli'
df_bm5.at[220, 'sptid'], df_bm5.at[220, 'name'] = '1575', 'Leifsonia xyli'
df_bm5.at[221, 'sptid'], df_bm5.at[221, 'name'] = '381665', 'Herbiconiux ginsengi'
df_bm5.at[222, 'sptid'], df_bm5.at[222, 'name'] = '433641', 'Pseudolysinimonas kribbensis'
df_bm5.at[225, 'sptid'], df_bm5.at[225, 'name'] = '501483', 'Leucobacter tardus'
df_bm5.at[233, 'sptid'], df_bm5.at[233, 'name'] = '386302', 'Salinibacterium xinjiangense'
df_bm5.at[242, 'sptid'], df_bm5.at[242, 'name'] = '228973', 'Xylanibacterium ulmi'
df_bm5.at[246, 'sptid'], df_bm5.at[246, 'name'] = '648782', 'Ruania alba'
df_bm5.at[248, 'sptid'], df_bm5.at[248, 'name'] = '53388', 'Microlunatus antarcticus'
df_bm5.at[249, 'sptid'], df_bm5.at[249, 'name'] = '99117', 'Microlunatus capsulatus'
df_bm5.at[250, 'sptid'], df_bm5.at[250, 'name'] = '88568', 'Microlunatus lacustris'
df_bm5.at[260, 'sptid'], df_bm5.at[260, 'name'] = '374513', 'Nocardioides salarius'
df_bm5.at[264, 'sptid'], df_bm5.at[264, 'name'] = '363868', 'Nocardioides ginsengisoli'
df_bm5.at[271, 'sptid'], df_bm5.at[271, 'name'] = '2045', 'Pimelobacter simplex'
df_bm5.at[272, 'sptid'], df_bm5.at[272, 'name'] = '433654', 'Nocardioides tritolerans'
df_bm5.at[283, 'sptid'], df_bm5.at[283, 'name'] = '1382', 'Lancefieldella parvula'

Check duplicate species

In [201]:
df_bm5['sptid'].value_counts().value_counts()

count
1    288
2      1
Name: count, dtype: int64

Group organisms by species taxid and calculate geometric/arithmetic mean per species.

In [202]:
cols_gmean = ['length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean']
cols_amean = ['length_amean', 'width_amean', 'volume_amean', 'surface_amean']

In [203]:
tmp_gmean = df_bm5.groupby('sptid')[cols_gmean].agg(gmean)
tmp_amean = df_bm5.groupby('sptid')[cols_amean].agg('mean')

In [204]:
dfr_bm5 = pd.concat([tmp_gmean, tmp_amean], axis = 1)

Add shape, ranges, and species name

In [205]:
shapes = dict(df_bm5[['sptid', 'shape']].drop_duplicates('sptid').values)
dfr_bm5['shape'] = dfr_bm5.index.map(shapes)

In [206]:
ranges_length = map_ranges(df_bm5, 'length')
ranges_width = map_ranges(df_bm5, 'width')

In [207]:
dfr_bm5['length_ranges'] = dfr_bm5.index.map(ranges_length)
dfr_bm5['width_ranges'] = dfr_bm5.index.map(ranges_width)

In [208]:
dfr_bm5['species'] = dfr_bm5.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [209]:
dfr_bm5 = dfr_bm5.loc[sorted(dfr_bm5.index, key = int)]

In [210]:
dfr_bm5.index.names = ['taxid']

In [211]:
dfr_bm5.shape

(289, 12)

Fill more ranks

In [212]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [213]:
res = {x: {} for x in ranks[1:]}

In [214]:
rankset = set(ranks[1:])

In [215]:
for tid in dfr_bm5.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [216]:
for rank in ranks[1:]:
    dfr_bm5[rank] = dfr_bm5.index.map(res[rank])

In [217]:
# Correct entries with no species
dfr_bm5['species'] = dfr_bm5['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

Statistics

In [218]:
for rank in ranks:
    print(rank, dfr_bm5[rank].nunique())

species 289
genus 98
family 38
order 16
class 5
phylum 1
superkingdom 1


Save dataframe

In [219]:
dfr_bm5['source'] = 'bm5'

In [220]:
dfr_bm5.to_csv('preprocess/annot/species_bm5.tsv', sep = '\t')

# Concat data

In [221]:
df = pd.concat((dfr_bacd, dfr_pcc, dfr_pubmed, dfr_bm1, dfr_bm4, dfr_bm5), axis = 0)

In [222]:
df.shape

(5742, 19)

Check for duplicated data

**Why this data was not detected during pre-processing?**

Because during the preprocessing step I only checkd for names. After asigning a species name according to NCBI taxonomy, the name may have changed. Therefore two names that were different befor taxonomic assignation are now the same and they share the same taxID.

In [223]:
df.index[df.index.duplicated()].shape

(40,)

Group organisms by species taxid and calculate geometric/arithmetic mean per species.

In [224]:
cols_gmean = ['length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean']
cols_amean = ['length_amean', 'width_amean', 'volume_amean', 'surface_amean']

In [225]:
tmp_gmean = df.groupby('taxid')[cols_gmean].agg(gmean)
tmp_amean = df.groupby('taxid')[cols_amean].agg('mean')

In [226]:
dfr = pd.concat([tmp_gmean, tmp_amean], axis = 1)

Add shape, ranges, and species name

In [227]:
tmp = df[['shape']].reset_index()

In [228]:
shapes = dict(tmp.drop_duplicates('taxid', keep = 'last').values)

In [229]:
dfr['shape'] = dfr.index.map(shapes)

In [230]:
df[df.index.duplicated()].index

Index(['2242', '148', '160', '173', '2096', '2097', '150829', '157692',
       '215591', '458208', '1653855', '1661', '1764', '1773', '47312', '52768',
       '52773', '211146', '228596', '228599', '228601', '228973', '258533',
       '265949', '286804', '363630', '363868', '374513', '381665', '386302',
       '410837', '412687', '417948', '433641', '436356', '455432', '479117',
       '494023', '501483', '587636'],
      dtype='object', name='taxid')

In [231]:
def map_ranges(df, metric):
    ranges = {}
    for tid, row in df.iterrows():
        # tid = row['sptid']
        rng = row[f'{metric}_ranges']
        # print(tid, row[f'{metric}_ranges'], rng)
        # rng = [rng[0], rng[1]]
        if tid not in ranges.keys():
            ranges[tid] = rng
        else:
            for r in rng:
                ranges[tid].append(r)
    return ranges

In [232]:
ranges_length = map_ranges(df, 'length')
ranges_width = map_ranges(df, 'width')

In [233]:
dfr['length_ranges'] = dfr.index.map(ranges_length)
dfr['width_ranges'] = dfr.index.map(ranges_width)

Add source

In [234]:
def map_sources(df):
    sources = {}
    for tid, row in df.iterrows():
        source = row[f'source']
        if tid not in sources.keys():
            sources[tid] = [source]
        else:
            sources[tid].append(source)
    return sources

In [235]:
sources = map_sources(df)

In [236]:
dfr['sources'] = dfr.index.map(sources)

In [237]:
dfr['species'] = dfr.index.to_series().apply(lambda x: dump[str(x)]['name'])

Organize

In [238]:
dfr = dfr.loc[sorted(dfr.index, key = int)]

In [239]:
ranks = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

In [240]:
res = {x: {} for x in ranks[1:]}
rankset = set(ranks[1:])

In [241]:
for tid in dfr.index.astype(str):
    this = tid
    while True:
        rank = dump[this]['rank']
        if rank in rankset:
            res[rank][tid] = dump[this]['name']
        parent = dump[this]['parent']
        if this == parent:
            break
        this = parent

In [242]:
for rank in ranks[1:]:
    dfr[rank] = dfr.index.map(res[rank])

In [243]:
# Correct entries with no species
dfr['species'] = dfr['species'].astype(str).apply(lambda x: x if len(x.split(' ')) > 1 else np.NaN)

In [244]:
dfr.shape

(5702, 19)

Statistics

In [245]:
for rank in ranks:
    print(rank, dfr[rank].nunique())

species 5676
genus 1818
family 440
order 187
class 74
phylum 35
superkingdom 2


Save dataframe

In [246]:
dfr.to_csv('preprocess/annot/species_all.tsv', sep = '\t')