Translate the microbiome-metabolism correlation from mice to human. 
MICE - OSA
HUMAN - iHMP

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
# Load the precomputed metabolite-bacteria correlation results for each host.
# Per host, three pickles are read; going by the file names they hold the significant
# correlations, the Kendall-tau coefficients, and the FDR-corrected p-values.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading — only use trusted files.
mice_significant_corr = pd.read_pickle('mice/haddad_osa/analysis/metabolome_bacteria_significant_corr_kendall-tau.pkl')
mice_corr_coefficient= pd.read_pickle('mice/haddad_osa/analysis/metabolome_bacteria_corr_coefficient_kendall-tau.pkl')
mice_p_values_fdr_correction = pd.read_pickle('mice/haddad_osa/analysis/metabolome_bacteria_p_values_fdr_correction_kendall-tau.pkl')

human_significant_corr = pd.read_pickle('human/iHMP_IBDMDB_2019/metabolome_bacteria_significant_corr_kendall-tau.pkl')
human_corr_coefficient = pd.read_pickle('human/iHMP_IBDMDB_2019/metabolome_bacteria_corr_coefficient_kendall-tau.pkl')
human_p_values_fdr_correction = pd.read_pickle('human/iHMP_IBDMDB_2019/metabolome_bacteria_p_values_fdr_correction_kendall-tau.pkl')

In [3]:
# Shorten each human taxon label (index level 1) to the text that follows 'g__'.

# Build the {full label -> genus} lookup once, then apply it to all three human tables.
human_taxa_labels = human_corr_coefficient.index.get_level_values(1).unique()
taxa_rename_to_genus = (
    pd.Series(human_taxa_labels, index=human_taxa_labels)
    .str.extract('.*g__(.*)')
    .squeeze()
)
genus_by_taxon_label = taxa_rename_to_genus.to_dict()
# NOTE(review): labels without 'g__' do not match the pattern and map to NaN, so this cell
# looks unsafe to re-run on already-renamed tables — confirm, and run once per kernel.
human_significant_corr = human_significant_corr.rename(genus_by_taxon_label, axis=0, level=1)
human_corr_coefficient = human_corr_coefficient.rename(genus_by_taxon_label, axis=0, level=1)
human_p_values_fdr_correction = human_p_values_fdr_correction.rename(genus_by_taxon_label, axis=0, level=1)


Analysis 

How many genera are shared (mice - human)? 
(Note: when calculating the discrete matrix, I should also present the baseline, i.e. how many genera are shared without any translation.)

In [4]:
# Distinct labels of index level 1 (the taxon/genus level) for each host's coefficient table.
GENUS_LEVEL = 1
human_genus, mice_genus = (
    corr_table.index.get_level_values(GENUS_LEVEL).unique()
    for corr_table in (human_corr_coefficient, mice_corr_coefficient)
)

In [5]:
# Report the genus overlap between the two hosts, as a count and as a share of the mouse genera.
shared_genus = mice_genus.intersection(human_genus)
shared_pct_of_mice = round(len(shared_genus) / len(mice_genus) * 100, 2)
print(
    f"There are {len(shared_genus)} genus shared in the mice and human data. \n"
    f"Out of {len(mice_genus)} genus in mice and {len(human_genus)} genus in human. \n "
    f"AKA {shared_pct_of_mice} % of the genus in mice are shared with humans in our ds."
)

There are 16 genus shared in the mice and human data. 
Out of 39 genus in mice and 107 genus in human. 
 AKA 41.03 % of the genus in mice are shared with humans in our ds.


Shared metabolites:

In [6]:
def _metabolite_labels(corr_table):
    """Return the distinct labels of index level 0 of a correlation table."""
    return corr_table.index.get_level_values(0).unique()


# Metabolites present in both hosts (intersection of the level-0 labels).
mice_metabolites = _metabolite_labels(mice_corr_coefficient)
human_metabolites = _metabolite_labels(human_corr_coefficient)
shared_metabolites = human_metabolites.intersection(mice_metabolites)


In [7]:
shared_metabolites.shape

(25,)

Taxonomic distance mapping based on MGBC closest taxa table 

In [8]:
from translator import MGBC_Translator, Direction, DatabaseClosestTaxa, AggregationFunction

# Mouse-to-human translator: taxonomy-based closest-taxa table, majority aggregation, short names.
m2h_translator_options = dict(
    direction=Direction.M2H,
    use_database=DatabaseClosestTaxa.taxonomy,
    aggregation_function=AggregationFunction.majority,
    short_taxonomic_naming=True,
)
m2h_translator = MGBC_Translator(**m2h_translator_options)
translation_map = m2h_translator.translation_map()

How many genus mapped to the same genus?

In [17]:
# Count the entries of the translation map whose translated value equals their own index
# label, and report that count as a share of all entries. The statement used to appear
# twice in this cell; it is kept once, matching the single recorded output line.
n_self_mapped = np.count_nonzero(translation_map.index == translation_map)
n_translated = translation_map.shape[0]
print(f"Generally: {n_self_mapped}, out of {n_translated}. {n_self_mapped / n_translated * 100:.2f}%")

Generally: 172, out of 225. 76.44%


In [19]:
translation_map.index.intersection(mice_genus) 

Index(['NM07-P-09', 'CAG-485', 'Duncaniella', 'Erysipelatoclostridium',
       'Faecalibaculum', 'Turicibacter', 'Lactococcus', 'Clostridium',
       'Anaerotignum', '14-2', '1XD42-69', 'Acetatifactor', 'CAG-56', 'COE1',
       'Clostridium_Q', 'Eubacterium_J', 'Kineothrix', 'Roseburia',
       'Schaedlerella', 'Acutalibacter', 'Eubacterium_R', 'Intestinimonas',
       'Lawsonibacter', 'Marseille-P3106', 'Anaerotruncus', 'Angelakisella',
       'Emergencia', 'Romboutsia'],
      dtype='object')

Are some of the genera found in mice missing from this MGBC translation toolkit? 

In [26]:
# TODO: Verify! Are some of the genera found in mice in my dataset missing from this MGBC translation toolkit?

mice_genus


Index(['14-2', '1XD42-69', 'Acetatifactor', 'Acutalibacter', 'Anaerotignum',
       'Anaerotruncus', 'Angelakisella', 'CAG-317', 'CAG-485', 'CAG-56',
       'COE1', 'Caccenecus', 'Choladocola', 'Clostridium', 'Clostridium_Q',
       'Copromonas', 'Duncaniella', 'Dysosmobacter', 'Emergencia',
       'Enterenecus', 'Erysipelatoclostridium', 'Eubacterium_J',
       'Eubacterium_R', 'Faecalibaculum', 'Intestinimonas', 'Kineothrix',
       'Lachnoclostridium_B', 'Lactococcus', 'Lawsonibacter',
       'Marseille-P3106', 'Merdisoma', 'NM07-P-09', 'Pelethomonas',
       'Romboutsia', 'Roseburia', 'Schaedlerella', 'Sporofaciens',
       'Turicibacter', 'UBA7109'],
      dtype='object')

In [9]:
# Load the MGBC closest-taxa table (tab-separated, no header row) with explicit column names.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data dir.
closest_taxa_columns = [
    'method',
    'reference_genome',
    'query_genome',
    'distance',
    'reference_taxonomy',
    'query_taxonomy',
]
mgbc_closest_taxa_table = pd.read_csv(
    '/home/noa/lab_code/MGBC-Toolkit/data/closest_tax.tsv',
    sep='\t',
    header=None,
    names=closest_taxa_columns,
)


In [28]:
mgbc_closest_taxa_table

Unnamed: 0,method,reference_genome,query_genome,distance,reference_taxonomy,query_taxonomy
0,taxonomy,GUT_GENOME000010,MGBC000100,0.020884,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__O...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__O...
1,taxonomy,GUT_GENOME000035,MGBC000465,0.009662,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...
2,taxonomy,GUT_GENOME000049,MGBC109121,0.200162,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
3,taxonomy,GUT_GENOME000057,MGBC115383,0.008205,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...
4,taxonomy,GUT_GENOME000064,MGBC161554,0.625505,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
...,...,...,...,...,...,...
61495,Reactome.ips,MGBC166864,GUT_GENOME188058,0.116968,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__C...
61496,Reactome.ips,MGBC167004,GUT_GENOME012681,0.134689,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
61497,Reactome.ips,MGBC167061,GUT_GENOME205920,0.058195,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
61498,Reactome.ips,MGBC167064,GUT_GENOME258050,0.129410,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...


In [10]:
# Load the MGBC tax_rep_index file (tab-separated, no header row), naming its columns explicitly.
# NOTE(review): the column names are chosen here, not read from the file — confirm their
# meaning against the MGBC-Toolkit documentation.

mgbc_tax_rep_index = pd.read_csv('/home/noa/lab_code/MGBC-Toolkit/data/tax_rep_index.tsv', sep='\t', header=None,
                                 names=['pid','name', 'taxonomy level', 'taxonomy_label', 'source_pid', 'host', 'number'])

In [41]:
mgbc_tax_rep_index[mgbc_tax_rep_index.host == 'MOUSE'].taxonomy_label.unique().shape

(436,)

In [60]:
mgbc_tax_rep_index[mgbc_tax_rep_index.host == 'MOUSE'].taxonomy_label.unique()


array(['d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Schaedlerella;s__',
       'd__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__CAG-485;s__CAG-485 sp002362485',
       'd__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus',
       'd__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Schaedlerella;s__Schaedlerella sp000364245',
       'd__Bacteria;p__Firmicutes_A;c__Clostridia;o__Lachnospirales;f__Lachnospiraceae;g__Dorea;s__',
       'd__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides acidifaciens',
       'd__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__Muribaculum;s__Muribaculum sp002358615',
       'd__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus intestinalis',
       'd_

In [52]:
# Distinct genus names among the MOUSE-host rows of the taxonomy index.
# Labels with no non-empty ';g__' field do not match the pattern and come through as NaN.
is_mouse_host = mgbc_tax_rep_index['host'] == 'MOUSE'
mouse_taxonomy_labels = mgbc_tax_rep_index.loc[is_mouse_host, 'taxonomy_label']
mgbc_genus = mouse_taxonomy_labels.str.extract('.*;g__([^;]+).*').squeeze().unique()

In [58]:
mgbc_genus.shape

(228,)

In [56]:
np.count_nonzero(~mice_genus.isin(mgbc_genus)) 

11

In [55]:
np.count_nonzero(~mice_genus.isin(mgbc_genus)) / mice_genus.shape[0]

0.28205128205128205

In [57]:
mice_genus[~mice_genus.isin(mgbc_genus)]

Index(['CAG-317', 'Caccenecus', 'Choladocola', 'Copromonas', 'Dysosmobacter',
       'Enterenecus', 'Lachnoclostridium_B', 'Merdisoma', 'Pelethomonas',
       'Sporofaciens', 'UBA7109'],
      dtype='object')

Quite a lot of the mouse taxa (28%, 11 genera) do not appear in the MGBC cohort (in their full cohort, MGBC pids -> GTDB ids). 

This problem implies that we have to assign the taxonomy based on the MGBC and UHGG datasets if we want to use this mapping. 


Does the tax_rep_index contain all the mice genus? yes, and human too.

Note: we use the tax_rep_index file for this analysis. This file contains a mapping from the MGBC and UHGG cohorts to GTDB labels, and a mapping from the original UHGG pids to the pids used in this research (as they selected a sub-sample according to their own threshold selection). 

In [63]:
# We have a taxonomic mapping from MGBC pid to GTDB label for the whole set of 26,640 MGBC pids.
# (26,640 matches the number of non-redundant high-quality bacterial genomes in their cohort, as reported in the MGBC paper.)


mgbc_tax_rep_index[mgbc_tax_rep_index.host == 'MOUSE'].pid.unique().shape

(26640,)

In [68]:
mgbc_tax_rep_index[mgbc_tax_rep_index.host == 'HUMAN'].pid.unique().shape
# (100,456 matches the number, reported in the MGBC paper, of non-redundant high-quality bacterial genomes in the UHGG cohort after their pre-processing.)


(100456,)

In [16]:
mgbc_tax_rep_index[mgbc_tax_rep_index.host == 'HUMAN'].taxonomy_label.unique().shape

(2585,)

In [67]:
mgbc_tax_rep_index

Unnamed: 0,pid,name,taxonomy level,taxonomy_label,source_pid,host,number
0,GUT_GENOME000002,s__Anaerobutyricum hallii,species,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,GUT_GENOME001689,HUMAN,206
1,GUT_GENOME000003,s__Blautia_A wexlerae,species,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,GUT_GENOME096067,HUMAN,533
2,GUT_GENOME000005,s__Mediterraneibacter faecis,species,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,GUT_GENOME001575,HUMAN,186
3,GUT_GENOME000006,s__Tidjanibacter inops,species,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,GUT_GENOME257982,HUMAN,271
4,GUT_GENOME000007,s__Alistipes onderdonkii,species,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,GUT_GENOME261564,HUMAN,928
...,...,...,...,...,...,...,...
127091,MGBC167524,s__Prevotella sp002933775,species,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,MGBC167141,MOUSE,216
127092,MGBC167525,g__COE1,genus,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,MGBC111351,MOUSE,166
127093,MGBC167526,f__Lachnospiraceae,family,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,MGBC114200,MOUSE,37
127094,MGBC167527,s__CAG-485 sp002491945,species,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,MGBC118190,MOUSE,60


Do we have the human genus level in this dataset? No. UHGG contains only 84 of the 107 genera that we have in our human dataset (78%). 

In [100]:
# Distinct genus names among the HUMAN-host rows of the taxonomy index.
is_human_host = mgbc_tax_rep_index['host'] == 'HUMAN'
human_taxonomy_labels = mgbc_tax_rep_index.loc[is_human_host, 'taxonomy_label']
mgbc_human_genus = human_taxonomy_labels.str.extract('.*;g__([^;]+).*').squeeze().unique()

In [101]:
mgbc_human_genus.shape


(770,)

In [102]:
np.count_nonzero(pd.Index(human_genus).isin(mgbc_human_genus))


84

In [104]:
len(human_genus)

107

In [103]:
np.count_nonzero(pd.Index(human_genus).isin(mgbc_human_genus)) / len(human_genus)


0.7850467289719626

The closest-taxa table contains only 1094 MGBC pids. Maybe I'm missing something and this is only an example? Try to solve this issue... (Dive deeper into MGBC tomorrow.) I think there are different versions of this toolkit; maybe try to download the whole dataset? A different version? Debug. 

We could also just use the mapping from MGBC pid to GTDB label and from UHGG pid to GTDB label to mark the GTDB tree and extract the taxonomic distance based on that (in case we don't find the whole data; then we could consider writing to them / re-implementing their code for the functional distance...)

Are all the MGBC pids in the closest_tax table?

The closest_taxa table contains only 1094 MGBC pids (out of the 26K) 
and 3006 from human (out of 100K), so it's very partial data...

In [66]:
mgbc_closest_taxa_table.query('reference_genome.str.contains("MGBC")').reference_genome.unique().shape

(1094,)

In [76]:
mgbc_closest_taxa_table.query('reference_genome.str.contains("GUT_GENOME")').reference_genome.unique().shape


(3006,)

In [14]:
mgbc_closest_taxa_table.query('reference_genome.str.contains("MGBC")').reference_taxonomy.unique().shape

(430,)

In [15]:
mgbc_closest_taxa_table.query('reference_genome.str.contains("GUT_GENOME")').reference_taxonomy.unique().shape


(2585,)

In [84]:
# Genera of the MGBC (mouse) reference genomes that have a row in the closest-taxa table:
mgbc_reference_rows = mgbc_closest_taxa_table.query('reference_genome.str.contains("MGBC")')
mice_genus_with_mapping = (
    mgbc_reference_rows
    .reference_taxonomy
    .str.extract('.*;g__([^;]+).*')
    .squeeze()
    .unique()
)

So at the genus level (representation-wise) we have a mapping (according to some genome) for 226 out of the 228 genera in the mice dataset.

In [85]:
mice_genus_with_mapping.shape

(226,)

In [86]:
mgbc_genus.shape

(228,)

In [89]:
np.count_nonzero(pd.Index(mgbc_genus).isin(mice_genus_with_mapping))

226

In [90]:
# MGBC mouse genera that have no reference row in the closest-taxa table:

mgbc_genus_index = pd.Index(mgbc_genus)
has_closest_taxa_row = mgbc_genus_index.isin(mice_genus_with_mapping)
mgbc_genus_index[~has_closest_taxa_row]

Index(['UMGS363', 'SZUA-378'], dtype='object')

How much does the mapping change between the different heuristics?

In [17]:
mgbc_closest_taxa_table

Unnamed: 0,method,reference_genome,query_genome,distance,reference_taxonomy,query_taxonomy
0,taxonomy,GUT_GENOME000010,MGBC000100,0.020884,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__O...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__O...
1,taxonomy,GUT_GENOME000035,MGBC000465,0.009662,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...
2,taxonomy,GUT_GENOME000049,MGBC109121,0.200162,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
3,taxonomy,GUT_GENOME000057,MGBC115383,0.008205,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...
4,taxonomy,GUT_GENOME000064,MGBC161554,0.625505,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
...,...,...,...,...,...,...
61495,Reactome.ips,MGBC166864,GUT_GENOME188058,0.116968,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__C...
61496,Reactome.ips,MGBC167004,GUT_GENOME012681,0.134689,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
61497,Reactome.ips,MGBC167061,GUT_GENOME205920,0.058195,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
61498,Reactome.ips,MGBC167064,GUT_GENOME258050,0.129410,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...


In [18]:
# Add '<side>_genus' columns holding each taxonomy string truncated after its genus field
# (everything up to and including ';g__<genus>'); rows without a genus field get NaN.
for side in ('reference', 'query'):
    taxonomy_strings = mgbc_closest_taxa_table[f'{side}_taxonomy']
    mgbc_closest_taxa_table[f'{side}_genus'] = taxonomy_strings.str.extract('(.*;g__[^;]+).*')

In [19]:
# Mouse-to-human rows: entries whose reference genome id contains "MGBC".
mouse_reference_filter = 'reference_genome.str.contains("MGBC")'
m2h = mgbc_closest_taxa_table.query(mouse_reference_filter)

In [21]:
m2h.method.unique()

array(['taxonomy', 'all_annotations', 'CAZY.eggnog', 'ENZYME.eggnog',
       'GO.eggnog', 'GO.ips', 'InterPro.FAMILY', 'InterPro.ips',
       'KEGG.eggnog', 'MetaCyc.ips', 'MODULE.eggnog', 'PATHWAY_eggnog',
       'PATHWAY_ips', 'REACTION.eggnog', 'Reactome.ips'], dtype=object)

In [46]:
from translator import majority_vote


def _majority_vote_per_genus(method_rows):
    """Apply `majority_vote` to each reference-genus group within one method's rows."""
    return method_rows.groupby('reference_genus').apply(majority_vote)


# One translation map per distance method; transposed so that rows are reference genera
# and columns are methods.
translation_maps = m2h.groupby('method').apply(_majority_vote_per_genus).T


In [47]:
translation_maps.eq(translation_maps.iloc[:, 0], axis=0)

method,CAZY.eggnog,ENZYME.eggnog,GO.eggnog,GO.ips,InterPro.FAMILY,InterPro.ips,KEGG.eggnog,MODULE.eggnog,MetaCyc.ips,PATHWAY_eggnog,PATHWAY_ips,REACTION.eggnog,Reactome.ips,all_annotations,taxonomy
reference_genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True
d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Bifidobacteriaceae;g__Bifidobacterium,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium,True,False,True,True,False,False,True,False,True,False,False,False,False,True,False
d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Mycobacteriaceae;g__Corynebacterium,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Propionibacteriales;f__Propionibacteriaceae;g__Cutibacterium,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
d__Bacteria;p__Spirochaetota;c__Brachyspirae;o__Brachyspirales;f__Brachyspiraceae;g__Brachyspira,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True
d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Treponematales;f__Treponemataceae;g__Treponema_D,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
d__Bacteria;p__Thermotogota;c__Thermotogae;o__Petrotogales;f__Kosmotogaceae;g__Kosmotoga_B,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [48]:
# Do all the methods agree on the mapping (at the genus level)?
# Agreement is measured against the first method column; NaN never compares equal.
agrees_with_first_method = translation_maps.eq(translation_maps.iloc[:, 0], axis=0)
agrees_with_first_method.all(axis=1).all()

False

In [49]:
# For how many reference genera is the mapping the same under every distance metric?
agrees_with_first_method = translation_maps.eq(translation_maps.iloc[:, 0], axis=0)
unanimous_rows = agrees_with_first_method.all(axis=1)
unanimous_rows.sum()

68

In [50]:
# Fraction of reference genera on which every method agrees — less than 1/3 !
n_unanimous = translation_maps.eq(translation_maps.iloc[:, 0], axis=0).all(axis=1).sum()
n_unanimous / translation_maps.shape[0]

0.3022222222222222

How much taxonomy and function differ?

In [51]:
(translation_maps['taxonomy'] == translation_maps['all_annotations']).sum()

177

In [52]:
(translation_maps['taxonomy'] == translation_maps['all_annotations']).sum() / translation_maps.shape[0] # Taxonomy and functional (based on all_annotations) mostly agree (they agree on 177 out of the 225 reference genera in translation_maps, which is 78.7%)

0.7866666666666666

In [53]:
translation_maps[(translation_maps['taxonomy'] != translation_maps['all_annotations'])][['taxonomy', 'all_annotations']]

method,taxonomy,all_annotations
reference_genus,Unnamed: 1_level_1,Unnamed: 2_level_1
d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium,d__Bacteria;p__Actinobacteriota;c__Actinomycet...,d__Bacteria;p__Actinobacteriota;c__Actinomycet...
d__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacteriales;f__Eggerthellaceae;g__D16-34,d__Bacteria;p__Actinobacteriota;c__Coriobacter...,d__Bacteria;p__Actinobacteriota;c__Coriobacter...
d__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacteriales;f__Eggerthellaceae;g__D16-63,,d__Bacteria;p__Actinobacteriota;c__Coriobacter...
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Muribaculaceae;g__M3,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__Rikenella,,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...
d__Bacteria;p__Cyanobacteria;c__Vampirovibrionia;o__Gastranaerophilales;f__Gastranaerophilaceae;g__Zag1,d__Bacteria;p__Cyanobacteria;c__Vampirovibrion...,d__Bacteria;p__Cyanobacteria;c__Vampirovibrion...
d__Bacteria;p__Deferribacterota;c__Deferribacteres;o__Deferribacterales;f__Mucispirillaceae;g__Mucispirillum,,d__Bacteria;p__Campylobacterota;c__Campylobact...
d__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae_A;g__Bacillus_AC,d__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...
d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelatoclostridiaceae;g__CHKCI006,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysip...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysip...
d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Allobaculum,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysip...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysip...


In [72]:
translation_maps[(translation_maps['taxonomy'] != translation_maps['all_annotations'])][['taxonomy', 'all_annotations']].isna().any(axis=1).sum()

10

In [81]:
translation_maps.index

Index(['d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces',
       'd__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Bifidobacteriaceae;g__Bifidobacterium',
       'd__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Microbacteriaceae;g__Microbacterium',
       'd__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Mycobacteriaceae;g__Corynebacterium',
       'd__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Propionibacteriales;f__Propionibacteriaceae;g__Cutibacterium',
       'd__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacteriales;f__Atopobiaceae;g__NM07-P-09',
       'd__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacteriales;f__Eggerthellaceae;g__Adlercreutzia',
       'd__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacteriales;f__Eggerthellaceae;g__CAG-1427',
       'd__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacterial

When they don't agree, 
sometimes (10/48) it is because one of the two annotations has NaN values in the comparison,
and sometimes they are really different.


How much do they differ, given the genera we have in our OSA dataset?

In [88]:
# Shorten the index from the full taxonomy prefix to the bare genus name (text after 'g__').
# Labels that contain no 'g__' keep their current value, so re-running this cell on the
# already-shortened index is a no-op instead of mapping every label to NaN.
taxonomy_labels = translation_maps.index.unique()
full_labels = pd.Series(taxonomy_labels, index=taxonomy_labels)
genus_by_label = full_labels.str.extract('.*g__(.*)', expand=False).fillna(full_labels)
translation_maps = translation_maps.rename(index=genus_by_label)

In [99]:
# Keep only the rows for genera that also occur in the mouse dataset (mice_genus).
genus_in_both = translation_maps.index.intersection(mice_genus)
translation_maps_subset = translation_maps.loc[genus_in_both]

In [102]:
translation_maps_subset.eq(translation_maps_subset.iloc[:, 0], axis=0).all(axis=1).sum()

6

In [103]:
(translation_maps_subset['taxonomy'] == translation_maps_subset['all_annotations']).sum()


20

In [104]:
# Share of the subset's genera on which the 'taxonomy' and 'all_annotations' mappings agree.
# Computed from the data (previously the literal 20/28 copied from earlier outputs), so it
# stays correct when the inputs change.
(translation_maps_subset['taxonomy'] == translation_maps_subset['all_annotations']).sum() / translation_maps_subset.shape[0]

0.7142857142857143

In [91]:
mice_genus.shape

(39,)

In [94]:
human_genus.shape

(107,)

In [93]:
translation_maps.index.intersection(mice_genus).shape

(28,)

In [95]:
# Share of the mouse-dataset genera that have a row in the translation maps.
# Computed from the data (previously the literal 28/39 copied from earlier outputs).
translation_maps.index.intersection(mice_genus).shape[0] / mice_genus.shape[0]

0.717948717948718

In [20]:
m2h

Unnamed: 0,method,reference_genome,query_genome,distance,reference_taxonomy,query_taxonomy,reference_genus,query_genus
45090,taxonomy,MGBC000001,GUT_GENOME001757,0.231014,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
45091,taxonomy,MGBC000003,GUT_GENOME001757,0.201808,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
45092,taxonomy,MGBC000011,GUT_GENOME239724,0.032134,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...
45093,taxonomy,MGBC000013,GUT_GENOME096523,0.137890,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
45094,taxonomy,MGBC000018,GUT_GENOME145982,0.001347,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...
...,...,...,...,...,...,...,...,...
61495,Reactome.ips,MGBC166864,GUT_GENOME188058,0.116968,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__C...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__C...
61496,Reactome.ips,MGBC167004,GUT_GENOME012681,0.134689,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
61497,Reactome.ips,MGBC167061,GUT_GENOME205920,0.058195,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
61498,Reactome.ips,MGBC167064,GUT_GENOME258050,0.129410,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...
