# Identify bacterial genera associated with metabolites

We will now select the microbes significantly associated with each metabolite. We will use the genus level here since that is usually the lowest taxonomic rank we can confidently identify with 16S amplicon sequencing.

In [29]:
import pandas as pd

train = pd.read_csv("data/train.csv")
abundance = pd.read_csv("/proj/arivale/microbiome/16S_processed/asvs.csv")
abundance["stool_sample_id"] = abundance.id.str.split("|").str[0]
abundance = abundance[abundance.stool_sample_id.isin(train.stool_sample_id)]

taxonomy = pd.read_csv("/proj/arivale/microbiome/16S_processed/taxonomy.csv")
taxa_compact = taxonomy.Family + "_" + taxonomy.Genus
taxa_compact.index = taxonomy.id
abundance["taxon"] = taxa_compact[abundance.hash].values
genera = abundance.groupby(["stool_sample_id", "taxon"]).sum().reset_index()
genera.taxon.nunique()

380

We also filter out genera that were not observed in at least 50% of all individuals and that have less than 10 detected reads per sample on average. 

In [30]:
prevalence = genera.groupby("taxon").apply(lambda df: sum(df["count"] > 0)) / genera.stool_sample_id.nunique()
means = genera.groupby("taxon")["count"].mean()
good = (prevalence > 0.5) & (means > 10)
genera_filtered = genera[genera.taxon.isin(good.index[good])]
genera_filtered.taxon.nunique()

81

Now we will CLR transform the abundances.

In [36]:
import numpy as np

genera_wide = pd.pivot_table(genera_filtered, index="stool_sample_id", columns="taxon", values="count", fill_value=0)
genera_wide_clr = genera_wide.apply(lambda x: np.log(x + 0.5) - np.mean(np.log(x + 0.5)))
genera_wide_clr.shape

(1000, 81)

In [38]:
metabolites_and_microbes = pd.merge(train, genera_wide_clr, left_on="stool_sample_id", right_index=True)

In [39]:
metabolites_and_microbes

Unnamed: 0,public_client_id,genome_id,blood_sample_id,blood_days_in_program,stool_sample_id,stool_days_in_program,sex,age,WEIGHT_CALC,BMI_CALC,...,Ruminococcaceae_Ruminococcaceae_UCG-014,Ruminococcaceae_Ruminococcus_1,Ruminococcaceae_Ruminococcus_2,Ruminococcaceae_Subdoligranulum,Ruminococcaceae_UBA1819,Streptococcaceae_Lactococcus,Streptococcaceae_Streptococcus,Tannerellaceae_Parabacteroides,Veillonellaceae_Dialister,Veillonellaceae_Veillonella
0,1241633,A789AU958-007,A789AU992-002,36.0,AV15-1655,36.0,M,71.0,179.0,24.729955,...,-5.025837,0.849859,-2.000973,1.843949,1.335272,0.263936,-0.069736,-1.389592,5.100654,1.624624
1,1020175,DS-268800,A430BH742-003,8.0,22001612561075,13.0,F,41.0,192.8,33.090430,...,-5.025837,0.490104,-6.078511,-0.824318,1.402187,1.849563,-1.035311,3.621856,4.423868,1.581139
2,1230573,DS-284622,A773BJ600-002,4.0,22001701512104,22.0,F,60.0,152.0,26.087891,...,-1.362275,-6.995949,-6.078511,-8.011976,1.632425,1.825172,-2.213101,2.340332,5.086527,-2.225523
3,1687802,A719AT743-004,A719AT785-002,88.0,AV15-1320,88.0,F,53.0,242.0,36.791955,...,-5.025837,0.936054,1.252549,-2.613813,0.360194,2.682996,-0.235593,-7.049074,-3.848192,1.979169
4,1864964,DS-267221,A862BF047-008,14.0,22001612561843,15.0,F,43.0,170.0,26.622856,...,-5.025837,1.885749,-0.627472,-1.779528,-0.897167,-2.569278,-1.536952,1.409854,-3.848192,1.141772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1035670,DS-269430,A808BG205-003,33.0,22001612562579,9.0,F,33.0,155.2,29.321580,...,-5.025837,3.441837,3.753642,2.457990,1.233366,1.191922,-0.064480,1.032710,4.778573,1.744769
996,1013950,A752AT594-004,A752AT607-002,77.0,AV15-1319,77.0,F,42.0,126.0,21.625488,...,3.830539,0.962978,3.940491,1.451067,1.677659,-2.569278,1.884048,0.621820,-3.848192,-2.225523
997,1244110,A939AV031-007,A473AS982-002,52.0,AV15-1143,52.0,F,62.0,126.0,20.334711,...,3.747083,0.827697,2.398069,-0.190734,-0.667593,1.664829,0.845237,0.304008,4.915705,-2.225523
998,1329753,DS-265707,A579BF997-003,3.0,22001612560465,15.0,M,40.0,269.0,35.486395,...,1.098847,-0.006614,1.200118,-0.430766,-4.331154,-2.569278,-2.067009,-1.850577,-3.848192,-2.225523


In [54]:
from itertools import product
from rich.progress import track
from scipy.stats import pearsonr

mets = metabolites_and_microbes.columns[metabolites_and_microbes.columns.str.startswith("metabolite")]
mics = genera_wide_clr.columns

combinations = list(product(mets, mics))

metrics = pd.DataFrame(columns=["r", "p", "n"], dtype="float64", index=combinations)
formula = "{} ~ Q('{}')"
for comb in track(combinations):
    data = metabolites_and_microbes[list(comb)].dropna()
    ptest = pearsonr(data.iloc[:, 0], data.iloc[:, 1])
    metrics.loc[[comb], "r"] = ptest[0]
    metrics.loc[[comb], "p"] = ptest[1]
    metrics.loc[[comb], "n"] = data.shape[0]

Output()

In [65]:
from statsmodels.stats.multitest import fdrcorrection

metrics["q"] = fdrcorrection(metrics.p)[1]
metrics["r2"] = metrics.r.pow(2)
metrics["metabolite"] = metrics.index.map(lambda x: x[0])
metrics["taxon"] = metrics.index.map(lambda x: x[1])

In [66]:
metrics.sort_values(by="p").head(100)

Unnamed: 0_level_0,r,p,n,q,r2,metabolite,taxon
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(metabolite_100002253, Family_XIII_Family_XIII_UCG-001)",0.584475,3.313048e-83,896.0,2.495719e-78,0.341611,metabolite_100002253,Family_XIII_Family_XIII_UCG-001
"(metabolite_100000010, Family_XIII_Family_XIII_UCG-001)",0.524624,5.722546e-63,874.0,2.155397e-58,0.275230,metabolite_100000010,Family_XIII_Family_XIII_UCG-001
"(metabolite_100002488, Ruminococcaceae_Ruminococcaceae_NK4A214_group)",-0.497297,2.960935e-59,930.0,7.434909e-55,0.247304,metabolite_100002488,Ruminococcaceae_Ruminococcaceae_NK4A214_group
"(metabolite_100002253, Ruminococcaceae_Ruminococcaceae_UCG-010)",0.456146,3.009236e-47,896.0,5.667144e-43,0.208069,metabolite_100002253,Ruminococcaceae_Ruminococcaceae_UCG-010
"(metabolite_100002253, Christensenellaceae_Christensenellaceae_R-7_group)",0.452545,1.916009e-46,896.0,2.886660e-42,0.204797,metabolite_100002253,Christensenellaceae_Christensenellaceae_R-7_group
...,...,...,...,...,...,...,...
"(metabolite_999946613, Family_XIII_Family_XIII_UCG-001)",0.261593,3.453344e-16,941.0,2.709796e-13,0.068431,metabolite_999946613,Family_XIII_Family_XIII_UCG-001
"(metabolite_100000014, Ruminococcaceae_Ruminococcaceae_UCG-003)",0.258759,4.178196e-16,957.0,3.244779e-13,0.066956,metabolite_100000014,Ruminococcaceae_Ruminococcaceae_UCG-003
"(metabolite_100002021, Ruminococcaceae_Ruminococcaceae_UCG-014)",0.277556,4.469519e-16,826.0,3.435601e-13,0.077037,metabolite_100002021,Ruminococcaceae_Ruminococcaceae_UCG-014
"(metabolite_1135, Christensenellaceae_Christensenellaceae_R-7_group)",-0.283722,5.331102e-16,785.0,4.056484e-13,0.080498,metabolite_1135,Christensenellaceae_Christensenellaceae_R-7_group


In [74]:
metrics[metrics.q < 0.1].to_csv("data/sig_metabolite_taxon.csv", index=False)