# Identify bacterial genera associated with metabolites

We will now select the microbes significantly associated with each metabolite. We will use the genus level here since that is usually the lowest taxonomic rank we can confidently identify with 16S amplicon sequencing.

In [2]:
import pandas as pd

# Load the train/validation tables and the ASV-level 16S count table.
train = pd.read_csv("data/train.csv")
valid = pd.read_csv("data/valid.csv")
abundance = pd.read_csv("/proj/arivale/microbiome/16S_processed/asvs.csv")

# The stool sample ID is the part of `id` before the first "|".
abundance["stool_sample_id"] = abundance["id"].str.split("|").str[0]

# Keep only samples that appear in the train or validation set. The explicit
# copy makes the filtered frame independent, so adding the `taxon` column below
# does not write to a slice of the original table (SettingWithCopyWarning).
in_cohort = abundance["stool_sample_id"].isin(train["stool_sample_id"]) | abundance["stool_sample_id"].isin(valid["stool_sample_id"])
abundance = abundance[in_cohort].copy()

# Build a compact "Family|Genus" label for every ASV, keyed by the taxonomy ID.
taxonomy = pd.read_csv("/proj/arivale/microbiome/16S_processed/taxonomy.csv")
taxa_compact = taxonomy["Family"] + "|" + taxonomy["Genus"]
taxa_compact.index = taxonomy["id"]

# Look up the label of every ASV row via its `hash` column.
# NOTE(review): ASVs without a Family or Genus presumably end up with a missing
# label and are excluded by the groupby below - confirm this is intended.
abundance["taxon"] = taxa_compact.loc[abundance["hash"]].to_numpy()

# Sum ASV values per sample and genus. `numeric_only=True` keeps non-numeric
# columns (such as `id`) out of the aggregation instead of trying to "sum" them.
genera = abundance.groupby(["stool_sample_id", "taxon"]).sum(numeric_only=True).reset_index()
genera.taxon.nunique()

410

We keep only genera that were detected in more than 50% of all samples and that have a mean count of more than 10 reads; all other genera are filtered out.

In [3]:
# Filtering thresholds: a genus is kept only if it exceeds both of them.
MIN_PREVALENCE = 0.5   # fraction of samples in which the genus has count > 0
MIN_MEAN_READS = 10    # mean read count of the genus

# Prevalence = number of rows with a positive count per genus, divided by the
# total number of distinct samples. Counting on a boolean column is vectorized
# and avoids a Python-level `groupby.apply` over whole sub-frames.
n_samples = genera.stool_sample_id.nunique()
prevalence = genera["count"].gt(0).groupby(genera["taxon"]).sum() / n_samples

# NOTE(review): this mean is taken over the rows present in `genera` for each
# genus. If the ASV table omits zero-count entries, it is the mean over samples
# where the genus was detected, not over all samples - confirm which is meant.
means = genera.groupby("taxon")["count"].mean()

good = (prevalence > MIN_PREVALENCE) & (means > MIN_MEAN_READS)
genera_filtered = genera[genera.taxon.isin(good.index[good])]
genera_filtered.taxon.nunique()

82

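Now we will apply the centered log-ratio (CLR) transform to the abundances, once with a pseudocount of 1 and once treating zero counts as missing values.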

In [4]:
import numpy as np

def clr(x, pseudo):
    """Centered log-ratio transform of one vector of counts.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Counts for a single sample. Integer input is accepted.
    pseudo : number
        Pseudocount added to every entry before taking the log. With
        ``pseudo == 0`` zeros are not shifted; they become NaN and the
        centering uses the strictly positive entries only.

    Returns
    -------
    Same container type as `x`, holding floats: ``log(value)`` minus the mean
    of the logs that entered the transform.
    """
    if pseudo == 0:
        # Work on a float copy: writing NaN / log values into an integer
        # container either relies on implicit upcasting (pandas) or fails
        # outright (numpy).
        out = x.astype("float64")
        positive = x > 0
        log_positive = np.log(out[positive])
        out[x == 0] = np.nan
        out[positive] = log_positive - np.mean(log_positive)
    else:
        log_shifted = np.log(x + pseudo)
        out = log_shifted - np.mean(log_shifted)
    return out

# Reshape the long table into a samples x genera count matrix; sample/genus
# pairs without an entry are filled with 0.
genera_wide = genera_filtered.pivot_table(
    index="stool_sample_id",
    columns="taxon",
    values="count",
    fill_value=0,
)

# CLR per sample (row) with a pseudocount of 1.
genera_wide_clr = genera_wide.apply(clr, axis=1, args=(1,))
genera_wide_clr.to_csv("data/genera_clr_filtered.csv")

# CLR per sample without a pseudocount: zero counts become missing values.
genera_wide_clr_drop = genera_wide.apply(clr, axis=1, args=(0,))
genera_wide_clr_drop.to_csv("data/genera_clr_filtered_nonzero.csv")
genera_wide_clr.shape

(1569, 82)

In [5]:
# Attach the CLR-transformed genus abundances to the training table by stool
# sample ID (default inner join: rows without a matching sample are dropped).
metabolites_and_microbes = train.merge(
    genera_wide_clr,
    left_on="stool_sample_id",
    right_index=True,
)

In [6]:
# Display the merged table (training columns followed by one CLR column per genus).
metabolites_and_microbes

Unnamed: 0,public_client_id,genome_id,blood_sample_id,blood_days_in_program,stool_sample_id,stool_days_in_program,sex,age,stool_vendor,WEIGHT_CALC,...,Ruminococcaceae|Ruminococcaceae_UCG-014,Ruminococcaceae|Ruminococcus_1,Ruminococcaceae|Ruminococcus_2,Ruminococcaceae|Subdoligranulum,Ruminococcaceae|UBA1819,Streptococcaceae|Lactococcus,Streptococcaceae|Streptococcus,Tannerellaceae|Parabacteroides,Veillonellaceae|Dialister,Veillonellaceae|Veillonella
0,1241633,A789AU958-007,A789AU992-002,36.0,AV15-1655,36.0,M,71.0,SecondGenome,179.0,...,-3.462865,3.690187,-0.061668,5.699964,1.513869,-1.265641,2.477306,1.506948,4.792963,-0.284811
1,1020175,DS-268800,A430BH742-003,8.0,22001612561075,13.0,F,41.0,DNAGenotek,192.8,...,-4.151371,2.642095,-4.151371,2.343894,0.892054,-0.413702,0.825362,5.826435,3.427797,-1.015877
2,1230573,DS-284622,A773BJ600-002,4.0,22001701512104,22.0,F,60.0,DNAGenotek,152.0,...,-1.840533,-4.836265,-4.836265,-4.836265,0.436734,-1.122693,-1.029603,3.860078,3.405438,-4.836265
3,1687802,A719AT743-004,A719AT785-002,88.0,AV15-1320,88.0,F,53.0,SecondGenome,242.0,...,-3.409260,3.829955,3.229308,1.300270,0.598073,1.155088,2.365292,-3.409260,-3.409260,0.117101
4,1864964,DS-267221,A862BF047-008,14.0,22001612561843,15.0,F,43.0,DNAGenotek,170.0,...,-3.535311,4.653378,1.226863,2.005952,-0.762722,-3.535311,0.942026,4.230682,-3.535311,-0.827261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1035670,DS-269430,A808BG205-003,33.0,22001612562579,9.0,F,33.0,DNAGenotek,155.2,...,-5.483529,4.261138,3.655530,4.293317,-0.608332,-2.392487,0.461891,1.905417,2.450267,-2.187693
996,1013950,A752AT594-004,A752AT607-002,77.0,AV15-1319,77.0,F,42.0,SecondGenome,126.0,...,3.016214,2.118971,4.178741,3.622815,0.170962,-5.147158,2.745668,1.831056,-5.147158,-5.147158
997,1244110,A939AV031-007,A473AS982-002,52.0,AV15-1143,52.0,F,62.0,SecondGenome,126.0,...,2.807030,1.858001,2.510743,1.855598,-2.277166,-1.717550,1.581457,1.387677,2.798008,-5.272898
998,1329753,DS-265707,A579BF997-003,3.0,22001612560465,15.0,M,40.0,DNAGenotek,269.0,...,2.352577,3.215964,3.505026,3.807427,-3.081145,-3.081145,0.870099,1.429714,-3.081145,-3.081145


In [7]:
from itertools import product
from rich.progress import track
from scipy.stats import pearsonr

# Metabolite columns are recognised by their "metabolite" name prefix; the
# microbe columns are the CLR-transformed genera.
mets = metabolites_and_microbes.columns[metabolites_and_microbes.columns.str.startswith("metabolite")]
mics = genera_wide_clr.columns

# Every (metabolite, genus) pair is tested.
combinations = list(product(mets, mics))

# Collect one (r, p, n) record per pair and build the result frame once at the
# end. Writing each value into a pre-allocated frame with `.loc` costs an index
# lookup per assignment and dominates the runtime for this many pairs.
records = []
for comb in track(combinations):
    # Use only the samples where both variables are observed.
    data = metabolites_and_microbes[list(comb)].dropna()
    ptest = pearsonr(data.iloc[:, 0], data.iloc[:, 1])
    records.append((ptest[0], ptest[1], data.shape[0]))

# Indexed by the (metabolite, genus) tuples, with float columns:
# r = Pearson correlation, p = its p-value, n = number of samples used.
metrics = pd.DataFrame(records, columns=["r", "p", "n"], index=combinations, dtype="float64")

Output()

KeyboardInterrupt: 

In [None]:
from statsmodels.stats.multitest import fdrcorrection

# FDR-adjust the p-values. Only pairs with a defined p-value are corrected:
# a NaN p-value (e.g. from a constant input) passed to `fdrcorrection` would
# otherwise spread to the adjusted values of the other pairs. Pairs without a
# p-value keep q = NaN.
tested = metrics.p.notna()
metrics["q"] = float("nan")
if tested.any():
    metrics.loc[tested, "q"] = fdrcorrection(metrics.loc[tested, "p"])[1]

# Squared correlation, plus the two members of each index pair as plain columns.
metrics["r2"] = metrics.r.pow(2)
metrics["metabolite"] = metrics.index.map(lambda x: x[0])
metrics["taxon"] = metrics.index.map(lambda x: x[1])

In [None]:
# Show the 100 metabolite-genus pairs with the smallest raw (unadjusted) p-values.
metrics.sort_values(by="p").head(100)

In [None]:
# Write all pairs with q < 0.1 to disk; the index is not written, the pair is
# identified by the `metabolite` and `taxon` columns.
significant = metrics.loc[metrics["q"] < 0.1]
significant.to_csv("data/sig_metabolite_taxon.csv", index=False)

In [None]:
# Number of distinct metabolites with at least one genus association at q < 0.05.
metrics.loc[metrics["q"] < 0.05, "metabolite"].nunique()