In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from data.taxonomy_normalizer import NaiveTaxonomyNormalizer, preprocess_filter_rare_taxa


Load the data from the Haddad OSA experiment:

In [2]:
# Taxonomy table, indexed by sample ID.
taxa = (
    pd.read_csv('taxonomic_observed_abundance_HaddadOSA.csv')
    .set_index('#SampleID')
)

# Sample metadata, indexed by sample ID.
metadata = (
    pd.read_csv('relevant_metadata_haddad_osa.csv')
    .set_index('#SampleID')
)

# Metabolite features: the 'Unnamed: 0' column is renamed to '#SampleID'
# so that all three tables share the same index name.
metabolite_features = (
    pd.read_csv('metabolite_unique_gnp_annotated_HaddadOSA.csv')
    .rename(columns={'Unnamed: 0': '#SampleID'})
    .set_index('#SampleID')
)

Keep only the control samples in both the metabolite and the taxonomy tables, using the metadata:


In [3]:
# Sample IDs selected by the boolean `control` column of the metadata.
control_samples = metadata.loc[metadata['control']].index
# Restrict both feature tables to those samples; all columns are kept.
metabolite_features = metabolite_features.loc[control_samples]
taxa = taxa.loc[control_samples]

Prepare the taxa: calculate relative abundance, and decide whether or not to drop the unknown taxa.


In [4]:
# Settings forwarded to the project's NaiveTaxonomyNormalizer.
DROP_UNKNOWN_TAXA = False
UNKNOWN_TH = 1.0

naive_normalizer = NaiveTaxonomyNormalizer(
    drop_unknown_taxa=DROP_UNKNOWN_TAXA,
    unknown_taxa_sample_threshold=UNKNOWN_TH,
)

# Rare taxa are filtered first; the normalizer then runs on the filtered table.
# NOTE(review): presumably `normalize` returns per-sample relative abundances - confirm in data.taxonomy_normalizer.
taxa = preprocess_filter_rare_taxa(taxa)
relative_abundance = naive_normalizer.normalize(taxa)


Prepare the metabolite features:

In [5]:
from preprocessors import MetabolitePreprocessor

# Run the project's metabolite preprocessing (default settings) on the
# control-sample metabolite table and keep the result under the same name.
metabolite_preprocessor = MetabolitePreprocessor()
metabolite_features = metabolite_preprocessor.preprocess(
    metabolite_features
)


Correlation test:
For each metabolite and each taxon independently (i.e. for each (metabolite, genus) pair), calculate the correlation across samples.

In [6]:
# Rebind `metabolite_features` to the HMDB-annotated table (indexed by its
# 'Unnamed: 0' column), restricted to the control samples.
metabolite_features = pd.read_csv(
    'metabolite_HMDB_annotated_HaddadOSA.csv',
    index_col='Unnamed: 0',
).loc[control_samples]


In [7]:
from preprocessors import MetabolitePreprocessor

# Fresh preprocessor instance, applied to the table currently bound to
# `metabolite_features`.
metabolite_preprocessor = MetabolitePreprocessor()
metabolite_features = metabolite_preprocessor.preprocess(
    metabolite_features
)

In [8]:
# Pearson correlation coefficient and p-value for every (bacteria, metabolite)
# pair, computed across samples. Rows are the columns of `relative_abundance`
# (bacteria); columns are the columns of `metabolite_features` (metabolites).
# dtype=float keeps both tables numeric instead of the object dtype an empty
# DataFrame gets by default, so later statistics and plots need no casting.
metabolome_bacteria_corr_coefficient = pd.DataFrame(
    index=relative_abundance.columns, columns=metabolite_features.columns, dtype=float
)
metabolome_bacteria_p_values = pd.DataFrame(
    index=relative_abundance.columns, columns=metabolite_features.columns, dtype=float
)

# NOTE(review): pearsonr pairs the two vectors by position, not by index label -
# this assumes both tables hold the same samples in the same order; confirm.
for metabolome in metabolite_features.columns:
    for bacteria in relative_abundance.columns:
        coefficient, p = stats.pearsonr(metabolite_features[metabolome], relative_abundance[bacteria])
        metabolome_bacteria_corr_coefficient.loc[bacteria, metabolome] = coefficient
        metabolome_bacteria_p_values.loc[bacteria, metabolome] = p

# Dropping the "Unknown" row. errors='ignore' keeps the cell working when that
# row is absent (presumably the case when the normalizer is configured to drop
# unknown taxa - confirm) instead of raising KeyError.
metabolome_bacteria_corr_coefficient = metabolome_bacteria_corr_coefficient.drop(index=['Unknown'], errors='ignore')
metabolome_bacteria_p_values = metabolome_bacteria_p_values.drop(index=['Unknown'], errors='ignore')

In [26]:
# Histogram of the uncorrected p-values, pooled over all pairs.
(
    metabolome_bacteria_p_values
    .stack()
    .hist()
)

In [9]:
# With multi-comparison correction (FDR)
from statsmodels.stats.multitest import fdrcorrection

# NaN p-values are excluded before the correction: they are not valid tests
# and would otherwise propagate through the procedure. The explicit float cast
# guards against an object-dtype input table.
p_values = metabolome_bacteria_p_values.stack().astype(float).dropna()
_, p_values_corrected = fdrcorrection(p_values.to_numpy())
# Back to a (bacteria x metabolite) table; pairs excluded above come back as NaN.
metabolome_bacteria_p_values_fdr_correction = pd.Series(p_values_corrected, index=p_values.index).unstack()

In [22]:
# Histogram of the FDR-corrected p-values pooled over all pairs.
# At this point in the notebook the variable is still a DataFrame, and
# DataFrame.hist would draw one subplot per metabolite column; stacking first
# gives a single distribution, matching the raw p-value histogram.
metabolome_bacteria_p_values_fdr_correction.stack().astype(float).hist(bins=10)

In [10]:
# Per column (metabolite): number of rows with corrected p-value below 0.05.
metabolome_bacteria_p_values_fdr_correction.lt(0.05).sum(axis=0)

In [32]:
# Summary over ALL (metabolite, bacteria) pairs: how many are significant after
# FDR correction, out of how many tested. np.asarray accepts the corrected
# p-values in either table or stacked form; NaN entries (pairs with no valid
# test) are excluded from both counts.
fdr_values = np.asarray(metabolome_bacteria_p_values_fdr_correction, dtype=float)
tested_values = fdr_values[~np.isnan(fdr_values)]
n_tested = int(tested_values.size)
n_significant = int((tested_values < 0.05).sum())
# Guard against an empty table so the cell never divides by zero.
percent_significant = 100 * n_significant / n_tested if n_tested else 0.0
print(f"{n_significant} couples are significant out of {n_tested} aka {percent_significant:.2f} %")

In [14]:
# Reshape both tables to long Series keyed by (metabolite, bacteria):
# transpose so metabolites become the rows, then stack the columns.
# The names are rebound in place, so this cell is meant to run exactly once.
metabolome_bacteria_corr_coefficient = (
    metabolome_bacteria_corr_coefficient.transpose().stack()
)
metabolome_bacteria_p_values_fdr_correction = (
    metabolome_bacteria_p_values_fdr_correction.transpose().stack()
)

In [15]:
# Coefficients of the pairs whose FDR-corrected p-value is below 0.05.
metabolome_bacteria_significant_corr = metabolome_bacteria_corr_coefficient.loc[
    metabolome_bacteria_p_values_fdr_correction.lt(0.05)
]


In [16]:
# Significant coefficients, largest first.
(
    metabolome_bacteria_significant_corr
    .sort_values(ascending=False)
)


In [46]:
import matplotlib.pyplot as plt

plt.rc('xtick', labelsize=10)
idx = pd.IndexSlice

# First index level (the metabolite) of the pair with the largest significant coefficient.
top_metabolite = (
    metabolome_bacteria_significant_corr
    .sort_values(ascending=False)
    .index[0][0]
)
# Bar plot of every significant coefficient for that metabolite.
(
    metabolome_bacteria_significant_corr
    .loc[idx[top_metabolite, :]]
    .sort_values(ascending=False)
    .plot.bar(figsize=(4,3), title=top_metabolite)
)

In [47]:
# First index level (the metabolite) of the pair with the smallest significant coefficient.
bottom_metabolite = metabolome_bacteria_significant_corr.sort_values(ascending=False).index[-1][0]
# Bar plot of every significant coefficient for that metabolite. The title uses
# `bottom_metabolite`, so the figure is labelled with the metabolite it shows.
metabolome_bacteria_significant_corr.loc[idx[bottom_metabolite, :]].sort_values(ascending=False).plot.bar(figsize=(4,3), title=bottom_metabolite)

In [52]:
# Search for significant bacteria:
# count, per bacteria (index level 1), how many metabolites it is significantly
# correlated with. `.size()` is the built-in group count, and the deprecated
# `axis` keyword of groupby is no longer passed (axis=0 is the default).
metabolome_bacteria_significant_corr.groupby(level=1).size().sort_values(ascending=False).plot.bar(figsize=(8,4), title='Number of metabolites each bacteria significantly effects on')

Indeed, we find that Lactococcus significantly affects many metabolites. This genus is a lactic acid bacterium that is known to produce a single product: lactic acid.

In [55]:
# Genus label used to slice the second index level (bacteria).
lactoccoccus = 'Lactococcus'
# Bar plot of this genus' significant coefficients, largest first.
(
    metabolome_bacteria_significant_corr
    .loc[idx[:, lactoccoccus]]
    .sort_values(ascending=False)
    .plot.bar(figsize=(4,3), title='Lactococcus significant correlation to metabolites')
)


In [58]:
# Index entry with the largest significant coefficient for this genus.
(
    metabolome_bacteria_significant_corr
    .loc[idx[:, lactoccoccus]]
    .sort_values(ascending=False)
    .index[0]
)

HMDB0002103 is 27-Hydroxycholesterol (27-HC), a dihydroxy bile acid. The enzyme is critical for the degradation of the steroid side-chain, and a genetic deficiency of the enzyme leads to reduced formation of bile acids in humans. It might be linked to our health and cholesterol.

In [59]:
# Index entry with the smallest significant coefficient for this genus.
(
    metabolome_bacteria_significant_corr
    .loc[idx[:, lactoccoccus]]
    .sort_values(ascending=False)
    .index[-1]
)


In [75]:
# (metabolome_bacteria_p_values_fdr_correction.unstack() < 0.05).astype(int).style.background_gradient(cmap ='viridis').set_properties(**{'font-size': '1px'}) 

In [77]:
# Dimensions of the corrected p-value table after unstacking the inner index level.
(
    metabolome_bacteria_p_values_fdr_correction
    .unstack()
    .shape
)

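HMDB0002103 is isonicotinic acid, a type of pyridinecarboxylic acid. (TODO: this is the same HMDB ID given above for 27-Hydroxycholesterol; verify which ID belongs to isonicotinic acid.)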

In [85]:
# from matplotlib import rcParams
# rcParams.keys()

In [87]:
import seaborn as sns

sns.set_theme(
    rc={
        'figure.figsize': (8, 6),
        'xtick.labelsize': 5,
        'ytick.labelsize': 5,
    }
)
# Binary heatmap: 1 where the corrected p-value is below 0.05, 0 otherwise.
sns.heatmap(
    metabolome_bacteria_p_values_fdr_correction.unstack().lt(0.05).astype(int)
).set(title='Heatmap significant or not correlation of Metabolite & Bacteria')

In [92]:
# Display the correlation coefficients cast to float.
(
    metabolome_bacteria_corr_coefficient
    .astype(float)
)

In [95]:
# Heatmap of the significant coefficients; pairs that are not significant are
# missing from the Series and therefore show up as empty cells after unstacking.
sns.heatmap(
    metabolome_bacteria_significant_corr.unstack().astype(float)
).set(title='Heatmap significant correlation of Metabolite & Bacteria')

In [97]:
# Persist the three result Series as pickles under analysis/.
for _result, _pickle_path in (
    (metabolome_bacteria_significant_corr, 'analysis/metabolome_bacteria_significant_corr.pkl'),
    (metabolome_bacteria_corr_coefficient, 'analysis/metabolome_bacteria_corr_coefficient.pkl'),
    (metabolome_bacteria_p_values_fdr_correction, 'analysis/metabolome_bacteria_p_values_fdr_correction.pkl'),
):
    _result.to_pickle(_pickle_path)

In [17]:
# TODO: Compare the results to the article's results as a sanity check.