# Wastyk et al. (2021), from the Sonnenburg Lab, reported inter-individual differences in the inflammatory immune response to a high-fiber dietary intervention in a feeding study. Here, we use microbial sequencing data from that study to compare the results of MICOM metabolic modeling with the results obtained by the authors. Specifically, we ask whether the SCFA production profiles predicted by our metabolic models agree with the inflammatory immune response groups the authors used to categorize study participants.

In [1]:
import pyreadr
import pandas as pd
import micom
# import qiime2 as q2
from plotnine import *
import seaborn as sns
import numpy as np
import scipy.stats
from sklearn.cluster import AgglomerativeClustering

import os

%matplotlib inline

## Get to our directory and pull in a dictionary with the inflammation groups

In [2]:
# Work from the study directory so the relative file name below resolves.
os.chdir('/proj/gibbons/nbohmann/exvivo/sonnenburg_2021/')

# Readable names for the immune-response group codes stored in the CSV.
group_labels = {
    'group1': 'HighInflammation',
    'group2': 'LowInflammationI',
    'group3': 'LowInflammationII',
}

# index_col = 1 makes the second CSV column the index, which becomes the
# dictionary keys below.
immune_group = pd.read_csv('fiber_immune_groups.csv', index_col = 1)['Immune_group']
for code, label in group_labels.items():
    immune_group = immune_group.str.replace(code, label)

groups = immune_group.astype('str').to_dict()
groups

{8007: 'HighInflammation',
 8009: 'HighInflammation',
 8013: 'HighInflammation',
 8001: 'HighInflammation',
 8002: 'HighInflammation',
 8023: 'LowInflammationI',
 8029: 'LowInflammationI',
 8017: 'LowInflammationI',
 8041: 'LowInflammationI',
 8022: 'LowInflammationI',
 8003: 'LowInflammationI',
 8006: 'LowInflammationI',
 8039: 'LowInflammationII',
 8018: 'LowInflammationII',
 8036: 'LowInflammationII',
 8037: 'LowInflammationII',
 8038: 'LowInflammationII',
 8035: 'LowInflammationII'}

## Pull in their abundance tables, and filter to start and end timepoints. Convert table to use in MICOM

In [3]:
# Load the ASV table and reduce it to a sample_id x ASV matrix.
os.chdir('/proj/gibbons/nbohmann/exvivo/sonnenburg_2021/')
abundance = pyreadr.read_r('tip_glom_counts.rds')[None]  # the .rds object is stored under the key None

# only use individuals in the fiber study; .copy() gives an independent frame
# so the column assignments below do not write to a slice of the full table
abundance = abundance[abundance['Participant'].isin(groups.keys())].copy()
abundance['Participant'] = abundance['Participant'].astype('str')

# we'll include timepoint in the index so we don't lose that info
abundance['sample_id'] = abundance['Participant'] + '_' + abundance['Timepoint']
abundance = abundance.set_index('sample_id', drop = True)

# need only ASV counts
abundance = abundance.drop(columns = ['Participant', 'Group', 'Group_value', 'Timepoint'])
abundance

Unnamed: 0_level_0,ASV_1,ASV_10,ASV_100,ASV_101,ASV_102,ASV_103,ASV_104,ASV_105,ASV_106,ASV_107,...,ASV_90,ASV_91,ASV_92,ASV_93,ASV_94,ASV_95,ASV_96,ASV_97,ASV_98,ASV_99
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8001_1,0.000000,0.000000,0.0,0.0,0.060921,0.000000,0.0,0.000000,0.000000,0.003467,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
8001_2,0.000424,0.000562,0.0,0.0,0.033276,0.000131,0.0,0.000944,0.000018,0.000000,...,0.0,0.000367,0.0,0.0,0.000012,0.000208,0.0,0.0,0.0,0.000249
8001_3,0.000000,0.000000,0.0,0.0,0.116271,0.000000,0.0,0.000000,0.000000,0.005932,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
8001_4,0.000000,0.000000,0.0,0.0,0.091430,0.000000,0.0,0.000000,0.000000,0.002735,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
8001_5,0.000000,0.000000,0.0,0.0,0.081136,0.000000,0.0,0.000000,0.000000,0.002343,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8041_5,0.002461,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
8041_6,0.003770,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
8041_7,0.001856,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
8041_8,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000


## Also pull in their taxonomy table, which assigns reads to taxa. We'll use it down to genus level, and convert it for use in MICOM

In [4]:
# Build a "rank;rank;...;genus" label per ASV, then collapse the wide abundance
# table into one long-form row per (sample, taxon).
taxa = (pyreadr.read_r('rep_ASV_labels.rds')[None]
        .set_index('rep_ASV_label')
        .drop(columns = ['rep_ASV', 'Species']))
taxa.index.rename('Feature ID', inplace = True)
for col in taxa.columns:  # keep only the text after the last underscore so it can be read by AGORA
    taxa[col] = taxa[col].str.split('_').str[-1]

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on a temporary and leaves `taxa`
# unchanged once pandas copy-on-write is active.
taxa['Genus'] = taxa['Genus'].replace('', np.nan)
taxa = taxa.dropna(how = 'any')  # drop ASVs with any missing rank, including an empty genus

taxa = taxa.apply(lambda row: ";".join(row.str.capitalize().fillna("")), axis=1).to_frame()  # into MICOM accepted format
taxa.columns = ["Taxon"]
abundance = abundance[taxa.index]  # we only want ASVs in the abundance table that are present in the taxonomy table
taxa = taxa['Taxon'].to_dict()
abundance = abundance.rename(columns=taxa).reset_index()

# Melt into long form. value_vars is left unset so every column other than
# sample_id is melted; the previous `abundance.columns[:-1]` listed the id
# column itself and left out the last taxon column.
abundance = pd.melt(abundance, id_vars = 'sample_id',
                    var_name = 'id', value_name = 'abundance')
# Several ASVs can share one taxon label; sum them per sample.
abundance = abundance.groupby(by = ['sample_id', 'id']).sum().reset_index()
abundance['genus'] = abundance['id'].str.split(';').str[-1]  # need a genus column in df
abundance

Unnamed: 0,sample_id,id,abundance,genus
0,8001_1,Archaea;Euryarchaeota;Methanobacteria;Methanob...,0.0,Methanobrevibacter
1,8001_1,Archaea;Euryarchaeota;Methanobacteria;Methanob...,0.0,Methanosphaera
2,8001_1,Archaea;Euryarchaeota;Thermoplasmata;E2;[metha...,0.0,Methanomassiliicoccus
3,8001_1,Archaea;Euryarchaeota;Thermoplasmata;E2;[metha...,0.0,Vadinca11
4,8001_1,Bacteria;Actinobacteria;Actinobacteria;Actinom...,0.0,Actinomyces
...,...,...,...,...
15223,8041_9,Bacteria;Proteobacteria;Gammaproteobacteria;En...,0.0,Escherichia
15224,8041_9,Bacteria;Proteobacteria;Gammaproteobacteria;En...,0.0,Proteus
15225,8041_9,Bacteria;Spirochaetes;[brachyspirae];[brachysp...,0.0,Brachyspira
15226,8041_9,Bacteria;Synergistetes;Synergistia;Synergistal...,0.0,Pyramidobacter


## Before building our models we need to define our model database

In [5]:
# Path to the model database artifact passed as `model_db` when building models.
agora = '/proj/gibbons/refs/micom_dbs/agora103_genus.qza'

## Now we'll build the models using our abundance table. 

In [None]:
# Build the community models from the long-form abundance table.
# The package is imported as `micom` (there is no `mm` alias in this notebook),
# so the workflow has to be called through that name.
models = micom.workflows.build(
    abundance,
    out_folder = '/proj/gibbons/nbohmann/exvivo/sonnenburg_2021/models/',
    model_db = agora,
    cutoff = 0.001,  # relative-abundance cutoff for including a taxon -- see MICOM docs
    threads = 20,
)

## Let's try growing all samples at all timepoints on a high fiber diet, and seeing if there is a trend in SCFA production during intervention. First we'll look at the fiber consumption profile, and scale the diets by that value.

In [10]:
# Simulate growth of every sample's model on the high-fiber medium.
# A single chdir is enough; the original repeated the same call twice.
os.chdir('/proj/gibbons/nbohmann/exvivo/sonnenburg_2021/')

# NOTE(review): the build cell writes to 'models/', while the manifest and
# models are read from 'models/data' here -- confirm this is the intended folder.
manifest = pd.read_csv('models/data/manifest.csv')

# Rename the diet table's columns to the 'reaction' / 'flux' names used for the medium.
hf_medium = (pd.read_csv('/proj/gibbons/nbohmann/exvivo/diets/highfiber.tsv', sep = '\t')
             .rename(columns = {'Reaction': 'reaction', 'Flux Value': 'flux'}))

growth_hf = micom.workflows.grow(manifest, model_folder = 'models/data',
                                 medium = hf_medium, tradeoff = 0.7, strategy = 'none',
                                 threads = 10, presolve = True)

Output()



## Now we'll collect all the exchanges resulting from the growth of our models

In [None]:
# Total export flux per sample, timepoint, metabolite and reaction.
# Work on a copy: assigning columns on `growth_hf.exchanges` directly would
# modify the growth results in place, and re-running this cell would then
# split an already-split sample_id and lose the timepoint.
exchanges = growth_hf.exchanges.copy()

# sample_id was built as "<participant>_<timepoint>"; separate the two parts.
id_parts = exchanges['sample_id'].str.split('_')
exchanges['timepoint'] = id_parts.str[1]
exchanges['sample_id'] = id_parts.str[0]

exported = exchanges[exchanges.direction == "export"]
exchanges = (exported
    .groupby(["sample_id", "metabolite", "reaction", "timepoint"])
    .apply(lambda df: sum(df.flux * df.abundance))  # get total production
    .reset_index())  # the summed value ends up in a column named 0
exchanges

## Build a dataframe with all the SCFA predictions, removing timepoints 8 and 9 as these were after the study

In [None]:
 scfa = (exchanges[
    (exchanges['reaction'].str.startswith('EX_but(e)'))|
    (exchanges['reaction'].str.startswith('EX_ppa(e)'))|
    (exchanges['reaction'].str.startswith('EX_ac(e)'))]
         .rename(columns = {0:'flux'}))
scfa['timepoint'] = scfa['timepoint'].astype('str')
scfa['group'] = scfa['sample_id'].astype('int').map(groups)
scfa = pd.pivot_table(scfa,index = ['sample_id','timepoint'], columns = 'metabolite', values = 'flux').reset_index()
scfa['total'] = scfa['but[e]']+scfa['ppa[e]']
scfa['group'] = scfa['sample_id'].astype('int').map(groups)
scfa = scfa[~(scfa['timepoint'].str.contains('8'))& ~(scfa['timepoint'].str.contains('9'))]
scfa['timepoint'] = scfa['timepoint'].astype('int')
scfa

## Now, plot the predicted SCFA production among each inflammation group, at each timepoint across the intervention and on average.

In [None]:
# Predicted acetate over the intervention, one panel per immune-response
# group, with a linear fit through the points.
plt1 = (
    ggplot(scfa, aes(x = 'timepoint', y = 'ac[e]'))
    + geom_point(color = 'cadetblue')
    + geom_smooth(method = 'lm')
    # '\\dfrac' yields the same label text as before; a bare '\d' is an
    # invalid escape sequence that newer Python versions warn about.
    + labs(x = 'Timepoint', y = 'Predicted Acetate \n ($\\dfrac{mmol}{gDW*h}$)')
    + facet_wrap(facets = 'group')
    + theme(text = element_text(size=20),
            panel_background = element_rect(fill = "white", colour = "white",
                                            size = 0.5, linetype = "solid"),
            panel_grid = element_blank(),
            axis_line = element_line(size = 2, linetype = "solid", colour = "black"),
            axis_line_y = element_blank(),
            axis_text_x = element_text(rotation = 20, hjust = 1))
)
plt1

In [None]:
# Predicted butyrate + propionate over the intervention, one panel per
# immune-response group, with a linear fit through the points.
plt2 = (
    ggplot(scfa, aes(x = 'timepoint', y = 'total'))
    + geom_point(color = 'cadetblue')
    + geom_smooth(method = 'lm')
    # '\\dfrac' yields the same label text as before; a bare '\d' is an
    # invalid escape sequence that newer Python versions warn about.
    + labs(x = 'Timepoint', y = 'Predicted Butyrate+Propionate \n ($\\dfrac{mmol}{gDW*h}$)')
    + facet_wrap(facets = 'group')
    + theme(text = element_text(size=20),
            panel_background = element_rect(fill = "white", colour = "white",
                                            size = 0.5, linetype = "solid"),
            panel_grid = element_blank(),
            axis_line = element_line(size = 2, linetype = "solid", colour = "black"),
            axis_line_y = element_blank(),
            axis_text_x = element_text(rotation = 20, hjust = 1))
)
plt2

In [None]:
# Predicted acetate per immune-response group, pooled over timepoints.
# `timepoint` is an integer column by this point, so the exclusion of
# timepoints 8 and 9 has to compare against ints; the previous comparison
# with the strings '8' and '9' could never match.
plt3 = (
    ggplot(scfa[~scfa['timepoint'].isin([8, 9])], aes(x = 'group', y = 'ac[e]'))
    + geom_boxplot(fill = 'cadetblue', size = 1, width = 0.4)
    # Units changed from L*h to gDW*h to match the time-course acetate plot,
    # which shows this same column.
    + labs(x = 'Immune Response Group', y = 'Predicted Acetate ($\\dfrac{mmol}{gDW*h}$)')
    + scale_x_discrete(limits = ['HighInflammation', 'LowInflammationI', 'LowInflammationII'],
                       labels = ['High \n Inf.', 'Low \n Inf. I', 'Low \n Inf. II'])
    + theme(text = element_text(size=20, color = 'black'),
            panel_background = element_rect(fill = "white", colour = "white",
                                            size = 0.5, linetype = "solid"),
            panel_grid = element_blank(),
            axis_line = element_line(size = 2, linetype = "solid", colour = "black"),
            legend_position = 'right',
            axis_text_x = element_text(rotation = 20, hjust = 1))
)
plt3  # display this figure (the cell previously referenced an undefined `plt`)

In [None]:
# Predicted butyrate + propionate per immune-response group, pooled over
# timepoints. `timepoint` is an integer column by this point, so timepoints
# 8 and 9 are excluded by comparing against ints, not strings.
plt4 = (
    # The summed column is named 'total'; 'total[e]' does not exist in `scfa`.
    ggplot(scfa[~scfa['timepoint'].isin([8, 9])], aes(x = 'group', y = 'total'))
    + geom_boxplot(fill = 'cadetblue', size = 1, width = 0.4)
    # Units changed from L*h to gDW*h to match the time-course plot of this
    # same column.
    + labs(x = 'Immune Response Group', y = 'Predicted Butyrate + Propionate($\\dfrac{mmol}{gDW*h}$)')
    + scale_x_discrete(limits = ['HighInflammation', 'LowInflammationI', 'LowInflammationII'],
                       labels = ['High \n Inf.', 'Low \n Inf. I', 'Low \n Inf. II'])
    + theme(text = element_text(size=20, color = 'black'),
            panel_background = element_rect(fill = "white", colour = "white",
                                            size = 0.5, linetype = "solid"),
            panel_grid = element_blank(),
            axis_line = element_line(size = 2, linetype = "solid", colour = "black"),
            legend_position = 'right',
            axis_text_x = element_text(rotation = 20, hjust = 1))
)
plt4