## Here we'll try to replicate the results of Venturelli et al., measuring butyrate production in synthetically constructed _in vitro_ communities

In [None]:
import os
import pandas as pd
from plotnine import *
from tqdm import tqdm
import sklearn
import numpy as np
import scipy
import micom
import warnings
warnings.simplefilter(action='ignore')
%matplotlib inline

## Function for building our taxonomies

In [None]:
def build_all(taxa, model_dir='/proj/gibbons/nbohmann/exvivo/databases/agora103_genus/data/'):
    """Assemble one long taxonomy table covering every sample in ``taxa``.

    Parameters
    ----------
    taxa : pandas.DataFrame
        One row per sample (the index holds the sample ids) and one column
        per taxon. Only the first two characters of each column label are
        used; they are looked up in the module-level ``genera`` dictionary.
    model_dir : str, optional
        Directory that is prepended to every model file name. Defaults to
        the location that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``abundance``, ``file`` and ``sample_id`` with a
        fresh integer index. Rows containing any missing value (for example
        an abbreviation that is absent from ``genera``, or a NaN abundance)
        are dropped. An empty frame is returned when ``taxa`` has no rows.

    Notes
    -----
    No abundance threshold is applied here. The working directory is no
    longer changed: the ``file`` column already holds full paths, so the
    former ``os.chdir`` call on every iteration had no effect on the result.
    """
    per_sample = []
    for sample_id in tqdm(taxa.index):
        abundances = taxa.loc[sample_id]  # row of abundances for this sample
        # Two-letter abbreviation taken from the start of each column label.
        names = [label[:2] for label in abundances.index.to_list()]
        files = [genera.get(name) for name in names]  # None when the abbreviation is unknown
        sample_taxa = pd.DataFrame({
            "id": names,
            "abundance": abundances,
            "file": files,
            "sample_id": sample_id,
        }).dropna(how='any')
        sample_taxa = sample_taxa.assign(
            file=sample_taxa['file'].map(lambda name: os.path.join(model_dir, name))
        )
        per_sample.append(sample_taxa)
    if not per_sample:
        return pd.DataFrame()
    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated frame on every iteration.
    return pd.concat(per_sample, ignore_index=True)

## Let's start by initializing our dictionary of genera abbreviations

In [None]:
# Two-letter abbreviation -> model file name. The table is written grouped
# by model file and then inverted, so abbreviations that share a model sit
# on one line.
_abbreviations_by_model = {
    'Prevotella.json': ('PC',),
    'Parabacteroides.json': ('PJ',),
    'Bacteroides.json': ('BV', 'BF', 'BO', 'BT', 'BC', 'BY', 'BU'),
    'Desulfovibrio.json': ('DP',),
    'Bifidobacterium.json': ('BL', 'BA', 'BP'),
    'Collinsella.json': ('CA',),
    'Eggerthella.json': ('EL',),
    'Faecalibacterium.json': ('FP',),
    'Clostridium.json': ('CH', 'CG'),
    'Anaerostipes.json': ('AC',),
    'Blautia.json': ('BH',),
    'Eubacterium.json': ('ER',),
    'Roseburia.json': ('RI',),
    'Coprococcus.json': ('CC',),
    'Dorea.json': ('DL', 'DF'),
}
genera = {
    abbreviation: model_file
    for model_file, abbreviations in _abbreviations_by_model.items()
    for abbreviation in abbreviations
}

## Let's pull up the primary dataframe with abundances, ODs, and SCFAs. We'll add a column with richness measures. 

In [None]:
os.chdir('/proj/gibbons/nbohmann/exvivo/venturelli_2021/')
# Load the master table and discard the 'HB' column.
main = pd.read_csv('masterDF.csv', index_col=0).drop(columns='HB')

# Columns 11..36 (taken after 'HB' is dropped) are cast to integers, with
# missing entries counted as 0; their row-wise sum is stored as 'richness'.
richness_cols = main.columns[11:37]
main[richness_cols] = main[richness_cols].fillna(0).astype('int')
main['richness'] = main[richness_cols].sum(axis=1)

# Plate and Column: keep the text before any '.', then left-pad to two characters.
for well_part in ('Plate', 'Column'):
    before_dot = main[well_part].astype('str').str.split('.').str[0]
    main[well_part] = before_dot.apply(lambda text: text.zfill(2))

# Sample id layout: P<plate><row><column>_<last three characters of the sequencing run>.
main['Run'] = main['Sequencing Run'].str[-3:]
main['sample_id'] = 'P' + main['Plate'] + main['Row'] + main['Column'] + '_' + main['Run']

# Keep rows not flagged as contaminated, index by sample id, and when an id
# occurs more than once keep only its last occurrence.
main = main.loc[main['Contamination?'] == 'No'].set_index('sample_id')
main = main.loc[~main.index.duplicated(keep='last')]

## Let's also pull up the componentized diet dataframe, adding iron, which we previously found to be required for growth!

In [None]:
os.chdir('/proj/gibbons/nbohmann/exvivo/venturelli_2021/')
medium = pd.read_excel('DM38_components.xlsx')  # load the medium components table
medium = medium.rename(columns={'Component': 'reaction', 'Concentration (mM)': 'flux'})
# Turn each component name into an exchange reaction id: EX_<component>_m
medium['reaction'] = 'EX_' + medium['reaction'] + '_m'
# Extra iron row: a higher iron concentration seems to be necessary for growth.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
iron = pd.DataFrame([{'reaction': 'EX_fe3_m', 'flux': 0.50}])
medium = pd.concat([medium, iron], ignore_index=True)
# Index the table by reaction id (index named 'index') while keeping the
# 'reaction' column itself.
medium['index'] = medium['reaction']
medium = medium.set_index('index')

## Now we'll start with the low richness (1-5 species) communities

In [None]:
# Low-richness subset. This section targets the 1-5 species communities; the
# bound is strictly below 10 so that a community with richness == 10 belongs
# only to the high-richness subset (selected with richness >= 10) instead of
# appearing in both analyses.
low = main[main['richness'].astype('int') < 10]
# Abundance table: every '... Fraction' column except the B. cereus one,
# rounded to 4 decimals, with all-NaN rows removed.
taxa_low = [cols for cols in low.columns if 'Fraction' in cols]
taxa_low = low[taxa_low].drop(columns=['B.cereus Fraction'])
taxa_low = taxa_low.round(4).dropna(how='all')
# Per-sample lookups (sample_id -> value) used to scale and compare predictions.
OD_low = low['OD'].to_dict()
meas_but_low = low['Butyrate'].to_dict()
meas_ac_low = low['Acetate'].to_dict()
low

### Build the models for low richness communities

In [None]:
os.chdir('/proj/gibbons/nbohmann/exvivo/venturelli_2021/')
# Taxonomy table for the low-richness samples, then one community model per
# sample written into the low_richness folder.
taxonomy = build_all(taxa_low)
manifest_low = micom.workflows.build(
    taxonomy,
    out_folder='/proj/gibbons/nbohmann/exvivo/venturelli_2021/low_richness',
    model_db=None,  # no model database: model files come from the taxonomy's 'file' column
    cutoff=0.001,
    threads=10,
)

### Grow the models for low richness communities

In [None]:
# Reload the manifest written by the build step and simulate growth on the medium.
manifest_low = pd.read_csv('/proj/gibbons/nbohmann/exvivo/venturelli_2021/low_richness/manifest.csv')
growth = micom.workflows.grow(
    manifest_low,
    '/proj/gibbons/nbohmann/exvivo/venturelli_2021/low_richness',
    medium,
    tradeoff=0.7,
    threads=10,
)

# Abundance-weighted export flux, summed per sample / metabolite / reaction.
# reset_index() leaves the summed values in a column named 0.
exchanges = growth.exchanges
export_rows = exchanges.direction == "export"
exchanges = (
    exchanges[export_rows]
    .groupby(["sample_id", "metabolite", "reaction"])
    .apply(lambda group: sum(group.flux * group.abundance))
    .reset_index()
)

# Butyrate exchange only; attach OD and the measured butyrate divided by OD.
is_butyrate = exchanges.reaction.str.startswith('EX_but(e)')
but_low = exchanges[is_butyrate].rename(columns={0: 'predicted'})
but_low['OD'] = but_low['sample_id'].map(OD_low)
but_low['measured'] = but_low['sample_id'].map(meas_but_low) / but_low['OD']
# Keep rows whose scaled measurement lies in [0, 100]; NaN and inf fall outside.
but_low = but_low[but_low['measured'].between(0.0, 100.0)]

### Plot results for low richness communities

In [None]:
# Scaled measured value vs. predicted value for butyrate flux in the
# low-richness communities, with a dashed linear fit.
# The axis labels are raw strings: "\d" in "\dfrac" is not a valid Python
# escape sequence, so a plain literal triggers an invalid-escape warning.
# The text passed to plotnine is unchanged.
# NOTE: `plt` here is the plotnine figure, not matplotlib.pyplot.
plt = (
    ggplot(but_low, aes(x='measured', y='predicted'))
    + geom_point(size=3, color='cadetblue')
    + geom_smooth(method='lm', color='cadetblue', linetype='--')
    + labs(x=r'Measured Production Rate ($\dfrac{mmol}{h}$)',
           y=r'Predicted Production Rate ($\dfrac{mmol}{gDW*h}$)')
    + theme(text=element_text(size=15),
            panel_background=element_rect(fill="white", colour="white",
                                          size=0.5, linetype="solid"),
            panel_grid=element_blank(),
            axis_line=element_line(size=2, linetype="solid", colour="black"),
            legend_title=element_blank(),
            legend_position='right')
)
plt

In [None]:
# Pearson correlation between measured and predicted butyrate production
# (low-richness communities); the cell displays the p-value.
measured_low = but_low['measured']
predicted_low = but_low['predicted']
rho, p = scipy.stats.pearsonr(measured_low, predicted_low)
p

## Now, high richness models

In [None]:
# High-richness subset: communities whose richness is 10 or more.
high = main.loc[main['richness'].astype('int') >= 10]
# Abundance table: every '... Fraction' column except the B. cereus one,
# rounded to 4 decimals, with all-NaN rows removed.
fraction_cols = [label for label in high.columns if 'Fraction' in label]
taxa_high = (
    high[fraction_cols]
    .drop(columns=['B.cereus Fraction'])
    .round(4)
    .dropna(how='all')
)
# Per-sample lookups (sample_id -> value).
OD_high, meas_but_high, meas_ac_high = (
    high[column].to_dict() for column in ('OD', 'Butyrate', 'Acetate')
)

### Build the models for high richness communities

In [None]:
os.chdir('/proj/gibbons/nbohmann/exvivo/venturelli_2021/')
# Taxonomy table for the high-richness samples, then one community model per
# sample written into the high_richness folder.
taxonomy = build_all(taxa_high)
manifest_high = micom.workflows.build(
    taxonomy,
    out_folder="/proj/gibbons/nbohmann/exvivo/venturelli_2021/high_richness",
    model_db=None,  # no model database: model files come from the taxonomy's 'file' column
    cutoff=0.001,
    threads=20,
)

### Grow the models for high richness communities

In [None]:
os.chdir('/proj/gibbons/nbohmann/exvivo/venturelli_2021/')
# Reload the manifest written by the build step and simulate growth on the medium.
manifest_high = pd.read_csv('/proj/gibbons/nbohmann/exvivo/venturelli_2021/high_richness/manifest.csv')
growth = micom.workflows.grow(
    manifest_high,
    '/proj/gibbons/nbohmann/exvivo/venturelli_2021/high_richness',
    medium,
    tradeoff=0.7,
    threads=10,
)

# Abundance-weighted export flux, summed per sample / metabolite / reaction.
# reset_index() leaves the summed values in a column named 0.
exchanges = growth.exchanges
export_rows = exchanges.direction == "export"
exchanges = (
    exchanges[export_rows]
    .groupby(["sample_id", "metabolite", "reaction"])
    .apply(lambda group: sum(group.flux * group.abundance))
    .reset_index()
)

# Butyrate exchange only; attach OD and the measured butyrate divided by OD.
# No range filter is applied to this table.
is_butyrate = exchanges.reaction.str.startswith('EX_but(e)')
but_high = exchanges[is_butyrate].rename(columns={0: 'predicted'})
but_high['OD'] = but_high['sample_id'].map(OD_high)
but_high['measured'] = but_high['sample_id'].map(meas_but_high) / but_high['OD']

### Plot High Richness Results

In [None]:
# Scaled measured value vs. predicted value for butyrate flux in the
# high-richness communities, with a dashed linear fit.
# The axis labels are raw strings: "\d" in "\dfrac" is not a valid Python
# escape sequence, so a plain literal triggers an invalid-escape warning.
# The text passed to plotnine is unchanged.
# NOTE: `plt` here is the plotnine figure, not matplotlib.pyplot.
plt = (
    ggplot(but_high, aes(x='measured', y='predicted'))
    + geom_point(size=3, color="cadetblue")
    + geom_smooth(method='lm', color="cadetblue", linetype='--')
    + labs(x=r'Measured Production Rate ($\dfrac{mmol}{h}$)',
           y=r'Predicted Production Rate ($\dfrac{mmol}{gDW*h}$)')
    + theme(text=element_text(size=20),
            panel_background=element_rect(fill="white", colour="white",
                                          size=0.5, linetype="solid"),
            panel_grid=element_blank(),
            axis_line=element_line(size=2, linetype="solid", colour="black"),
            legend_title=element_blank(),
            legend_position='right')
)
plt

In [None]:
# Pearson correlation between measured and predicted butyrate production
# (high-richness communities); the cell displays the p-value.
# `measured` is butyrate divided by OD and this table is not range-filtered,
# so a missing or zero OD leaves NaN/inf values that pearsonr cannot use.
# Correlate the finite pairs only; with clean data this changes nothing.
finite_pairs = np.isfinite(but_high[['measured', 'predicted']].astype(float)).all(axis=1)
rho, p = scipy.stats.pearsonr(but_high.loc[finite_pairs, 'measured'],
                              but_high.loc[finite_pairs, 'predicted'])
p

## Save results

In [None]:
# Write both result tables to CSV (low-richness first, then high-richness).
results_by_path = {
    '/proj/gibbons/nbohmann/exvivo/scfa_paper/venturelli_small_g.csv': but_low,
    '/proj/gibbons/nbohmann/exvivo/scfa_paper/venturelli_big_g.csv': but_high,
}
for csv_path, result_table in results_by_path.items():
    result_table.to_csv(csv_path)