## Here we'll try to replicate the results of Venturelli et al., measuring butyrate production in synthetically constructed _in vitro_ communities.

In [None]:
import os
import pandas as pd
from plotnine import *
import numpy as np
import scipy
import micom
import micom.measures

%matplotlib inline

## Taxonomy Building Function
This function builds a single taxonomy table covering every community (one row per taxon per sample) from the abundance dataframe.

In [None]:
def build_all(taxa, genus_map=None):
    """Build one long-format taxonomy table covering every community in ``taxa``.

    Parameters
    ----------
    taxa : pandas.DataFrame
        One row per sample (the index holds the sample ids) and one column per
        taxon. The first two characters of each column label are used as the
        taxon id, and the cell values are used as abundances.
    genus_map : mapping, optional
        Maps a two-character taxon id to its genus name. Defaults to the
        module-level ``genera`` dictionary.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``abundance``, ``genus`` and ``sample_id``, with one
        row per (sample, taxon) pair. Rows with a missing abundance, or whose
        id is absent from ``genus_map``, are dropped. The index repeats the
        original column labels of ``taxa``. An empty ``taxa`` yields an empty
        frame with those four columns.
    """
    if genus_map is None:
        # Looked up at call time, so the dictionary may be defined in a later cell.
        genus_map = genera

    # Taxon ids and genera depend only on the columns, so derive them once.
    ids = [column[:2] for column in taxa.columns]
    genus_names = [genus_map.get(taxon_id) for taxon_id in ids]

    per_sample = []
    for sample_id, abundances in taxa.iterrows():
        sample_taxa = pd.DataFrame({
            "id": ids,
            "abundance": abundances,
            "genus": genus_names,
            "sample_id": sample_id,
        })
        # Drops taxa with no abundance value and ids with no known genus.
        per_sample.append(sample_taxa.dropna(how="any"))

    if not per_sample:
        return pd.DataFrame(columns=["id", "abundance", "genus", "sample_id"])
    # A single concat avoids re-copying the growing table on every iteration.
    return pd.concat(per_sample)

## Genera Abbreviations

In [None]:
# (genus, two-letter taxon codes) pairs, listed in the order the codes appear
# in the resulting mapping. A genus may occur more than once to keep that order.
_GENUS_CODES = (
    ("Prevotella", ("PC",)),
    ("Parabacteroides", ("PJ",)),
    ("Bacteroides", ("BV", "BF", "BO", "BT", "BC", "BY", "BU")),
    ("Desulfovibrio", ("DP",)),
    ("Bifidobacterium", ("BL", "BA", "BP")),
    ("Collinsella", ("CA",)),
    ("Eggerthella", ("EL",)),
    ("Faecalibacterium", ("FP",)),
    ("Clostridium", ("CH",)),
    ("Anaerostipes", ("AC",)),
    ("Blautia", ("BH",)),
    ("Clostridium", ("CG",)),
    ("Eubacterium", ("ER",)),
    ("Roseburia", ("RI",)),
    ("Coprococcus", ("CC",)),
    ("Dorea", ("DL", "DF")),
)

# Two-letter taxon code -> genus name, used by build_all to annotate each taxon.
genera = {code: genus for genus, codes in _GENUS_CODES for code in codes}

## Experimental Data
Let's pull up the primary dataframe with abundances, ODs, and SCFAs. We'll add a column with richness measures. 

In [None]:
def _two_digit(column):
    """Return ``column`` as strings: the text before the first '.', zero-padded to width 2."""
    whole_part = column.astype('str').str.split('.').str[0]
    return whole_part.apply(lambda text: text.zfill(2))


# Load the raw table; the second CSV column serves as the initial index.
main = pd.read_csv('../data/invitro_original.csv', index_col=1)

# Columns at positions 11-36 hold the per-taxon values; blanks become 0 and
# everything is cast to int.
taxon_cols = main.columns[11:37]
main[taxon_cols] = main[taxon_cols].fillna(0).astype('int')

# Sample richness is the row-wise sum over those columns.
main['richness'] = main[taxon_cols].sum(axis=1)

# Normalise plate and column numbers to two-character strings (e.g. 1.0 -> '01').
main['Plate'] = _two_digit(main['Plate'])
main['Column'] = _two_digit(main['Column'])

# The run label is the last three characters of 'Sequencing Run'.
main['Run'] = main['Sequencing Run'].str[-3:]

# Unique sample identifier: P<plate><row><column>_<run>.
main['sample_id'] = 'P' + main['Plate'] + main['Row'] + main['Column'] + '_' + main['Run']

# Keep uncontaminated samples, keep the first row per sample_id, and index by it.
main = (
    main[main['Contamination?'] == 'No']
    .drop_duplicates(subset='sample_id')
    .set_index('sample_id')
)
main

## Medium 
Load the medium used in the experiments, broken down into its individual components as described in the manuscript.

In [None]:
# Read the growth-medium table; it is passed to micom.workflows.grow further down.
medium = pd.read_csv('../media/DM38.csv')
medium  # display the table in the notebook

# Low Richness Communities
We'll start with low richness communities, and build taxonomy tables for these communities. 

In [None]:
# Subset to the communities whose richness is at most 5.
low = main.loc[main['richness'].astype('int') <= 5]

# Per-taxon 'Fraction' columns without the B. cereus column, rounded to four
# decimals; samples with no fraction values at all are discarded.
taxaLow = (
    low[[name for name in low.columns if 'Fraction' in name]]
    .drop(columns=['B.cereus Fraction'])
    .round(4)
    .dropna(how='all')
)

# Per-sample lookups (keyed by sample_id) used later to scale the measurements.
measButLow = low['Butyrate'].to_dict()  # measured butyrate
odLow = low['OD'].to_dict()  # optical density
taxaLow

## Build Models
Now we'll build our models, using an abundance cutoff of 0.001.

In [None]:
# Long-format taxonomy table for the low-richness communities.
taxonomy = build_all(taxaLow)

# Build the community models into the low_richness folder from the model
# database, using an abundance cutoff of 0.001 and 10 worker threads.
manifestLow = micom.workflows.build(
    taxonomy,
    out_folder='../models/invitro/low_richness',
    model_db='../agora/data',
    cutoff=0.001,
    threads=10,
)

## Grow Models
Now we'll grow the low-richness samples using the medium we loaded above.

In [None]:
# Reload the manifest from the model folder so this cell can run without
# re-executing the build step.
manifestLow = pd.read_csv('../models/invitro/low_richness/manifest.csv')

# Simulate growth of every low-richness community on the medium.
growthLow = micom.workflows.grow(
    manifestLow,
    '../models/invitro/low_richness',
    medium,
    tradeoff=0.7,
    threads=10,
)

## Compare SCFA Fluxes
We can now extract the production fluxes of SCFAs from each growth simulation, and construct a dataframe with both measured and predicted production rates.

In [None]:
# Production rates computed from the low-richness growth results.
productionLow = micom.measures.production_rates(growthLow)

# Keep the butyrate rows only; the simulated flux becomes the 'predicted' column.
butLow = (
    productionLow.loc[productionLow['name'] == 'butyrate']
    .rename(columns={'flux': 'predicted'})
)

# Attach each sample's OD and the measured butyrate divided by that OD.
butLow['OD'] = butLow['sample_id'].map(odLow)
butLow['measured'] = butLow['sample_id'].map(measButLow) / butLow['OD']

# Remove artifacts caused by small or negative OD: keep 0 <= measured <= 100.
butLow = butLow[(butLow['measured'] >= 0.0) & (butLow['measured'] <= 100.0)]
butLow

# High Richness
Now we repeat the same steps for the high richness communities.

In [None]:
# Subset to the communities whose richness is at least 10.
high = main.loc[main['richness'].astype('int') >= 10]

# Per-taxon 'Fraction' columns without the B. cereus column, rounded to four
# decimals; samples with no fraction values at all are discarded.
taxaHigh = (
    high[[name for name in high.columns if 'Fraction' in name]]
    .drop(columns=['B.cereus Fraction'])
    .round(4)
    .dropna(how='all')
)

# Per-sample lookups (keyed by sample_id) used later to scale the measurements.
measButHigh = high['Butyrate'].to_dict()  # measured butyrate
odHigh = high['OD'].to_dict()  # optical density

## Build
Now we'll build the high-richness models, again using an abundance cutoff of 0.001.

In [None]:
# Long-format taxonomy table for the high-richness communities.
taxonomy = build_all(taxaHigh)

# Build the community models into the high_richness folder from the model
# database, using an abundance cutoff of 0.001 and 10 worker threads.
# The result is stored as manifestHigh so the low-richness manifest is not
# overwritten by this cell.
manifestHigh = micom.workflows.build(
    taxonomy,
    out_folder='../models/invitro/high_richness',
    model_db='../agora/data',
    cutoff=0.001,
    threads=10,
)

## Grow
Now we'll grow the high-richness samples using the same medium.

In [None]:
# Reload the manifest from the model folder so this cell can run without
# re-executing the build step.
manifestHigh = pd.read_csv('../models/invitro/high_richness/manifest.csv')

# Simulate growth of every high-richness community on the medium. The manifest
# passed here must be the high-richness one read just above, matching the
# high_richness model folder.
growthHigh = micom.workflows.grow(
    manifestHigh,
    '../models/invitro/high_richness',
    medium,
    tradeoff=0.7,
    threads=10,
)

## Compare SCFA Fluxes
We can now extract the production fluxes of SCFAs from each growth simulation, and construct a dataframe with both measured and predicted production rates.

In [None]:
# Production rates computed from the high-richness growth results.
productionHigh = micom.measures.production_rates(growthHigh)

# Keep the butyrate rows only; the simulated flux becomes the 'predicted' column.
butHigh = (
    productionHigh.loc[productionHigh['name'] == 'butyrate']
    .rename(columns={'flux': 'predicted'})
)

# Attach each sample's OD and the measured butyrate divided by that OD.
butHigh['OD'] = butHigh['sample_id'].map(odHigh)
butHigh['measured'] = butHigh['sample_id'].map(measButHigh) / butHigh['OD']

## Save results

In [None]:
# Richness labels, defined once so the values written to the rows and the
# category order below always agree (mismatched labels make the category
# reordering raise a ValueError).
LOW_LABEL = 'Low (1-5 taxa)'
HIGH_LABEL = 'High (10-25 taxa)'

# assign() returns new frames, so the filtered butLow slice is not written to in place.
butLow = butLow.assign(Richness=LOW_LABEL)
butHigh = butHigh.assign(Richness=HIGH_LABEL)

# Stack both richness groups into one table.
butyrateTot = pd.concat([butLow, butHigh])

# Categorical with an explicit category order: low richness first, then high.
# This also works when one of the two groups has no rows.
butyrateTot['Richness'] = pd.Categorical(
    butyrateTot['Richness'],
    categories=[LOW_LABEL, HIGH_LABEL],
)
butyrateTot.to_csv('../results/invitro.csv')

## Plot Results
Plot the results of butyrate predictions in both contexts. (Fig 2)

In [None]:
# Scatter of OD-scaled measured butyrate against predicted butyrate flux, with
# a linear fit, one panel per richness group (independent axes per panel).
plot = (
    ggplot(butyrateTot, aes(x='measured', y='predicted'))
    + geom_point(size=3, color="cornflowerblue")
    + geom_smooth(method='lm', linetype='--')
    + labs(
        x='Measured, mmol/OD/h',
        y='Predicted, mmol/gDW/h',
        title='Butyrate Production',
    )
    + facet_wrap('Richness', scales='free')
    # White panels without grid lines and with heavy black axis lines.
    + theme(
        figure_size=(10, 5),
        text=element_text(size=20),
        panel_background=element_rect(fill="white", colour="white", size=0.5, linetype="solid"),
        panel_grid=element_blank(),
        axis_line=element_line(size=2, linetype="solid", colour="black"),
        legend_title=element_blank(),
        legend_position='right',
    )
)
plot