## This notebook shows the full workflow for building models, simulating growth and obtaining SCFA predictions from data collected by the _ex vivo_ study conducted by Gurry et al. 2021 (Study C)

In [None]:
import pandas as pd
import numpy as np 
# import qiime2 as q2
import micom as mm
from micom.viz import plot_tradeoff
from plotnine import *
import os
import sys
from tqdm import tqdm

pd.options.mode.chained_assignment = None  # default='warn'


%matplotlib inline

## First, we pull in the taxonomy table, which matches each feature ID in the qiime2 output to a microbial taxon at the species level. We will build our models at the genus level, so we collapse to that rank.

In [None]:
# Work from the directory that holds the exported qiime2 taxonomy table.
os.chdir('/proj/gibbons/nbohmann/exvivo/gurry1/data/qiime2/taxonomy/data')

rank_labels = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

# One row per feature ID; the ';'-separated 'Taxon' string becomes one column per rank.
lineages = (
    pd.read_csv('taxonomy.tsv', sep='\t')
    .set_index('Feature ID')['Taxon']
    .str.split(';', expand=True)
    .rename(columns=dict(enumerate(rank_labels)))
)

# Discard features (rows) that have no entry at the genus position, then
# remove the species column, since the taxon ID stops at genus.
lineages = lineages.dropna(subset=['Genus']).drop(columns=['Species'])

# Keep the third '_'-separated piece of every entry.
# NOTE(review): presumably this strips rank prefixes such as "g__"; a name that
# itself contains '_' would be cut at its first underscore — confirm against taxonomy.tsv.
lineages = lineages.apply(lambda rank: rank.str.split('_').str[2])

# Join the capitalised ranks with ';' into a single identifier per feature
# (missing ranks become empty strings) and store it in a 'taxon' column.
taxon_ids = lineages.apply(
    lambda row: ";".join(row.str.capitalize().fillna("")), axis=1
)
taxa = taxon_ids.to_frame().rename(columns={0: 'taxon'})
taxa

## Next we'll pull in the abundance table, with read counts for all present taxa. We'll drop those that aren't identified in the taxon list, and sum together duplicates. 

In [None]:
# `q2.Artifact` is used below, but the notebook's import cell has the qiime2
# import commented out, so bring it into scope here.
import qiime2 as q2

os.chdir('/proj/gibbons/nbohmann/exvivo/gurry1/data/qiime2/')
unrarefied_table = q2.Artifact.load('table.qza')  # read table

# Samples are on the index of the viewed table; expose them as a 'sample_id'
# column regardless of whether the index already carries a name.
abundance = (
    unrarefied_table.view(pd.DataFrame)
    .rename_axis('sample_id')
    .reset_index()
)

# Feature columns without an entry in the taxonomy table cannot be assigned
# to a taxon, so they are removed.
feature_columns = abundance.columns[1:]
to_drop = feature_columns[~feature_columns.isin(taxa.index)].to_list()  # taxa to drop
abundance = abundance.drop(columns=to_drop)

# Melt every remaining feature column into long form. (Previously
# `value_vars=abundance.columns[:-1]` was passed, which named the id column as
# a value column and left out the last feature column.) Melting before mapping
# feature IDs to taxa also keeps the column labels unique.
abundance = abundance.melt(id_vars='sample_id', var_name='id', value_name='abundance')
abundance['id'] = abundance['id'].map(taxa['taxon'])

# Several features can share one taxon identifier: sum their read counts.
abundance = abundance.groupby(by=['sample_id', 'id']).sum().reset_index()
abundance['genus'] = abundance['id'].str.split(';').str[-1]  # need a genus column in df
abundance

## We also need a model database to pull our reconstructions from 

In [None]:
# Location of the model database handed to the MICOM build step
# (the file name suggests a genus-level AGORA 1.03 database).
agora = '/proj/gibbons/refs/micom_dbs/agora103_genus.qza'

## Now, we'll build our models, with a cutoff of 0.001

In [None]:
# Build the community models from the long-form abundance table using the
# model database selected above.
# NOTE(review): models are written to `16S_models_01/`, while the later cells
# read `16S_models/manifest.csv` and pass `model_folder='16S_models'` —
# confirm which folder is intended.
models = mm.workflows.build(
    abundance,
    model_db=agora,
    out_folder='/proj/gibbons/nbohmann/exvivo/gurry1/micom/16S/16S_models_01/',
    cutoff=0.001,
    threads=20,
)

## Load in the carbon-stripped European Diet, and construct the intervention diets by augmenting with inulin and pectin

In [None]:
def _with_fiber(base_medium, keyword, exchange_id, uptake):
    """Return `base_medium` without rows whose reaction contains `keyword`,
    followed by a single row setting `exchange_id` to the flux `uptake`."""
    retained = base_medium[~base_medium.reaction.str.contains(keyword)]
    supplement = pd.DataFrame({'reaction': [exchange_id], 'flux': [uptake]})
    return pd.concat([retained, supplement])


# Base medium: every flux bound of the stored diet is scaled to a tenth.
os.chdir('/proj/gibbons/nbohmann/exvivo/diets')
medium = pd.read_csv('european_agora_low_carb.csv')
medium['flux'] = 0.1 * medium['flux']

# Manifest describing the previously built community models.
os.chdir('/proj/gibbons/nbohmann/exvivo/gurry1/micom/16S/')
manifest = pd.read_csv('16S_models/manifest.csv')

# Intervention media: the base medium with the pectin / inulin exchange replaced.
pectin_medium = _with_fiber(medium, 'pect', 'EX_pect_m', 0.75)
inulin_medium = _with_fiber(medium, 'inulin', 'EX_inulin_m', 10.5)

## Now we'll grow the samples using the respective media we constructed

In [None]:
os.chdir('/proj/gibbons/nbohmann/exvivo/gurry1/micom/16S/')

# Settings shared by all three simulations; only the medium differs.
shared_settings = {'tradeoff': 0.7, 'strategy': 'none', 'threads': 12}

ctrl_growth = mm.workflows.grow(
    manifest, model_folder='16S_models', medium=medium, **shared_settings
)
pectin_growth = mm.workflows.grow(
    manifest, model_folder='16S_models', medium=pectin_medium, **shared_settings
)
inulin_growth = mm.workflows.grow(
    manifest, model_folder='16S_models/', medium=inulin_medium, **shared_settings
)

## This function will get the growth results from each sample, and filter down to butyrate, propionate and acetate.

In [None]:
def get_fluxes(growth, cond):
    """Total the exported butyrate, propionate and acetate flux per sample.

    Parameters
    ----------
    growth : object
        Anything exposing an ``exchanges`` DataFrame with the columns
        ``sample_id``, ``metabolite``, ``reaction``, ``direction``, ``flux``
        and ``abundance``.
    cond : str
        Condition label appended to every sample ID.

    Returns
    -------
    pandas.DataFrame
        One row per (sample_id, metabolite, reaction). The summed
        ``flux * abundance`` of the export rows is stored in a column named
        ``0``; the ``index`` column holds ``"<sample_id>_<cond>"``.
    """
    exchanges = growth.exchanges
    reaction_ids = exchanges.reaction

    # Keep only the three SCFA exchange reactions.
    is_scfa = reaction_ids.str.startswith('EX_but(e)')
    for prefix in ('EX_ppa(e)', 'EX_ac(e)'):
        is_scfa = is_scfa | reaction_ids.str.startswith(prefix)
    scfa = exchanges[is_scfa]

    # Only export rows count towards production.
    exported = scfa[scfa.direction == "export"]

    # Abundance-weighted flux, summed over all rows of each group; the
    # unnamed result ends up in a column labelled 0 after reset_index().
    per_group = exported.groupby(["sample_id", "metabolite", "reaction"])
    totals = per_group.apply(lambda rows: sum(rows.flux * rows.abundance)).reset_index()

    totals['index'] = totals['sample_id'] + '_' + cond
    return totals

## Now we'll concatenate a dataframe with all our growth results

In [None]:
# Stack the predicted SCFA export fluxes of the three conditions in a single
# concat. (Seeding the concat with an empty DataFrame relies on pandas'
# deprecated handling of empty entries and copies the data on every step.)
predicted = pd.concat([
    get_fluxes(ctrl_growth, 'CTRL'),
    get_fluxes(inulin_growth, 'INUL'),
    get_fluxes(pectin_growth, 'PECT'),
])

# One row per "<sample_id>_<condition>", one column per exchange reaction;
# column 0 of the long table holds the summed flux.
predicted = pd.pivot_table(predicted, index='index', columns='reaction', values=0)

# Lookup tables "<sample_id>_<condition>" -> predicted flux, one per SCFA.
but = predicted['EX_but(e)'].to_dict()
ppa = predicted['EX_ppa(e)'].to_dict()
ac = predicted['EX_ac(e)'].to_dict()

## This function will calculate production rate from the experimental SCFA measurements, as well as standard deviations, and concatenate them into a dataframe

In [None]:
def flux_calculate(arg, data_dir='/proj/gibbons/nbohmann/exvivo/gurry1/data/gc_data/normalized'):
    """Summarise the measured SCFA changes stored in one GC data file.

    The file is read with its first column as index. Index labels are split
    on '-' and interpreted positionally as
    ``<sample>-<treatment>-<timepoint>-<unused>-<replicate>``.

    Parameters
    ----------
    arg : str
        File name of the CSV to read, relative to `data_dir`.
    data_dir : str, optional
        Directory containing the file. The file is opened by path, so the
        process working directory is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per (sample, treatment) with the mean of the differences
        between consecutive timepoints in ``but``, ``pro`` and ``ace`` and
        their standard deviations in ``but_dev``, ``ppa_dev`` and ``ace_dev``.

    Notes
    -----
    The differences are not divided by elapsed time here.
    NOTE(review): confirm the values are meant to be read as rates per sampling interval.
    """
    measurements = pd.read_csv(os.path.join(data_dir, arg), index_col=0)
    measurements = measurements[['but', 'pro', 'ace']].dropna()
    # Explicit copy: columns are added below, which must not depend on the
    # notebook-wide suppression of chained-assignment warnings.
    measurements = measurements[~measurements.index.str.contains("QC")].copy()

    label_parts = measurements.index.str.split('-')
    measurements['sample'] = label_parts.str[0]
    measurements['treatment'] = label_parts.str[1]
    measurements['timepoint'] = label_parts.str[2]
    measurements['replicate'] = label_parts.str[4]
    # Labels with fewer than five parts leave gaps; drop those rows.
    measurements = measurements.dropna()

    # Duplicate the rows whose timepoint contains '0' under the INUL and PECT
    # treatments.
    # NOTE(review): every one of these rows is removed again by the timepoint
    # filter a few lines below, so this step currently has no effect on the
    # result — confirm whether a baseline subtraction was intended.
    baseline = measurements[measurements.timepoint.str.contains('0')]
    measurements = pd.concat([
        measurements,
        baseline.assign(treatment='INUL'),
        baseline.assign(treatment='PECT'),
    ])

    # Keep rows whose original label mentions one of the three treatments,
    # then drop every timepoint containing '0'.
    label_has_treatment = (
        measurements.index.str.contains('CTRL')
        | measurements.index.str.contains('PECT')
        | measurements.index.str.contains('INUL')
    )
    measurements = measurements[label_has_treatment]
    measurements = measurements[~measurements.timepoint.str.contains('0')]

    # Order the timepoints within each replicate and difference consecutive ones.
    # NOTE(review): timepoints are compared as strings here ('10' sorts before '2').
    keys = ['sample', 'treatment', 'replicate', 'timepoint']
    measurements = measurements.sort_values(by=keys).set_index(keys)
    deltas = (
        measurements.groupby(['sample', 'treatment', 'replicate'])
        .diff()
        .dropna()
        .reset_index()
    )

    per_condition = deltas.groupby(['sample', 'treatment'])
    means = per_condition.mean(numeric_only=True).reset_index()
    spreads = (
        per_condition.std(numeric_only=True)
        .rename(columns={'but': 'but_dev', 'pro': 'ppa_dev', 'ace': 'ace_dev'})
        .reset_index()
    )

    # Attach the deviations by (sample, treatment). Looking them up by
    # treatment alone assigns the wrong values as soon as a file holds more
    # than one sample.
    return means.merge(spreads, on=['sample', 'treatment'], how='left')

## Now we can merge our predictions with the measured production rates for comparison

In [None]:
sample_list = ['H008-a.csv','H009-a.csv','H010-a.csv','H012-a.csv','H019-a.csv',
               'H020-a.csv','H021-a.csv','H025-a.csv','H028-a.csv','H029-a.csv']
os.chdir('/proj/gibbons/nbohmann/exvivo/gurry1/data/gc_data/normalized')

# Collect the per-file summaries and stack them in one concat. (Growing the
# frame inside the loop from an empty DataFrame re-copies all rows on every
# iteration and relies on pandas' deprecated handling of empty entries.)
flux = pd.concat([flux_calculate(x) for x in tqdm(sample_list)], ignore_index=True)

# Key every row as "<sample>_<treatment>", matching the keys of the
# prediction lookup tables.
flux['index'] = flux['sample'] + '_' + flux['treatment']
flux.set_index('index', inplace=True)
flux['predicted_but'] = flux.index.map(but)
flux['predicted_ac'] = flux.index.map(ac)
flux['predicted_ppa'] = flux.index.map(ppa)

# Rows with any missing value (e.g. no matching prediction) are discarded.
flux = flux.dropna()

# Spell out the treatment codes for plotting.
for code, label in (('CTRL', 'Control'), ('INUL', 'Inulin'), ('PECT', 'Pectin')):
    flux['treatment'] = flux['treatment'].str.replace(code, label)

## Finally, we'll plot the predicted fluxes against the measured fluxes

In [None]:
# Propionate: measured change (x) against predicted flux (y), coloured by
# treatment, with a linear fit and horizontal error bars of +/- one standard
# deviation of the measurement.
# The error-bar limits are aes expressions, so they are evaluated on the same
# (filtered) rows that are plotted. The axis labels are raw strings because
# '\d' is not a valid escape sequence in a normal string literal.
plt1 = (
    ggplot(
        flux[~flux.treatment.str.contains('FOS')], aes(x='pro', y='predicted_ppa'))
    + geom_point(aes(color='treatment'), size=5)
    + geom_smooth(method='lm', linetype='--')
    + geom_errorbarh(aes(y="predicted_ppa", xmin='pro - ppa_dev', xmax='pro + ppa_dev'))
    + scale_color_manual(values=['deepskyblue', 'darksalmon', 'yellowgreen'])
    + labs(title='Propionate', x=r'Measured ($\dfrac{mmol}{L*h}$)',
           y=r'Predicted ($\dfrac{mmol}{gDCW*h}$)')
    + theme(text=element_text(size=20),
            panel_background=element_rect(fill="white", colour="white",
                                          size=0.5, linetype="solid"),
            panel_grid=element_line(size=.2, linetype="solid", colour="gray"),
            axis_line=element_line(size=2, linetype="solid", colour="black"),
            legend_title=element_blank(),
            legend_position='right'))
plt1

In [None]:
# Butyrate: measured change (x) against predicted flux (y), coloured by
# treatment, with a linear fit and horizontal error bars of +/- one standard
# deviation of the measurement.
# The error-bar limits are aes expressions, so they are evaluated on the same
# (filtered) rows that are plotted. The axis labels are raw strings because
# '\d' is not a valid escape sequence in a normal string literal.
plt2 = (
    ggplot(
        flux[~flux.treatment.str.contains('FOS')], aes(x='but', y='predicted_but'))
    + geom_point(aes(color='treatment'), size=5)
    + geom_smooth(method='lm', linetype='--')
    + scale_color_manual(values=['deepskyblue', 'darksalmon', 'yellowgreen'])
    + geom_errorbarh(aes(y="predicted_but", xmin='but - but_dev', xmax='but + but_dev'))
    + labs(title='Butyrate', x=r'Measured ($\dfrac{mmol}{L*h}$)',
           y=r'Predicted ($\dfrac{mmol}{gDCW*h}$)')
    + theme(text=element_text(size=20),
            panel_background=element_rect(fill="white", colour="white",
                                          size=0.5, linetype="solid"),
            panel_grid=element_line(size=.2, linetype="solid", colour="gray"),
            axis_line=element_line(size=2, linetype="solid", colour="black"),
            legend_title=element_blank(),
            legend_position='right'))
plt2

In [None]:
# Acetate: measured change (x) against predicted flux (y), coloured by
# treatment, with a linear fit and horizontal error bars of +/- one standard
# deviation of the measurement.
# The error-bar limits are aes expressions, so they are evaluated on the same
# (filtered) rows that are plotted. The axis labels are raw strings because
# '\d' is not a valid escape sequence in a normal string literal.
plt3 = (
    ggplot(
        flux[~flux.treatment.str.contains('FOS')], aes(x='ace', y='predicted_ac'))
    + geom_point(aes(color='treatment'), size=5)
    + geom_smooth(method='lm', linetype='--')
    + scale_color_manual(values=['deepskyblue', 'darksalmon', 'yellowgreen'])
    + geom_errorbarh(aes(y="predicted_ac", xmin='ace - ace_dev', xmax='ace + ace_dev'))
    + labs(title='Acetate', x=r'Measured ($\dfrac{mmol}{L*h}$)',
           y=r'Predicted ($\dfrac{mmol}{gDCW*h}$)')
    + theme(text=element_text(size=20),
            panel_background=element_rect(fill="white", colour="white",
                                          size=0.5, linetype="solid"),
            panel_grid=element_line(size=.2, linetype="solid", colour="gray"),
            axis_line=element_line(size=2, linetype="solid", colour="black"),
            legend_title=element_blank(),
            legend_position='right'))
plt3

## Save all results

In [None]:
# Write the merged measured/predicted table to disk for downstream use.
results_path = '/proj/gibbons/nbohmann/exvivo/scfa_paper/gurry1.csv'
flux.to_csv(results_path)