# Model Construction
This notebook constructs sample-specific microbial community-scale metabolic models (MCMMs) for samples in Validation Study B, to compare predicted probiotic engraftment against experimental data. Data are from Dsouza et al. 2022 (DOI: 10.1016/j.chom.2022.03.016).

In [None]:
import micom
import pandas as pd

# Validation Study B

## Collect Metadata

In [None]:
# Load the per-run metadata table for Validation Study B
metadata = pd.read_csv('../data/studyB_metadata.csv')

# 'Cohort;_Subject_ID' packs two ';'-separated identifiers into one field;
# expand them into their own columns
id_parts = metadata['Cohort;_Subject_ID'].str.split(';', expand=True)
metadata[['cohort', 'subject_id']] = id_parts

# Keep only the columns used downstream, then rename the run accession
# column so it matches the sample identifier used in the count table
columns_of_interest = ['Run', 'study_timepoint', 'cohort', 'subject_id']
metadata = metadata[columns_of_interest]
metadata = metadata.rename(columns={'Run': 'sample_id'})

# Lookup table: sample_id -> subject_id
subject_by_sample = metadata.set_index('sample_id')['subject_id']
subject_dict = subject_by_sample.to_dict()

metadata.head()

## Read Taxonomic Assignments from Kraken2
Collect the species-level read counts, and separate samples into placebo and treatment samples

In [None]:
# Load species-level read counts and rename columns to the names used below
counts = pd.read_csv('../data/studyB_S_counts.csv').rename(columns={'name': 'species', 'fraction_total_reads': 'abundance'})

# Match sample ID format to metadata (strip the leading 'S_' prefix)
counts['sample_id'] = counts['sample_id'].str.replace(r'^S_', '', regex=True)

# Merge with metadata (left join: count rows without a metadata match are kept)
counts = counts.merge(metadata, on='sample_id', how='left')

# Sort table
counts = counts.sort_values(by=['cohort', 'study_timepoint'])

# Isolate baseline samples ('Day 06'), excluding 'Cohort 6'.
# Take an explicit copy: the column assignments below would otherwise write
# to a slice of `counts` and trigger pandas' SettingWithCopyWarning.
is_baseline = (counts['study_timepoint'] == 'Day 06') & (counts['cohort'] != 'Cohort 6')
counts_baseline = counts[is_baseline].copy()

# Rename incorrectly labeled species (literal substring replacement, so
# regex=False is passed explicitly rather than relying on the pandas default)
counts_baseline['species'] = counts_baseline['species'].str.replace('oxytoca', 'michiganensis', regex=False)
counts_baseline['species'] = counts_baseline['species'].str.replace('Segatella', 'Prevotella', regex=False)

# Remove unused cohorts
counts_baseline = counts_baseline[(counts_baseline.cohort != 'Cohort Vanco') & (counts_baseline.cohort != 'Cohort Sentinel')]

# Format for MICOM: taxon ids are the species names with spaces replaced by underscores
counts_baseline['id'] = counts_baseline['species'].str.replace(' ', '_', regex=False)

# Sum numeric columns over rows that share the same sample/species keys
# (species renamed above may now collide with an existing row)
counts_baseline = counts_baseline.groupby(['sample_id', 'species', 'id', 'cohort', 'subject_id', 'study_timepoint']).sum(numeric_only=True).reset_index()

# From here on, samples are identified by subject rather than by sequencing run
# NOTE(review): rows are aggregated per run above, so a subject with more than
# one run at this timepoint would keep duplicate (sample_id, species) rows — confirm
counts_baseline = counts_baseline.drop(columns='sample_id').rename(columns={'subject_id': 'sample_id'})
counts_baseline = counts_baseline[['sample_id', 'species', 'id', 'abundance', 'cohort']]

# Remove sample without follow-up
# (the leading space in the identifier is intentional: it is what remains
# after splitting the combined 'Cohort;_Subject_ID' field on ';')
counts_baseline = counts_baseline[counts_baseline.sample_id != ' Subject 102']

# Create sample_id to cohort dictionary
cohort_dict = counts_baseline.drop_duplicates(subset='sample_id').set_index('sample_id')['cohort'].to_dict()
counts_baseline.sample_id.nunique()

## Add Probiotics to Treatment Arm
Samples in the treatment arm are supplemented with a probiotic cocktail, VE303

In [None]:
def add_all_probiotic(taxonomy):
    """Add the six-species probiotic consortium to every sample.

    For each unique ``sample_id`` the existing abundances are multiplied by
    0.2 and six probiotic taxa are appended, each with abundance ``80/6``.
    Rows sharing the same ``sample_id``, ``id``, ``species`` and ``cohort``
    (e.g. a probiotic species already present in the sample) are summed.

    Parameters
    ----------
    taxonomy : pandas.DataFrame
        Table with at least the columns ``sample_id``, ``species``, ``id``,
        ``abundance`` and ``cohort``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (``sample_id``, ``id``, ``species``, ``cohort``) with the
        summed ``abundance``, sorted by those keys. An empty ``DataFrame`` is
        returned when ``taxonomy`` contains no samples.
    """
    # NOTE(review): the 0.2 / 80 split yields an 80% probiotic community only
    # if each sample's input abundances sum to 100 — confirm the units of
    # `abundance` in the input table.
    resident_scale = 0.2
    probiotic_species = ['Enterocloster bolteae', 'Anaerotruncus colihominis',
                         'Sellimonas intestinalis', '[Clostridium] symbiosum',
                         'Dorea longicatena', 'Flavonifractor plautii']
    # NOTE(review): 'Clostridium_symbiosum' drops the brackets, whereas ids
    # built by replacing spaces with underscores would give
    # '[Clostridium]_symbiosum'; such rows would not be merged by the
    # groupby below — confirm which id is intended.
    probiotic_ids = ['Enterocloster_bolteae', 'Anaerotruncus_colihominis',
                     'Sellimonas_intestinalis', 'Clostridium_symbiosum',
                     'Dorea_longicatena', 'Flavonifractor_plautii']
    n_probiotics = len(probiotic_species)
    probiotic_abundance = 80 / 6

    # Collect per-sample frames and concatenate once at the end instead of
    # growing (and re-aggregating) a DataFrame on every iteration.
    frames = []
    for sample in taxonomy['sample_id'].unique():
        residents = taxonomy[taxonomy['sample_id'] == sample].copy()
        residents['abundance'] = residents['abundance'] * resident_scale

        probiotics = pd.DataFrame({
            'sample_id': [sample] * n_probiotics,
            'species': probiotic_species,
            'abundance': [probiotic_abundance] * n_probiotics,
            'id': probiotic_ids,
            # Probiotic rows inherit the cohort of the sample they are added to
            'cohort': [residents['cohort'].unique()[0]] * n_probiotics,
        })
        frames.extend([residents, probiotics])

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames)
    # Merge probiotic rows with any matching taxa already present in a sample
    return combined.groupby(['sample_id', 'id', 'species', 'cohort']).sum().reset_index()


# Supplement every sample in counts_baseline with the probiotic consortium
counts_probiotic = add_all_probiotic(taxonomy=counts_baseline)

## Build Models
Build MCMMs for samples in the treatment arm and the placebo arm

In [None]:
# Model database artifact passed to MICOM as `model_db`
agora = '../agora201_refseq216_species_1.qza'

# Treatment group: build one community model per probiotic-supplemented
# sample and write the results to ../VE303_treated
micom.workflows.build(
    counts_probiotic,
    model_db=agora,
    out_folder='../VE303_treated',
    cutoff=0.001,
    threads=10,
)