# Model Construction
This notebook constructs sample-specific MCMMs for samples in Validation Study B, to compare predicted probiotic engraftment against experimental data. Data from Dsouza et al. 2022 (DOI: 10.1016/j.chom.2022.03.016)

In [5]:
import micom
import pandas as pd

# Validation Study B

## Collect Metadata

In [6]:
# Read Metadata 
metadata = pd.read_csv('../data/studyB_metadata.csv') 

# Split identifiers into separate columns
metadata[['cohort','subject_id']] = metadata['Cohort;_Subject_ID'].str.split(';', expand = True) 

# Isolate columns of interest
metadata = metadata[['Run','study_timepoint','cohort', 'subject_id']].rename(columns = {'Run':'sample_id'}) 

# Create dict for relevant metadata
subject_dict = metadata.set_index('sample_id')['subject_id'].to_dict()

metadata.head()

Unnamed: 0,sample_id,study_timepoint,cohort,subject_id
0,SRR15520622,Week 6,Cohort 6,Subject 194
1,SRR15520623,Week 6,Cohort 6,Subject 192
2,SRR15520624,Screening,Cohort Vanco,Subject 64
3,SRR15520625,Week 6,Cohort 6,Subject 191
4,SRR15520626,Week 12,Cohort 6,Subject 203


## Read Taxonomic Assignments from Kraken2
Collect the species-level read counts, and separate samples into placebo and treatment samples

In [7]:
# Load sequencing data
counts = pd.read_csv('../data/studyB_S_counts.csv').rename(columns = {'name':'species', 'fraction_total_reads':'abundance'})

# Match sample ID format to metadata
counts['sample_id'] = counts['sample_id'].str.replace(r'^S_', '', regex=True) 

# Merge with metadata 
counts = counts.merge(metadata, on='sample_id', how='left')

# Sort table
counts.sort_values(by = ['cohort','study_timepoint'], inplace = True)

# Isolate baseline samples 
counts_baseline = counts[(counts['study_timepoint'] == 'Day 06')&(counts['cohort'] != 'Cohort 6')]

# Rename incorrectly labeled species
counts_baseline['species'] = counts_baseline['species'].str.replace('oxytoca','michiganensis') 
counts_baseline['species'] = counts_baseline['species'].str.replace('Segatella','Prevotella')

# Remove unsed cohorts
counts_baseline = counts_baseline[(counts_baseline.cohort!='Cohort Vanco')&(counts_baseline.cohort!='Cohort Sentinel')]

# Format for MICOM
counts_baseline['id'] = counts_baseline['species'].str.replace(' ', '_') 
counts_baseline = counts_baseline.groupby(['sample_id','species','id','cohort','subject_id','study_timepoint']).sum(numeric_only = True).reset_index()
counts_baseline.drop(columns = 'sample_id', inplace = True)
counts_baseline.rename(columns = {'subject_id': 'sample_id'}, inplace = True)
counts_baseline = counts_baseline[['sample_id','species','id','abundance','cohort']]

# Remove sample without follow-up
counts_baseline = counts_baseline[counts_baseline.sample_id!=' Subject 102']

# Create sample_id to cohort dictionary
cohort_dict = counts_baseline.drop_duplicates(subset = 'sample_id').set_index('sample_id')['cohort'].to_dict()
counts_baseline.sample_id.nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_baseline['species'] = counts_baseline['species'].str.replace('oxytoca','michiganensis')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_baseline['species'] = counts_baseline['species'].str.replace('Segatella','Prevotella')


20

## Add Probiotics to Treatment Arm
Samples in the treatment arm are supplemented with a probiotic cocktail, wbf11

In [8]:
def add_all_probiotic(taxonomy):
    taxonomy_probiotic_total = pd.DataFrame()
    for sample in taxonomy['sample_id'].unique():
        taxonomy_reduced = taxonomy[taxonomy['sample_id'] == sample].copy()
        taxonomy_reduced['abundance'] = taxonomy_reduced['abundance']*(0.2)
        taxonomy_probiotic = pd.concat([taxonomy_reduced,pd.DataFrame({
                  'sample_id':[sample, sample, sample, 
                              sample, sample, sample], 
                  'species': ['Enterocloster bolteae', 'Anaerotruncus colihominis',
                              'Sellimonas intestinalis', '[Clostridium] symbiosum', 
                              'Dorea longicatena', 'Flavonifractor plautii'],
                  'abundance':[80/6]*6, 
                  'id':['Enterocloster_bolteae', 'Anaerotruncus_colihominis',
                              'Sellimonas_intestinalis', 'Clostridium_symbiosum', 
                              'Dorea_longicatena', 'Flavonifractor_plautii'],
                  'cohort':[taxonomy_reduced['cohort'].unique()[0]]*6})])
        taxonomy_probiotic_total = pd.concat([taxonomy_probiotic_total, taxonomy_probiotic])
        taxonomy_probiotic_total = taxonomy_probiotic_total.groupby(['sample_id','id','species','cohort']).sum().reset_index()

    return taxonomy_probiotic_total


counts_probiotic = add_all_probiotic(counts_baseline)

## Build Models
Build MCMMs for samples in the treatment arm and the placebo arm

In [9]:
# Build Models
agora = '../agora201_refseq216_species_1.qza'

# Treatment group models
micom.workflows.build(counts_probiotic, 
                                 model_db=agora, 
                                 out_folder='../VE303_treated',
                                 cutoff=0.001, 
                                 threads=10)

Output()

  taxonomy.groupby("sample_id")


Unnamed: 0,sample_id,cohort,file,found_taxa,total_taxa,found_fraction,found_abundance_fraction
0,Subject 100,Cohort 3,Subject 100.pickle,6.0,6.0,1.0,0.997507
1,Subject 110,Cohort 4,Subject 110.pickle,6.0,6.0,1.0,0.997507
2,Subject 111,Cohort 4,Subject 111.pickle,6.0,6.0,1.0,0.997507
3,Subject 126,Cohort 4,Subject 126.pickle,7.0,7.0,1.0,0.999186
4,Subject 132,Cohort 4,Subject 132.pickle,7.0,7.0,1.0,0.999233
5,Subject 133,Cohort 4,Subject 133.pickle,7.0,7.0,1.0,0.999092
6,Subject 144,Cohort 4,Subject 144.pickle,7.0,7.0,1.0,0.998614
7,Subject 153,Cohort 5,Subject 153.pickle,7.0,7.0,1.0,0.998509
8,Subject 156,Cohort 5,Subject 156.pickle,6.0,6.0,1.0,0.997506
9,Subject 157,Cohort 5,Subject 157.pickle,7.0,7.0,1.0,0.998735
