In [1]:
import micom
import pandas as pd
import numpy as np
import sklearn.metrics
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

%matplotlib inline

## Model Construction and Optimization
This notebook constructs sample-specific MCMMs for samples in the WBF011 study, to compare predicted probiotic engraftment against experimental data. 

## Collect Metadata

In [7]:
# Load the Metadata Tabe
metadata = pd.read_table('../data/Hiseq_metagenomic_202_190916 metadata_conditions.txt')
metadata = metadata.drop(metadata.index[[0,83,84]])

# Identify Subgroups
baseline = metadata[metadata['time_point']=='00_Baseline']['Name']
endpoint = metadata[metadata['time_point']=='12_Week']['Name']
treatment = metadata[metadata['treatment_group']=='wbf11']['Name']
placebo = metadata[metadata['treatment_group']=='placebo']['Name']
subject_dict = metadata.set_index('Name')['subject_id'].to_dict()
treatment_dict = metadata.set_index('Name')['treatment_group'].to_dict()

metadata

Unnamed: 0,Name,subject_id,time_point,treatment_group
1,C0223873,SS_65,00_Baseline,wbf11
2,C0253061,SS_60,00_Baseline,wbf11
3,C0227725,SS_46,00_Baseline,wbf11
4,C0264833,SS_31,12_Week,placebo
5,C0252289,SS_67,00_Baseline,wbf11
...,...,...,...,...
78,C0228492,SS_118,12_Week,placebo
79,C1064093,SS_102,12_Week,wbf11
80,C0231040,SS_76,12_Week,placebo
81,C0263808,SS_71,12_Week,placebo


## Read Taxonomic Assignments from Kraken2
Collect the species-level read counts, and separate samples into placebo and treatment samples

In [3]:
# Load the Kraken taxonomic assignment data
counts = pd.read_csv('../data/S_counts.csv', index_col=0)

# Isolate Baseline Samples
counts['sample'] = counts['sample'].str.split('_').str[0]
counts_baseline = counts[counts['sample'].isin(baseline)]

# Format for MICOM
counts_baseline.rename(columns = {'sample':'sample_id', 's':'species'}, inplace = True)
counts_baseline = counts_baseline.groupby(['sample_id','species']).sum(numeric_only = True).reset_index()
counts_baseline['tot_reads'] = counts_baseline.groupby('sample_id')['reads'].transform('sum')
counts_baseline['abundance'] = counts_baseline['reads']/counts_baseline['tot_reads']
counts_baseline['id'] = counts_baseline['species'].str.replace(' ','_')

# Isolate Treatment Group Samples
counts_treatment = counts_baseline[counts_baseline['sample_id'].isin(treatment)]

# Fix misannotation
counts_treatment['id'] = counts_treatment['id'].str.replace('_(ex_Wegman_et_al._2014)','')
counts_treatment['species'] = counts_treatment['species'].str.replace(' (ex Wegman et al. 2014)','')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A val

## Add Probiotics to Treatment Arm
Samples in the treatment arm are supplemented with a probiotic cocktail, wbf11

In [10]:
def add_all_probiotic(taxonomy):
    """Adds probiotic cocktail of AMUC, CBEI, CBUT, BINF, and EHAL to existing taxonomy table at 1% RA each"""
    taxonomy_probiotic_total = pd.DataFrame()
    for sample in taxonomy['sample_id'].unique():
        taxonomy_reduced = taxonomy[taxonomy['sample_id'] == sample].copy()
        taxonomy_reduced['abundance'] = taxonomy_reduced['abundance']*0.95
        taxonomy_probiotic = pd.concat([taxonomy_reduced,pd.DataFrame({
                  'sample_id':[sample, sample, sample, sample, sample], 
                  'species': ['Akkermansia muciniphila','Clostridium beijerinckii',
                              'Clostridium butyricum','Bifidobacterium longum',
                              'Anaerobutyricum hallii'],
                  'abundance':[0.01, 0.01, 0.01, 0.01, 0.01], 
                  'id':['Akkermansia_muciniphila','Clostridium_beijerinckii',
                              'Clostridium_butyricum','Bifidobacterium_longum',
                              'Anaerobutyricum_hallii']})])
        taxonomy_probiotic_total = pd.concat([taxonomy_probiotic_total, taxonomy_probiotic])

    return taxonomy_probiotic_total
# Add probiotics
counts_probiotic = add_all_probiotic(counts_treatment)

# Combine duplicate taxa
counts_probiotic = counts_probiotic.groupby(['sample_id','id','species']).sum().reset_index()

Unnamed: 0,sample_id,id,species,reads,tot_reads,abundance
0,C0219269,Acidaminococcus_fermentans,Acidaminococcus fermentans,10.0,3129594.0,0.000003
1,C0219269,Acidaminococcus_intestini,Acidaminococcus intestini,409.0,3129594.0,0.000124
2,C0219269,Actinomyces_naeslundii,Actinomyces naeslundii,38.0,3129594.0,0.000012
3,C0219269,Actinomyces_pacaensis,Actinomyces pacaensis,11.0,3129594.0,0.000003
4,C0219269,Actinomyces_sp._oral_taxon_414,Actinomyces sp. oral taxon 414,21.0,3129594.0,0.000006
...,...,...,...,...,...,...
6402,C0265088,[Clostridium]_scindens,[Clostridium] scindens,28371.0,16964251.0,0.001589
6403,C0265088,[Ruminococcus]_gnavus,[Ruminococcus] gnavus,850972.0,16964251.0,0.047655
6404,C0265088,[Ruminococcus]_lactaris,[Ruminococcus] lactaris,6290.0,16964251.0,0.000352
6405,C0265088,[Ruminococcus]_torques,[Ruminococcus] torques,118751.0,16964251.0,0.006650


## Build Models
Build MCMMs for samples in the treatment arm and the placebo arm

In [5]:
# Build Models
agora_augmented = '../agora_models/'

manifest_agora1 = micom.workflows.build(counts_probiotic, 
                                 model_db=agora_augmented, 
                                 out_folder='../WBF011_models',
                                 cutoff=0.001, 
                                 threads=10)


Output()

[2;36m                    [0m         could be matched to the model      [2m                [0m
[2;36m                    [0m         database. Model `C0223873` may not [2m                [0m
[2;36m                    [0m         be representative of the sample    [2m                [0m
[2;36m                    [0m         could be matched to the model      [2m                [0m
[2;36m                    [0m         database. Model `C0224389` may not [2m                [0m
[2;36m                    [0m         be representative of the sample    [2m                [0m
[2;36m                    [0m         could be matched to the model      [2m                [0m
[2;36m                    [0m         database. Model `C0232068` may not [2m                [0m
[2;36m                    [0m         be representative of the sample    [2m                [0m
[2;36m                    [0m         could be matched to the model      [2m                [0m


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

