In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats

%matplotlib inline

# Analysis of Metagenomic Data
This notebook analyzes the metagenomic sequencing data for the probiotic strains in participants of Validation Study A and identifies which strains show strong engraftment signals.

## Collect Metadata

In [None]:
# Load the per-sample metadata table
metadata = pd.read_csv('../data/studyB_metadata.csv')

# The combined identifier column is split on ';' into a cohort part and a subject part
id_parts = metadata['Cohort;_Subject_ID'].str.split(';', expand=True)
metadata[['cohort', 'subject_id']] = id_parts

# Keep only the fields used downstream; the 'Run' column becomes 'sample_id'
columns_of_interest = ['Run', 'study_timepoint', 'cohort', 'subject_id']
metadata_df = metadata[columns_of_interest].rename(columns={'Run': 'sample_id'})

# Lookup table: sample_id -> subject_id
subject_dict = dict(zip(metadata_df['sample_id'], metadata_df['subject_id']))

## Import Metagenomic Data

In [None]:
# Read the sequencing counts table
s_counts = pd.read_csv('../data/studyB_S_counts.csv')

# Strip the leading 'S_' so sample IDs use the same format as the metadata
s_counts['sample_id'] = s_counts['sample_id'].str.replace(r'^S_', '', regex=True)

# Left-join the metadata onto every count row (rows without a metadata match are kept),
# then order the table by cohort and timepoint
s_counts = (
    s_counts
    .merge(metadata_df, how='left', on='sample_id')
    .sort_values(by=['cohort', 'study_timepoint'])
)



## Filter Metagenomes

In [None]:
# Name fragments identifying the probiotic species; matched as substrings of `name`
probiotic_patterns = ['Enterocloster bolteae', 'Anaerotruncus colihominis',
                      'Sellimonas intestinalis', 'symbiosum',
                      'Dorea longicatena',
                      'Flavonifractor plautii']

# Cohort labels to keep. This is a substring match, so any label containing one of
# these digits passes (e.g. '10' would too) -- NOTE(review): confirm this is intended.
cohort_patterns = ['1', '2', '3', '4', '5']

# na=False: rows with a missing name or cohort (possible after the left merge with the
# metadata) are excluded instead of putting NaN into the boolean mask, which would raise.
is_probiotic = s_counts['name'].str.contains('|'.join(probiotic_patterns), na=False)
in_cohort = s_counts['cohort'].str.contains('|'.join(cohort_patterns), na=False)

# Only the two timepoints being compared
is_timepoint = s_counts['study_timepoint'].isin(['Day 0', 'Week 12'])

# .copy() makes `abundance` an independent frame, so the column assignments below do not
# write into a slice of s_counts (avoids SettingWithCopyWarning).
abundance = (
    s_counts.loc[is_probiotic & in_cohort & is_timepoint]
    .copy()
    .rename(columns={'name': 'Species', 'fraction_total_reads': 'abundance'})
)

# Log transform; the small pseudocount keeps log10 finite when abundance is 0
abundance['abundance_log10'] = np.log10(abundance['abundance'] + 0.000001)

# Keep only the numeric part of the timepoint label: 'Day 0' -> '0', 'Week 12' -> '12'
abundance['study_timepoint'] = abundance['study_timepoint'].str.split(' ').str[1]

abundance.head()

## Plot Metagenome Data 
Plot relative abundance across timepoints

In [None]:
# Styling has to be configured before the figure is created
sns.set_theme(style="whitegrid")       # white background with grid
sns.set_context("talk", font_scale=2)  # large fonts throughout
sns.set_palette("colorblind")          # colorblind-safe palette

fig, ax = plt.subplots(1, 1, figsize=(16, 9))

# Abbreviated species labels for the x axis. The replacements are literal substrings:
# regex=False is passed explicitly so '[Clostridium]' is never parsed as a regex
# character class (Series.str.replace defaulted to regex=True before pandas 2.0).
species_labels = {
    'Anaerotruncus colihominis': 'A. colihominis',
    'Dorea longicatena': 'D. longicatena',
    'Enterocloster bolteae': 'E. bolteae',
    'Flavonifractor plautii': 'F. plautii',
    'Sellimonas intestinalis': 'S. intestinalis',
    '[Clostridium] symbiosum': 'C. symbiosum',
}

treatment_data = abundance.copy()
for full_name, short_name in species_labels.items():
    treatment_data['Species'] = treatment_data['Species'].str.replace(
        full_name, short_name, regex=False)

# Average the numeric columns so there is one row per species, timepoint and subject
treatment_data = (
    treatment_data
    .groupby(['Species', 'study_timepoint', 'subject_id'])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by=['Species', 'study_timepoint'])
)

# Mean baseline abundance per species. `abundance` stores timepoints as '0' / '12'
# (the 'Day ' / 'Week ' prefix was already stripped), so the baseline label is '0';
# comparing against 'Day 0' matched no rows and left every threshold NaN.
is_baseline = treatment_data['study_timepoint'] == '0'
strain_thresholds = (
    treatment_data.loc[is_baseline].groupby('Species')['abundance'].mean().to_dict()
)
treatment_data['threshold'] = treatment_data['Species'].map(strain_thresholds)
treatment_data = treatment_data.rename(columns={'study_timepoint': 'Week'})

# Distribution (violin) with the individual per-subject values (strip) on top.
# Values are already log10-transformed, so the y axis itself stays linear.
sns.violinplot(data=treatment_data, x="Species", y="abundance_log10", hue="Week",
               split=False, alpha=0.5, inner=None, ax=ax, cut=0.01)
sns.stripplot(data=treatment_data, x="Species", y="abundance_log10", hue="Week",
              dodge=True, size=5, jitter=0.4, ax=ax, legend=False)

ax.set(ylabel='')
ax.set_title("Metagenomic Abundance Over Time")
ax.set_ylim(-6, 1)
ax.legend(loc="upper left", bbox_to_anchor=(0, 1))

# Dashed red reference line at y = -2.3010299957 (approximately log10(0.005)),
# drawn across the slot of every species on the x axis
x_positions = {strain: idx for idx, strain in enumerate(treatment_data["Species"].unique())}
for x_pos in x_positions.values():
    ax.hlines(y=-2.3010299957, xmin=x_pos - 0.5, xmax=x_pos + 0.5,
              color='red', linestyle='--', linewidth=1.5)

fig.supylabel('Relative Abundance \n(log scale)', multialignment='center')

# Rotate the tick labels before tight_layout so the layout accounts for their rotated size
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
fig.tight_layout()

plt.savefig("../figures/metagenomes.png", dpi=300, bbox_inches="tight")
plt.savefig("../figures/metagenomes.svg", dpi=300, bbox_inches="tight")
plt.show()

## Calculate Enrichment Significance

In [None]:
# For each species, compare log10 abundance at timepoint '0' with timepoint '12'
# and print the p-value rounded to 6 decimals.
# NOTE(review): the two timepoints are tested as independent samples; if the same
# subjects are measured at both, a paired test may be more appropriate -- confirm.
for species in abundance['Species'].unique():
    species_rows = abundance[abundance['Species'] == species]
    t0 = species_rows.loc[species_rows['study_timepoint'] == '0', 'abundance_log10']
    t1 = species_rows.loc[species_rows['study_timepoint'] == '12', 'abundance_log10']

    if t0.empty or t1.empty:
        # A timepoint is missing for this species: no test is possible
        p_value = float('nan')
    elif len(t0) == 1:
        # Single baseline value: test the week-12 values against it.
        # popmean must be a scalar, so pass the value rather than a length-1 Series.
        p_value = scipy.stats.ttest_1samp(t1, t0.iloc[0]).pvalue
    elif len(t1) == 1:
        # Single week-12 value: test the baseline values against it
        p_value = scipy.stats.ttest_1samp(t0, t1.iloc[0]).pvalue
    else:
        p_value = scipy.stats.ttest_ind(t0, t1).pvalue

    print(f"{species}: {round(float(p_value), 6)}")