In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats

%matplotlib inline

# Analysis of qPCR Data
This notebook analyzes qPCR data for probiotic strains in study participants and identifies which strains show strong engraftment signals.

## Collect Metadata

In [None]:
# Load the study metadata table (tab-separated).
metadata = pd.read_table('../data/studyA_metadata.tsv')

# Discard the header row and the two control rows, addressed by position
# (rows 0, 83 and 84 of the table as read).
metadata = metadata.drop(index=metadata.index[[0, 83, 84]])

# Build Name -> subject_id and Name -> treatment_group lookup dictionaries.
subject_dict, treatment_dict = (
    metadata.set_index('Name')[column].to_dict()
    for column in ('subject_id', 'treatment_group')
)

## Import qPCR Table
qPCR will serve as ground truth for validation of probiotic growth

In [None]:
# Load the qPCR reaction table (tab-separated) and relabel the EHAL strain
# code as AHAL in the Strain column.
qPCR = pd.read_table('../data/qPCR-reaction-table.tsv').assign(
    Strain=lambda table: table['Strain'].str.replace('EHAL','AHAL')
)

# Preview the first rows of the table.
qPCR.head()

## Filter Table 
Remove bad data and inefficient primers

In [None]:
# Drop reactions whose cycle threshold is -inf, ordered by ascending Ct.
qPCR_Ct = qPCR[qPCR['Ct'] != -np.inf].sort_values(by='Ct')

# Keep only reactions flagged as passing both QC and the filter step.
passed_checks = (qPCR_Ct['PassQC'] == True) & (qPCR_Ct['PassFilter'] == True)

# Drop the less efficient primer sets for EHAL and CBEI.
# The mask is built from the frame being filtered (not from the unfiltered
# `qPCR` table), so the selection does not depend on implicit index alignment
# between two differently sized and differently ordered frames.
efficient_primers = ~qPCR_Ct['Primers'].isin(['EHAL_AN', 'CBEI_AB'])

# .copy() gives an independent frame so the column assignment below does not
# write into a view of qPCR_Ct.
qPCR_filter = qPCR_Ct[passed_checks & efficient_primers].copy()

# Store Week as a string; the later cells compare it against string labels
# such as '0', '4' and '16'.
qPCR_filter['Week'] = qPCR_filter['Week'].astype('str')

# Sort table (Week is a string here, so its ordering is lexicographic).
qPCR_filter = qPCR_filter.sort_values(by=['Strain', 'Week'])

# Show which strains remain after filtering.
qPCR_filter.Strain.unique()

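## Calculate Negated Cycle Threshold
-Ct serves as a proxy for species abundance: a lower Ct means the target was detected in fewer cycles, so a higher -Ct indicates higher abundance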

In [None]:
# Display name for each strain code.
species_dict = {
    'AHAL': 'A. hallii',
    'AMUC': 'A. muciniphila',
    'BINF': 'B. infantis',
    'CBUT': 'C. butyricum',
    'CBEI': 'C. beijerinckii',
}

# Negated cycle threshold (-Ct), the quantity plotted and tested below.
qPCR_filter['-Ct'] = qPCR_filter['Ct'] * -1

# Drop the Week 16 and Week 4 timepoints and the WBF-010 arm, then attach the
# species display name for each strain.
qPCR_filter = (
    qPCR_filter
    .loc[lambda frame: ~frame['Week'].isin(['16', '4'])]
    .loc[lambda frame: frame['Treatment'].ne('WBF-010')]
    .assign(Species=lambda frame: frame['Strain'].map(species_dict))
)
qPCR_filter

## Plot qPCR Data 
Plot -Ct across timepoints

In [None]:
# Figure styling: large fonts on a white grid with a colorblind-safe palette.
sns.set_theme(style="whitegrid")
sns.set_context("talk", font_scale=2)
sns.set_palette("colorblind")

fig, ax = plt.subplots(figsize=(16, 9))

# Restrict the figure to reactions from the WBF-011 arm.
treatment_reactions = qPCR_filter[qPCR_filter['Treatment'] == 'WBF-011']

# Per-species threshold: mean -Ct over all baseline (Week 0) reactions.
# The resulting dictionary is keyed by species display name.
strain_thresholds = (
    treatment_reactions.query("Week == '0'")
    .groupby("Species")["-Ct"]
    .mean()
    .to_dict()
)

# Manual fallback for B. infantis when it has no baseline reactions.
# The dictionary is keyed by species display name, so the fallback must use
# "B. infantis"; the strain code "BINF" would never match in the loop below.
# NOTE(review): 0.028 is positive while -Ct values are negative, so it looks
# like a threshold on the 1/Ct scale (Ct of about 35.7). It is converted to
# the -Ct scale here -- confirm the intended value.
strain_thresholds.setdefault("B. infantis", -1.0 / 0.028)

# Average the numeric columns per subject within each treatment, species and
# week, so each plotted point is one subject.
treatment_data = (
    treatment_reactions
    .groupby(["Treatment", "Species", "Week", "Subject"], as_index=False)
    .mean(numeric_only=True)
)

# Distribution of -Ct per species, split by week.
sns.violinplot(
    data=treatment_data,
    x="Species",
    y="-Ct",
    hue="Week",
    inner=None,
    cut=0.01,
    alpha=0.5,
    ax=ax
)

# Overlay the individual subject-level points; the legend is already provided
# by the violin plot.
sns.stripplot(
    data=treatment_data,
    x="Species",
    y="-Ct",
    hue="Week",
    dodge=True,
    jitter=0.4,
    size=5,
    legend=False,
    ax=ax
)

# Add thresholds: one dashed red segment per species, spanning that species'
# slot on the x axis (categories are positioned at 0, 1, 2, ...).
species_order = treatment_data["Species"].unique()
for i, species in enumerate(species_order):
    if species in strain_thresholds:
        ax.hlines(
            y=strain_thresholds[species],
            xmin=i - 0.4,
            xmax=i + 0.4,
            color="red",
            linestyle="--",
            linewidth=2.5,
            zorder=10
        )

ax.set_title("qPCR Detection Over Time")
ax.set_ylabel("-Ct Value")
ax.set_xlabel("Species")
plt.xticks(rotation=30, ha="right")

plt.tight_layout()
plt.show()


## Calculate Enrichment Significance

In [None]:
def _week_pvalue(baseline, followup):
    """Return the t-test p-value comparing two groups of -Ct values.

    Parameters
    ----------
    baseline, followup : pandas.Series
        -Ct values for the two timepoints being compared.

    Returns
    -------
    float
        p-value of a one-sample t-test when one group holds a single value
        (that value is used as the reference mean), otherwise of an
        independent two-sample t-test with scipy's default settings.
        NaN when either group is empty or both groups hold a single value,
        because no test can be computed.
    """
    n_baseline, n_followup = len(baseline), len(followup)
    if n_baseline == 0 or n_followup == 0 or (n_baseline == 1 and n_followup == 1):
        return float('nan')
    if n_baseline == 1:
        # popmean must be a scalar, not a length-1 Series.
        return float(scipy.stats.ttest_1samp(followup, baseline.iloc[0]).pvalue)
    if n_followup == 1:
        return float(scipy.stats.ttest_1samp(baseline, followup.iloc[0]).pvalue)
    return float(scipy.stats.ttest_ind(baseline, followup).pvalue)


# Compare Week 0 against Week 12 for every species, using the per-subject
# means in `treatment_data`. dropna() skips strains that have no entry in the
# species mapping, whose label would otherwise be NaN.
for species in qPCR_filter['Species'].dropna().unique():
    species_rows = treatment_data[treatment_data['Species'] == species]
    t0 = species_rows.loc[species_rows['Week'] == '0', '-Ct']
    t1 = species_rows.loc[species_rows['Week'] == '12', '-Ct']
    print(f"{species}: {round(_week_pvalue(t0, t1), 6)}")