In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from plotnine import *
import scipy.stats

%matplotlib inline

# Analysis of qPCR Data
This notebook analyzes the qPCR data of probiotic strains in participants, and identifies which show strong engraftment signals

## Collect Metadata

In [3]:
# Load the sample metadata sheet (tab-delimited)
metadata_path = '../data/Hiseq_metagenomic_202_190916 metadata_conditions.txt'
metadata = pd.read_table(metadata_path)

# Discard the header and control rows, which sit at positions 0, 83 and 84
rows_to_drop = metadata.index[[0, 83, 84]]
metadata = metadata.drop(rows_to_drop)

# Build look-up dictionaries keyed by sample name:
# sample name -> subject ID, and sample name -> treatment group
metadata_by_name = metadata.set_index('Name')
subject_dict = metadata_by_name['subject_id'].to_dict()
treatment_dict = metadata_by_name['treatment_group'].to_dict()

## Import qPCR Table
qPCR will serve as ground truth for validation of probiotic growth

In [4]:
# Load the qPCR reaction table (tab-delimited) and relabel strain EHAL as AHAL.
# Only the Strain column is relabelled; other columns (e.g. Primers) keep
# their original values.
qPCR = (
    pd.read_table('../data/qPCR-reaction-table.tsv')
    .assign(Strain=lambda table: table['Strain'].str.replace('EHAL', 'AHAL'))
)

# Preview the first rows
qPCR.head()

Unnamed: 0,Treatment,Strain,Primers,Week,Subject,PassQC,PassFilter,TmExpected,TmDiff,TemperatureQpcr,Ct
0,WBF-010,AMUC,AMUC_AT,0,SS_15,False,False,83.33,,,37.057988
1,WBF-010,AMUC,AMUC_AT,0,SS_15,False,False,83.33,,,-inf
2,WBF-010,AMUC,AMUC_AT,0,SS_15,False,False,83.33,,,37.390592
3,WBF-010,AMUC,AMUC_AT,0,SS_15,False,False,83.33,,,36.15199
4,Placebo,AMUC,AMUC_AT,4,SS_5,False,False,83.33,,,37.475383


## Filter Table 
Remove bad data and inefficient primers

In [5]:
# Remove reactions whose cycle threshold is -inf, ordered by Ct
qPCR_Ct = qPCR[qPCR['Ct'] != -np.inf].sort_values(by='Ct')

# Keep only reactions that passed both QC and the filter step
passed_checks = qPCR_Ct['PassQC'].eq(True) & qPCR_Ct['PassFilter'].eq(True)

# Exclude the less efficient primer sets for EHAL and CBEI.
# The mask is built from qPCR_Ct itself (the frame being filtered) rather than
# from the unfiltered qPCR table, so it no longer depends on implicit index
# alignment between two differently-sized frames.
excluded_primers = ['EHAL_AN', 'CBEI_AB']
uses_kept_primers = ~qPCR_Ct['Primers'].isin(excluded_primers)

# Build the filtered table as a new frame: .assign returns a copy, which avoids
# the SettingWithCopyWarning raised when a column is written into a
# boolean-indexed slice. Week is stored as a string, then rows are sorted.
qPCR_filter = (
    qPCR_Ct.loc[passed_checks & uses_kept_primers]
    .assign(Week=lambda table: table['Week'].astype('str'))
    .sort_values(by=['Strain', 'Week'])
)

# Show which strains remain after filtering
qPCR_filter.Strain.unique()

array(['AHAL', 'AMUC', 'BINF', 'CBEI', 'CBUT'], dtype=object)

## Calculate Inverse Cycle Threshold
1/Ct serves as proxy for species abundance

In [None]:
# Add the inverse cycle threshold (1/Ct) as a new column
with_inverse = qPCR_filter.assign(Ct_inv=1 / qPCR_filter['Ct'])

# Drop the week-16 time point and the WBF-010 treatment arm
is_week_16 = with_inverse['Week'] == '16'
is_wbf_010 = with_inverse['Treatment'] == 'WBF-010'
qPCR_filter = with_inverse[~(is_week_16 | is_wbf_010)]

## Plot qPCR Data 
Plot 1/Ct across study arms for each strain and week

In [None]:
# Set all styling consistently BEFORE plotting
sns.set_theme(style="whitegrid")         # Ensures white grid
sns.set_context("talk", font_scale=1.2)  # Sets all font sizes reasonably large
sns.set_palette("colorblind")            # Applies colorblind-safe palette

# Baseline used for BINF when a study arm has no week-0 BINF reactions
# (no hits in placebo); chosen to match the WBF-011 baseline.
BINF_FALLBACK_THRESHOLD = 0.028

# One panel per study arm, sharing the y axis so the arms are comparable
treatments = np.array(['Placebo', 'WBF-011'], dtype='object')
fig, axes = plt.subplots(1, 2, figsize=(16, 9), sharey=True)

for ax, treatment in zip(axes, treatments):
    # .copy() so that adding the threshold column below writes to an
    # independent frame instead of a slice of qPCR_filter
    # (avoids SettingWithCopyWarning).
    treatment_data = qPCR_filter[qPCR_filter["Treatment"] == treatment].copy()

    # Per-strain threshold = mean 1/Ct of this arm's week-0 reactions
    baseline_rows = treatment_data.loc[treatment_data['Week'] == '0']
    strain_thresholds = baseline_rows.groupby('Strain')['Ct_inv'].mean().to_dict()

    # Fill in BINF only when BINF itself has no baseline. The previous
    # "fewer than 5 strains" check would overwrite a real BINF baseline
    # whenever any other strain happened to be missing.
    strain_thresholds.setdefault('BINF', BINF_FALLBACK_THRESHOLD)

    # Attach each reaction's strain threshold
    treatment_data['threshold'] = treatment_data['Strain'].map(strain_thresholds)

    # Average replicate reactions to one value per subject / strain / week
    treatment_data = (
        treatment_data
        .groupby(['Treatment', 'Strain', 'Week', 'Subject'])
        .mean(numeric_only=True)
        .reset_index()
    )

    # Sort weeks numerically (as strings, '12' would sort before '4'), then
    # convert back to strings so seaborn treats Week as a categorical hue.
    treatment_data['Week'] = treatment_data['Week'].astype(int)
    treatment_data = treatment_data.sort_values(by=['Strain', 'Week'])
    treatment_data['Week'] = treatment_data['Week'].astype(str)

    # Distribution (violin) with the individual subject values on top (strip)
    sns.violinplot(data=treatment_data, x="Strain", y="Ct_inv", hue="Week",
                   split=False, alpha=0.5, inner=None, ax=ax, cut=0.01)
    sns.stripplot(data=treatment_data, x="Strain", y="Ct_inv", hue="Week",
                  dodge=True, size=5, jitter=0.4, ax=ax, legend=False)

    ax.set(ylabel='')
    ax.set_title(f"Detection Over Time - {treatment}")

    # Draw each strain's threshold as a dashed red line across its x slot.
    # Strains are positioned in order of appearance in the sorted data.
    for x_pos, strain in enumerate(treatment_data["Strain"].unique()):
        threshold = strain_thresholds.get(strain)
        if threshold is None:
            # No week-0 baseline for this strain in this arm: nothing to draw
            continue
        ax.hlines(y=threshold, xmin=x_pos - 0.5, xmax=x_pos + 0.5,
                  color='red', linestyle='--', linewidth=1.5)

fig.supylabel('1/Ct Value')
fig.tight_layout()
fig.savefig("../figures/qPCR_threshold.svg", dpi=300, bbox_inches="tight")
plt.show()


## Calculate Significance For Treatment Group
Use non-parametric Mann-Whitney test

In [None]:
# scipy.stats is already imported in the first cell of the notebook.

# Per-subject mean 1/Ct for the treatment arm, rebuilt from qPCR_filter so this
# cell does not rely on the loop variable left behind by the plotting cell.
# NOTE(review): WBF-011 is assumed to be the "treatment group" of the heading
# (it is the arm the plotting loop processed last) - confirm.
treatment_means = (
    qPCR_filter[qPCR_filter['Treatment'] == 'WBF-011']
    .groupby(['Strain', 'Week', 'Subject'])['Ct_inv']
    .mean()
    .reset_index()
)

# Week is stored as a string, so compare against '0' / '12'. Comparing against
# the integers 0 / 12 matches no rows and leaves both samples empty.
week_labels = treatment_means['Week'].astype(str)

for strain in qPCR_filter['Strain'].unique():
    is_strain = treatment_means['Strain'] == strain
    baseline = treatment_means.loc[is_strain & (week_labels == '0'), 'Ct_inv']
    week_12 = treatment_means.loc[is_strain & (week_labels == '12'), 'Ct_inv']

    # The Mann-Whitney test needs at least one observation in each sample
    if baseline.empty or week_12.empty:
        print(strain + ': not enough data to compare week 0 with week 12')
        continue

    res = scipy.stats.mannwhitneyu(baseline, week_12)
    print(strain + ': ' + str(res.pvalue))