In [1]:
from glob import glob
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.sandbox.stats.multicomp import multipletests
import numpy as np
import seaborn as sns

# Set seaborn's plotting style to 'whitegrid'.
sns.set_style('whitegrid')

def p_adjust(pvalues, method='fdr_bh'):
    """Adjust a collection of p-values for multiple testing.

    Parameters
    ----------
    pvalues : array-like
        Raw p-values; passed straight to ``multipletests``.
    method : str, optional
        Correction method name forwarded to ``multipletests``
        (default ``'fdr_bh'``).

    Returns
    -------
    numpy.ndarray
        Float array built from the second element of the ``multipletests``
        result (the corrected p-values), in the same order as ``pvalues``.
    """
    corrected = multipletests(pvalues, method=method)[1]
    return np.array(corrected, dtype=float)

# Correlate nasal 16S with vaccine response

##### Michael Shaffer
##### 7/21/22
##### Merck ESC, Sys bio group

To look for associations between the nasal microbiome and vaccine response, we calculated correlations between the abundances of individual OTUs and the continuous titer measurements taken at 1 year of life.

## Read in data

In [2]:
# Core per-sample metadata, indexed by SampleID.
meta = pd.read_csv('../../data/metadata/nasal/nasal_metadata.csv', index_col='SampleID')

# Age at collection in whole days: CollectionDate minus DOB.
meta = meta.assign(
    age_at_collection=lambda df: (
        pd.to_datetime(df['CollectionDate']) - pd.to_datetime(df['DOB'])
    ).dt.days
)

# Append the columns of the antibiotic-usage and year-1 titer tables (named
# per their file names), aligning rows on SampleID.
meta = pd.concat(
    [meta] + [
        pd.read_csv(extra_path, index_col='SampleID')
        for extra_path in ('../../data/metadata/nasal/nasal_abx_usage.csv',
                           '../../data/metadata/nasal/nasal_titers_yr1.csv')
    ],
    axis=1,
)

# Drop samples that have no median_mmNorm value.
meta = meta[meta['median_mmNorm'].notna()]
meta.head()

Unnamed: 0_level_0,SubmissionType,SampleNumber,SampleIDValidation,DiversigenCheckInSampleName,ReplacesLowVolumeSampleID,BoxLocation,SampleType,SampleSource,SequencingType,BabyN,...,median_mmNorm_PCV,median_mmNorm_DTAPHib,protectNorm_Dip,protectNorm_TET,protectNorm_PRP (Hib),protectNorm_PT,protectNorm_PRN,protectNorm_FHA,geommean_protectNorm,VR_group_v2
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
106_V5_NS_A1,Primary in Tube,2,,,,"Box 1, A3",Nasal Swab,Human Infant,16S,106,...,0.061955,0.052874,2.1,3.0,2.6,0.3125,0.3125,1.375,1.140388,NVR
107_V2_NS_A1,Primary in Tube,3,,,,"Box 1, A4",Nasal Swab,Human Infant,16S,107,...,0.958142,0.114018,4.4,5.2,10.666667,0.3125,1.125,0.375,1.783418,NVR
107_V3_NS_A1,Primary in Tube,4,,107_V8_NS_A1,,"Box 1, A5",Nasal Swab,Human Infant,16S,107,...,0.958142,0.114018,4.4,5.2,10.666667,0.3125,1.125,0.375,1.783418,NVR
107_V5_NS_A1,Primary in Tube,5,,,,"Box 1, A8",Nasal Swab,Human Infant,16S,107,...,0.958142,0.114018,4.4,5.2,10.666667,0.3125,1.125,0.375,1.783418,NVR
108_V4_NS_A1,Primary in Tube,6,,,,"Box 1, A9",Nasal Swab,Human Infant,16S,108,...,0.003102,0.0,0.5,0.5,1.8,0.3125,0.3125,0.1875,0.44942,LVR


In [3]:
# Read the tab-separated OTU table with its first column as the index, then
# transpose it so that rows are OTUs and columns are sample IDs.
counts = pd.read_csv(
    '../../data/nasal/otu_table.gt10_rar10K.tsv',
    sep='\t',
    index_col=0,
).T
counts.head()

Unnamed: 0,101_S1_NS_A1,101_V3_NS_A1,101_V5_NS_A1,102_V1_NS_A1,102_V3_NS_A1,102_V5_NS_A1,102_V6_NS_A1,103_S1_NS_A1,103_S3_NS_A1,103_V10_NS_A1,...,MSA2002_5A,MSA2002_5B,MSA2002_6A,MSA2002_6B,MSA2002_7A,MSA2002_7B,MSA2002_8A,MSA2002_8B,MSA2002_9A,MSA2002_9B
Otu0001,1,0,0,1,2,0,2,1593,7320,606,...,3,4,1,1,0,2,1,1,1,0
Otu0002,5845,9876,692,557,783,509,6047,1,0,3,...,114,126,104,115,119,111,168,147,83,103
Otu0003,117,0,879,4392,1428,528,87,877,2642,1498,...,0,0,0,0,0,0,0,0,0,0
Otu0004,9,1,1104,1,6133,475,1,109,2,14,...,0,0,1,0,0,0,0,0,0,0
Otu0005,0,0,0,0,0,1,0,4173,24,3121,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Keep only the samples that have both a metadata row and a column in the
# count table, ordered by subject (BabyN) and then by age at collection.
in_both = set(meta.index) & set(counts.columns)
# Select with a boolean mask instead of passing the set to .loc: pandas
# deprecated set indexers (1.5) and rejects them with a TypeError (2.0). The
# mask also keeps the rows in their existing order, so the input to the sort
# no longer depends on arbitrary set iteration order.
meta = meta.loc[meta.index.isin(in_both)].sort_values(['BabyN', 'age_at_collection'])
print(meta.shape)

(775, 84)


  meta = meta.loc[in_both].sort_values(['BabyN', 'age_at_collection'])


In [5]:
def _subset_to_visit(meta, counts, visit_code, min_prevalence=.2):
    """Return the metadata rows and filtered OTU counts for one visit.

    Parameters
    ----------
    meta : pandas.DataFrame
        Sample metadata indexed by sample ID, with a ``VisitCode`` column.
    counts : pandas.DataFrame
        Count table with OTUs as rows and sample IDs as columns.
    visit_code : str
        ``VisitCode`` value to select, e.g. ``'V5'``.
    min_prevalence : float, optional
        An OTU is kept only when it has a non-zero count in strictly more
        than this fraction of the visit's samples (default 0.2).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The metadata rows for the visit, and the count table restricted to
        that visit's samples and to the OTUs passing the prevalence filter.
    """
    visit_meta = meta.loc[meta['VisitCode'] == visit_code]
    visit_counts = counts[visit_meta.index]
    # Number of samples at this visit in which each OTU is present at all.
    n_samples_present = (visit_counts > 0).sum(axis=1)
    prevalent = n_samples_present > visit_counts.shape[1] * min_prevalence
    return visit_meta, visit_counts.loc[prevalent]


# One (metadata, counts) pair per visit code used in the analysis below.
meta_v5, counts_v5 = _subset_to_visit(meta, counts, 'V5')
meta_v6, counts_v6 = _subset_to_visit(meta, counts, 'V6')
meta_v7, counts_v7 = _subset_to_visit(meta, counts, 'V7')
meta_v9, counts_v9 = _subset_to_visit(meta, counts, 'V9')

## Correlations with median titer values

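We will use Spearman's rank correlation coefficient as our correlation metric and use OTU abundances from the 2-month (V5), 4-month (V6), 6-month (V7) and 1-year (V9) time points. Vaccinations are given at 2, 4 and 6 months, and titers were measured at 1 year. Note that the cells directly below do not compute correlations: at each time point they compare OTU abundances between the LVR and NVR groups with a Mann-Whitney U test and adjust the resulting p-values with the Benjamini-Hochberg FDR procedure.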

In [6]:
# V5: test each OTU for a difference in abundance between the LVR and NVR
# groups (Mann-Whitney U), then adjust the p-values with p_adjust.

# The sample IDs in each group do not depend on the OTU, so look them up once
# instead of re-running the query on every loop iteration.
lvr_samples = meta_v5.query('VR_group == "LVR"').index
nvr_samples = meta_v5.query('VR_group == "NVR"').index

counts_stats_v5_rows = list()
for otu, row in counts_v5.iterrows():
    lvr_abunds = row[lvr_samples]
    nvr_abunds = row[nvr_samples]
    # Abundance filter: only test OTUs whose mean count is above 10 in at
    # least one of the two groups.
    lvr_mean_gt_10 = lvr_abunds.sum()/len(lvr_abunds) > 10
    nvr_mean_gt_10 = nvr_abunds.sum()/len(nvr_abunds) > 10
    if lvr_mean_gt_10 or nvr_mean_gt_10:
        # Name the alternative explicitly: older SciPy releases defaulted to a
        # one-sided test, so relying on the default is version-dependent.
        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds, alternative='two-sided')
        counts_stats_v5_rows.append([otu, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])
counts_stats_v5 = pd.DataFrame(counts_stats_v5_rows, columns=['OTU', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')
counts_stats_v5['p_adj'] = p_adjust(counts_stats_v5['p_value'])
counts_stats_v5.head()

Unnamed: 0,OTU,LVR_mean,NVR_mean,statistic,p_value,p_adj
19,Otu0029,2.5,35.719298,170.0,0.211242,0.911735
8,Otu0009,59.75,192.385965,165.0,0.211974,0.911735
17,Otu0025,3.25,30.175439,175.5,0.28283,0.911735
16,Otu0023,0.125,10.947368,186.0,0.283525,0.911735
11,Otu0015,32.0,53.333333,175.5,0.294685,0.911735


In [7]:
# V6: test each OTU for a difference in abundance between the LVR and NVR
# groups (Mann-Whitney U), then adjust the p-values with p_adjust.

# The sample IDs in each group do not depend on the OTU, so look them up once
# instead of re-running the query on every loop iteration.
lvr_samples = meta_v6.query('VR_group == "LVR"').index
nvr_samples = meta_v6.query('VR_group == "NVR"').index

counts_stats_v6_rows = list()
for otu, row in counts_v6.iterrows():
    lvr_abunds = row[lvr_samples]
    nvr_abunds = row[nvr_samples]
    # Abundance filter: only test OTUs whose mean count is above 10 in at
    # least one of the two groups.
    lvr_mean_gt_10 = lvr_abunds.sum()/len(lvr_abunds) > 10
    nvr_mean_gt_10 = nvr_abunds.sum()/len(nvr_abunds) > 10
    if lvr_mean_gt_10 or nvr_mean_gt_10:
        # Name the alternative explicitly: older SciPy releases defaulted to a
        # one-sided test, so relying on the default is version-dependent.
        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds, alternative='two-sided')
        counts_stats_v6_rows.append([otu, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])
counts_stats_v6 = pd.DataFrame(counts_stats_v6_rows, columns=['OTU', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')
counts_stats_v6['p_adj'] = p_adjust(counts_stats_v6['p_value'])
counts_stats_v6.head()

Unnamed: 0,OTU,LVR_mean,NVR_mean,statistic,p_value,p_adj
18,Otu0027,53.545455,13.711538,371.0,0.083314,0.933915
3,Otu0004,2494.545455,1272.884615,375.5,0.10636,0.933915
25,Otu0039,14.727273,17.826923,231.0,0.307787,0.933915
19,Otu0028,16.454545,11.076923,332.0,0.332889,0.933915
14,Otu0021,34.909091,49.923077,234.5,0.354753,0.933915


In [8]:
# V7: test each OTU for a difference in abundance between the LVR and NVR
# groups (Mann-Whitney U), then adjust the p-values with p_adjust.

# The sample IDs in each group do not depend on the OTU, so look them up once
# instead of re-running the query on every loop iteration.
lvr_samples = meta_v7.query('VR_group == "LVR"').index
nvr_samples = meta_v7.query('VR_group == "NVR"').index

counts_stats_v7_rows = list()
for otu, row in counts_v7.iterrows():
    lvr_abunds = row[lvr_samples]
    nvr_abunds = row[nvr_samples]
    # Abundance filter: only test OTUs whose mean count is above 10 in at
    # least one of the two groups.
    lvr_mean_gt_10 = lvr_abunds.sum()/len(lvr_abunds) > 10
    nvr_mean_gt_10 = nvr_abunds.sum()/len(nvr_abunds) > 10
    if lvr_mean_gt_10 or nvr_mean_gt_10:
        # Name the alternative explicitly: older SciPy releases defaulted to a
        # one-sided test, so relying on the default is version-dependent.
        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds, alternative='two-sided')
        counts_stats_v7_rows.append([otu, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])
counts_stats_v7 = pd.DataFrame(counts_stats_v7_rows, columns=['OTU', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')
counts_stats_v7['p_adj'] = p_adjust(counts_stats_v7['p_value'])
counts_stats_v7.head()

Unnamed: 0,OTU,LVR_mean,NVR_mean,statistic,p_value,p_adj
3,Otu0004,2081.181818,602.018519,426.5,0.02345,0.609704
17,Otu0025,26.636364,19.611111,392.0,0.091636,0.733972
20,Otu0034,14.909091,20.277778,389.0,0.103835,0.733972
4,Otu0005,1484.272727,1046.907407,371.5,0.112919,0.733972
12,Otu0016,27.636364,41.518519,378.0,0.150701,0.783644


In [9]:
# V9: test each OTU for a difference in abundance between the LVR and NVR
# groups (Mann-Whitney U), then adjust the p-values with p_adjust.

# The sample IDs in each group do not depend on the OTU, so look them up once
# instead of re-running the query on every loop iteration.
lvr_samples = meta_v9.query('VR_group == "LVR"').index
nvr_samples = meta_v9.query('VR_group == "NVR"').index

counts_stats_v9_rows = list()
for otu, row in counts_v9.iterrows():
    lvr_abunds = row[lvr_samples]
    nvr_abunds = row[nvr_samples]
    # Abundance filter: only test OTUs whose mean count is above 10 in at
    # least one of the two groups.
    lvr_mean_gt_10 = lvr_abunds.sum()/len(lvr_abunds) > 10
    nvr_mean_gt_10 = nvr_abunds.sum()/len(nvr_abunds) > 10
    if lvr_mean_gt_10 or nvr_mean_gt_10:
        # Name the alternative explicitly: older SciPy releases defaulted to a
        # one-sided test, so relying on the default is version-dependent.
        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds, alternative='two-sided')
        counts_stats_v9_rows.append([otu, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])
counts_stats_v9 = pd.DataFrame(counts_stats_v9_rows, columns=['OTU', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')
counts_stats_v9['p_adj'] = p_adjust(counts_stats_v9['p_value'])
counts_stats_v9.head()

Unnamed: 0,OTU,LVR_mean,NVR_mean,statistic,p_value,p_adj
7,Otu0008,95.888889,40.932203,400.0,0.012107,0.472181
24,Otu0036,3.666667,29.661017,183.0,0.122226,0.932017
3,Otu0004,1217.444444,599.20339,349.0,0.131345,0.932017
37,Otu0072,10.333333,1.457627,326.0,0.184336,0.932017
0,Otu0001,2485.555556,4115.779661,193.5,0.194697,0.932017
