In [1]:
from glob import glob
import pandas as pd
from scipy.stats import spearmanr
from statsmodels.sandbox.stats.multicomp import multipletests
import numpy as np
import seaborn as sns

# Select seaborn's 'whitegrid' style for any figures drawn in this notebook.
sns.set_style('whitegrid')

def p_adjust(pvalues, method='fdr_bh'):
    """Adjust a collection of p-values for multiple testing.

    Thin wrapper around ``multipletests`` that returns only the corrected
    p-values, in the same order as the input.

    Parameters
    ----------
    pvalues : array-like of float
        Raw p-values. NaN entries (an undefined test) are excluded from the
        correction and come back as NaN instead of being handed to
        ``multipletests`` together with the valid values.
    method : str, default 'fdr_bh'
        Correction method name, passed through unchanged to ``multipletests``.

    Returns
    -------
    numpy.ndarray of float
        Adjusted p-values with the same shape as ``pvalues``.
    """
    raw = np.asarray(pvalues, dtype=float)
    adjusted = np.full(raw.shape, np.nan, dtype=float)

    # Only defined p-values take part in the correction; the number of tests
    # is therefore the number of non-NaN entries.
    testable = ~np.isnan(raw)
    if testable.any():
        # multipletests returns a tuple; element 1 holds the corrected p-values.
        adjusted[testable] = multipletests(raw[testable], method=method)[1]
    return adjusted

# Correlate nasal 16S with vaccine response

##### Michael Shaffer
##### 7/21/22
##### Merck ESC, Sys bio group

To look for associations between the nasal microbiome and vaccine response, we calculated Spearman correlations between the abundances of individual OTUs and the continuous titer measurements taken at 1 year of life.

## Read in data

In [2]:
# Sample-level metadata, indexed by SampleID.
meta = pd.read_csv('../../data/metadata/nasal/nasal_metadata.csv', index_col='SampleID')

# Age at sampling, in whole days between date of birth and collection date.
collected_on = pd.to_datetime(meta['CollectionDate'])
born_on = pd.to_datetime(meta['DOB'])
meta['age_at_collection'] = (collected_on - born_on).dt.days

# Append the antibiotic-usage and year-1 titer columns; concat along axis=1
# lines the three tables up on their shared SampleID index.
abx_usage = pd.read_csv('../../data/metadata/nasal/nasal_abx_usage.csv', index_col='SampleID')
titers_yr1 = pd.read_csv('../../data/metadata/nasal/nasal_titers_yr1.csv', index_col='SampleID')
meta = pd.concat([meta, abx_usage, titers_yr1], axis=1)

# Keep only samples that have an overall median titer value.
meta = meta.loc[meta['median_mmNorm'].notna()]
meta.head()

Unnamed: 0_level_0,SubmissionType,SampleNumber,SampleIDValidation,DiversigenCheckInSampleName,ReplacesLowVolumeSampleID,BoxLocation,SampleType,SampleSource,SequencingType,BabyN,...,median_mmNorm_PCV,median_mmNorm_DTAPHib,protectNorm_Dip,protectNorm_TET,protectNorm_PRP (Hib),protectNorm_PT,protectNorm_PRN,protectNorm_FHA,geommean_protectNorm,VR_group_v2
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
106_V5_NS_A1,Primary in Tube,2,,,,"Box 1, A3",Nasal Swab,Human Infant,16S,106,...,0.061955,0.052874,2.1,3.0,2.6,0.3125,0.3125,1.375,1.140388,NVR
107_V2_NS_A1,Primary in Tube,3,,,,"Box 1, A4",Nasal Swab,Human Infant,16S,107,...,0.958142,0.114018,4.4,5.2,10.666667,0.3125,1.125,0.375,1.783418,NVR
107_V3_NS_A1,Primary in Tube,4,,107_V8_NS_A1,,"Box 1, A5",Nasal Swab,Human Infant,16S,107,...,0.958142,0.114018,4.4,5.2,10.666667,0.3125,1.125,0.375,1.783418,NVR
107_V5_NS_A1,Primary in Tube,5,,,,"Box 1, A8",Nasal Swab,Human Infant,16S,107,...,0.958142,0.114018,4.4,5.2,10.666667,0.3125,1.125,0.375,1.783418,NVR
108_V4_NS_A1,Primary in Tube,6,,,,"Box 1, A9",Nasal Swab,Human Infant,16S,108,...,0.003102,0.0,0.5,0.5,1.8,0.3125,0.3125,0.1875,0.44942,LVR


In [3]:
# Read the tab-separated OTU count table (first column as the index) and
# transpose it, so that OTUs become rows and sample IDs become columns.
otu_table_path = '../../data/nasal/otu_table.gt10_rar10K.tsv'
counts = pd.read_csv(otu_table_path, sep='\t', index_col=0).T
counts.head()

Unnamed: 0,101_S1_NS_A1,101_V3_NS_A1,101_V5_NS_A1,102_V1_NS_A1,102_V3_NS_A1,102_V5_NS_A1,102_V6_NS_A1,103_S1_NS_A1,103_S3_NS_A1,103_V10_NS_A1,...,MSA2002_5A,MSA2002_5B,MSA2002_6A,MSA2002_6B,MSA2002_7A,MSA2002_7B,MSA2002_8A,MSA2002_8B,MSA2002_9A,MSA2002_9B
Otu0001,1,0,0,1,2,0,2,1593,7320,606,...,3,4,1,1,0,2,1,1,1,0
Otu0002,5845,9876,692,557,783,509,6047,1,0,3,...,114,126,104,115,119,111,168,147,83,103
Otu0003,117,0,879,4392,1428,528,87,877,2642,1498,...,0,0,0,0,0,0,0,0,0,0
Otu0004,9,1,1104,1,6133,475,1,109,2,14,...,0,0,1,0,0,0,0,0,0,0
Otu0005,0,0,0,0,0,1,0,4173,24,3121,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Samples that have both a metadata row and a column in the OTU count table.
in_both = set(meta.index) & set(counts.columns)

# A set is not a valid .loc indexer (deprecated in pandas 1.5, rejected by
# pandas 2.x), so hand .loc a sorted list instead. Sorting also makes the
# starting row order reproducible from run to run.
meta = meta.loc[sorted(in_both)].sort_values(['BabyN', 'age_at_collection'])
print(meta.shape)

(775, 84)


  meta = meta.loc[in_both].sort_values(['BabyN', 'age_at_collection'])


In [5]:
def _subset_visit(metadata, otu_counts, visit_code, min_prevalence=0.2):
    """Select one visit's samples and the OTUs that are prevalent in them.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Sample metadata indexed by sample ID, with a ``VisitCode`` column.
    otu_counts : pandas.DataFrame
        Count table with OTUs as rows and sample IDs as columns.
    visit_code : str
        Value of ``VisitCode`` to keep (e.g. ``'V5'``).
    min_prevalence : float, default 0.2
        An OTU is kept only if it has a non-zero count in strictly more than
        this fraction of the visit's samples.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The metadata rows for the visit, and the count table restricted to
        those samples (same order as the metadata rows) and to prevalent OTUs.
    """
    visit_meta = metadata.loc[metadata['VisitCode'] == visit_code]
    visit_counts = otu_counts[visit_meta.index]
    # Number of samples in which each OTU was observed at least once.
    n_samples_present = (visit_counts > 0).sum(axis=1)
    prevalent = n_samples_present > visit_counts.shape[1] * min_prevalence
    return visit_meta, visit_counts.loc[prevalent]


# One (metadata, counts) pair per visit used in the correlation analysis.
meta_v5, counts_v5 = _subset_visit(meta, counts, 'V5')
meta_v6, counts_v6 = _subset_visit(meta, counts, 'V6')
meta_v7, counts_v7 = _subset_visit(meta, counts, 'V7')
meta_v9, counts_v9 = _subset_visit(meta, counts, 'V9')

Some samples have DTAPHib titers but no PCV titers. For the PCV-specific analysis we therefore repeat the per-visit filtering after removing the samples that lack PCV titers.

In [6]:
# Restrict the metadata to samples that have a PCV median titer value.
meta_PCV = meta.loc[meta['median_mmNorm_PCV'].notna()]
# Not the last expression of the cell, so this preview is evaluated but not displayed.
meta_PCV.head()

# For each visit: take that visit's PCV samples, pull their count columns, and
# keep the OTUs with a non-zero count in more than 20% of those samples.

# V5
meta_PCV_v5 = meta_PCV.query("VisitCode == 'V5'")
counts_PCV_v5 = counts[meta_PCV_v5.index].loc[
    lambda otus: (otus > 0).sum(axis=1) > otus.shape[1]*.2
]

# V6
meta_PCV_v6 = meta_PCV.query("VisitCode == 'V6'")
counts_PCV_v6 = counts[meta_PCV_v6.index].loc[
    lambda otus: (otus > 0).sum(axis=1) > otus.shape[1]*.2
]

# V7
meta_PCV_v7 = meta_PCV.query("VisitCode == 'V7'")
counts_PCV_v7 = counts[meta_PCV_v7.index].loc[
    lambda otus: (otus > 0).sum(axis=1) > otus.shape[1]*.2
]

# V9
meta_PCV_v9 = meta_PCV.query("VisitCode == 'V9'")
counts_PCV_v9 = counts[meta_PCV_v9.index].loc[
    lambda otus: (otus > 0).sum(axis=1) > otus.shape[1]*.2
]

## Correlations with median titer values

We will use Spearman's R as our correlation metric and use OTU abundances from the 2 month (V5), 4 month (V6), 6 month (V7) and 1 year (V9) time points. 2, 4 and 6 months are when vaccinations are given and 1 year is when titers were measured.

In [7]:
# Spearman correlation of every V5 OTU with the overall median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v5_correlations = (
    counts_v5.T
    .apply(spearmanr, b=meta_v5['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v5_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0104,-0.272545,0.02806,0.635003
Otu0023,0.257361,0.038485,0.635003
Otu0011,-0.211253,0.09117,0.675647
Otu0035,-0.207605,0.097031,0.675647
Otu0068,-0.204427,0.102371,0.675647


In [8]:
# Spearman correlation of every V6 OTU with the overall median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v6_correlations = (
    counts_v6.T
    .apply(spearmanr, b=meta_v6['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v6_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0029,0.299024,0.017282,0.343967
Otu0007,-0.2976,0.017848,0.343967
Otu0140,0.270193,0.032222,0.343967
Otu0014,0.2684,0.033429,0.343967
Otu0025,-0.261925,0.038108,0.343967


In [9]:
# Spearman correlation of every V7 OTU with the overall median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v7_correlations = (
    counts_v7.T
    .apply(spearmanr, b=meta_v7['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v7_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0140,-0.341066,0.005431,0.249816
Otu0027,-0.235725,0.058713,0.683525
Otu0047,0.215799,0.084261,0.683525
Otu0075,-0.214319,0.086465,0.683525
Otu0049,0.201552,0.107397,0.683525


In [10]:
# Spearman correlation of every V9 OTU with the overall median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v9_correlations = (
    counts_v9.T
    .apply(spearmanr, b=meta_v9['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v9_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0065,0.305348,0.011339,0.424769
Otu0023,0.289933,0.016472,0.424769
Otu0016,0.272606,0.024508,0.424769
Otu0010,0.262082,0.03085,0.424769
Otu0115,0.261411,0.031297,0.424769


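Across all tests, nothing is significant after multiple-testing correction. OTU 140 is nominally significant (raw p < 0.05) at both V6 and V7, but the sign of the correlation differs between the two visits (rho = 0.27 at V6, -0.34 at V7) and it is not strong enough to dig into further. The most raw-significant results are at 1 year (V9). Could this be another indicator that titer is affected by factors present at the time it is measured?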

## Correlations with median titer group values

Now we will split the titers into DTAPHib and PCV and test for significant correlations separately.

In [11]:
# Spearman correlation of every V5 OTU with the DTAPHib median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v5_DTAPHib_correlations = (
    counts_v5.T
    .apply(spearmanr, b=meta_v5['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v5_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0036,0.269762,0.029769,0.584765
Otu0029,0.260939,0.035777,0.584765
Otu0023,0.208451,0.095646,0.584765
Otu0022,0.207865,0.096603,0.584765
Otu0068,-0.205359,0.100782,0.584765


In [12]:
# Spearman correlation of every V5 OTU with the PCV median titer, using the
# PCV-specific sample subset.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v5_PCV_correlations = (
    counts_PCV_v5.T
    .apply(spearmanr, b=meta_PCV_v5['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v5_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0023,0.25691,0.04753,0.629053
Otu0104,-0.243666,0.06064,0.629053
Otu0011,-0.233991,0.071951,0.629053
Otu0035,-0.214262,0.100196,0.629053
Otu0079,-0.201801,0.122053,0.629053


In [13]:
# Spearman correlation of every V6 OTU with the DTAPHib median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v6_DTAPHib_correlations = (
    counts_v6.T
    .apply(spearmanr, b=meta_v6['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v6_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0059,-0.332116,0.007832,0.302081
Otu0134,0.317639,0.011188,0.302081
Otu0029,0.291052,0.020655,0.371783
Otu0040,0.247697,0.050314,0.679236
Otu0036,0.234684,0.064113,0.692419


In [14]:
# Spearman correlation of every V6 OTU with the PCV median titer, using the
# PCV-specific sample subset.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v6_PCV_correlations = (
    counts_PCV_v6.T
    .apply(spearmanr, b=meta_PCV_v6['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v6_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0029,0.329917,0.011435,0.383007
Otu0007,-0.303263,0.020662,0.383007
Otu0034,0.300091,0.022097,0.383007
Otu0025,-0.249917,0.058488,0.68797
Otu0040,0.237373,0.072787,0.68797


In [15]:
# Spearman correlation of every V7 OTU with the DTAPHib median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v7_DTAPHib_correlations = (
    counts_v7.T
    .apply(spearmanr, b=meta_v7['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v7_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0005,-0.314334,0.01077,0.495437
Otu0049,0.215649,0.084484,0.841144
Otu0140,-0.203925,0.103236,0.841144
Otu0015,-0.203734,0.103565,0.841144
Otu0037,-0.203222,0.104454,0.841144


In [16]:

# Spearman correlation of every V7 OTU with the PCV median titer, using the
# PCV-specific sample subset.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v7_PCV_correlations = (
    counts_PCV_v7.T
    .apply(spearmanr, b=meta_PCV_v7['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v7_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0140,-0.319534,0.012066,0.4853
Otu0010,0.294781,0.0211,0.4853
Otu0049,0.25307,0.04909,0.739043
Otu0047,0.235717,0.067432,0.739043
Otu0075,-0.211476,0.101825,0.739043


In [17]:
# Spearman correlation of every V9 OTU with the DTAPHib median titer.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v9_DTAPHib_correlations = (
    counts_v9.T
    .apply(spearmanr, b=meta_v9['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v9_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0008,-0.374099,0.001674,0.113845
Otu0123,0.243936,0.045001,0.840422
Otu0095,-0.2357,0.052996,0.840422
Otu0122,0.235481,0.053223,0.840422
Otu0050,-0.227529,0.062038,0.840422


In [18]:
# Spearman correlation of every V9 OTU with the PCV median titer, using the
# PCV-specific sample subset.
# apply() runs spearmanr once per OTU column of the transposed counts; the
# second transpose puts OTUs back on the rows with (rho, p-value) as columns.
v9_PCV_correlations = (
    counts_PCV_v9.T
    .apply(spearmanr, b=meta_PCV_v9['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis='columns')
    .assign(p_adj=lambda stats: p_adjust(stats['p_value']))
    .sort_values('p_value')
)
v9_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0065,0.370817,0.002774,0.191372
Otu0023,0.323433,0.009719,0.335312
Otu0016,0.28606,0.023039,0.375104
Otu0092,0.276624,0.028186,0.375104
Otu0029,0.27577,0.028696,0.375104


Still nothing significant after multiple testing correction. Still most raw significance at 1 year (V9) but nothing consistent enough to get excited about despite the low p-values.