In [1]:
from glob import glob
import pandas as pd
from scipy.stats import spearmanr
from statsmodels.sandbox.stats.multicomp import multipletests
import numpy as np
import seaborn as sns

# Use seaborn's 'whitegrid' style for any figures drawn in this notebook.
sns.set_style('whitegrid')

def p_adjust(pvalues, method='fdr_bh'):
    """Adjust p-values for multiple testing.

    Parameters
    ----------
    pvalues : array-like of float
        Raw p-values (e.g. a pandas Series or a list). NaN entries are allowed.
    method : str, default 'fdr_bh'
        Correction method name, passed through to ``multipletests``.

    Returns
    -------
    numpy.ndarray of float
        Adjusted p-values in the same order as the input. Positions that were
        NaN in the input are NaN in the output.
    """
    raw = np.asarray(pvalues, dtype=float)
    adjusted = np.full(raw.shape, np.nan, dtype=float)
    # NaN p-values (spearmanr yields NaN for a constant input) are excluded from
    # the correction: multipletests has no NaN handling, so they would otherwise
    # count as tests and can contaminate the adjusted values of the valid ones.
    valid = ~np.isnan(raw)
    if valid.any():
        adjusted[valid] = multipletests(raw[valid], method=method)[1]
    return adjusted

# Correlate nasal 16S with vaccine response

##### Michael Shaffer
##### 7/21/22
##### Merck ESC, Sys bio group

To look for associations between the nasal microbiome and vaccine response we have calculated correlations between the abundances of individual OTUs and the continuous titer measurements from 1 year of life.

## Read in data

In [2]:
# Per-sample metadata indexed by SampleID; age at collection is derived (in days)
# as CollectionDate minus DOB.
meta = (
    pd.read_csv('../../data/metadata/nasal/nasal_metadata.csv', index_col='SampleID')
    .assign(age_at_collection=lambda df: (pd.to_datetime(df['CollectionDate'])
                                          - pd.to_datetime(df['DOB'])).dt.days)
)
# Add the columns of the two companion tables, aligned on SampleID.
meta = pd.concat(
    [meta] + [pd.read_csv(path, index_col='SampleID')
              for path in ('../../data/metadata/nasal/nasal_abx_usage.csv',
                           '../../data/metadata/nasal/nasal_titers_yr2.csv')],
    axis=1,
)
# Keep only samples that have an overall median titer value.
meta = meta.loc[meta['median_mmNorm'].notna()]
meta.head()

Unnamed: 0_level_0,SubmissionType,SampleNumber,SampleIDValidation,DiversigenCheckInSampleName,ReplacesLowVolumeSampleID,BoxLocation,SampleType,SampleSource,SequencingType,BabyN,...,PCV ST9V_mmNorm,PCV ST14_mmNorm,PCV ST18C_mmNorm,PCV ST19A_mmNorm,PCV ST19F_mmNorm,PCV ST23F_mmNorm,median_mmNorm,median_mmNorm_DTAPHib,median_mmNorm_PCV,VR_group
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
103_V5_NS_A1,Primary in Tube,1,,,,"Box 1, A1",Nasal Swab,Human Infant,16S,103,...,0.277753,0.146465,0.086356,0.035467,0.057453,0.30091,0.190854,0.326317,0.146465,NVR
108_V4_NS_A1,Primary in Tube,6,,,,"Box 1, A9",Nasal Swab,Human Infant,16S,108,...,0.871843,0.366194,0.164958,0.041692,0.06359,0.371471,0.16843,0.172291,0.164958,NVR
108_V5_NS_A1,Primary in Tube,7,,,,"Box 1, B1",Nasal Swab,Human Infant,16S,108,...,0.871843,0.366194,0.164958,0.041692,0.06359,0.371471,0.16843,0.172291,0.164958,NVR
117_V3_NS_A1,Primary in Tube,12,,,,"Box 1, E1",Nasal Swab,Human Infant,16S,117,...,0.082399,0.1966,0.11454,0.007407,0.003003,0.015946,0.11454,0.283004,0.066531,NVR
202_V5_NS_A1,Primary in Tube,14,,,,"Box 1, E5",Nasal Swab,Human Infant,16S,202,...,0.353074,0.061921,0.156257,0.175745,0.058983,0.154029,0.086954,0.0,0.154029,LVR


In [3]:
# Tab-separated OTU table, transposed after loading so that rows are OTUs and
# columns are sample IDs.
counts = pd.read_csv('../../data/nasal/otu_table.gt10_rar10K.tsv', sep='\t', index_col=0).T
counts.head()

Unnamed: 0,101_S1_NS_A1,101_V3_NS_A1,101_V5_NS_A1,102_V1_NS_A1,102_V3_NS_A1,102_V5_NS_A1,102_V6_NS_A1,103_S1_NS_A1,103_S3_NS_A1,103_V10_NS_A1,...,MSA2002_5A,MSA2002_5B,MSA2002_6A,MSA2002_6B,MSA2002_7A,MSA2002_7B,MSA2002_8A,MSA2002_8B,MSA2002_9A,MSA2002_9B
Otu0001,1,0,0,1,2,0,2,1593,7320,606,...,3,4,1,1,0,2,1,1,1,0
Otu0002,5845,9876,692,557,783,509,6047,1,0,3,...,114,126,104,115,119,111,168,147,83,103
Otu0003,117,0,879,4392,1428,528,87,877,2642,1498,...,0,0,0,0,0,0,0,0,0,0
Otu0004,9,1,1104,1,6133,475,1,109,2,14,...,0,0,1,0,0,0,0,0,0,0
Otu0005,0,0,0,0,0,1,0,4173,24,3121,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Sample IDs present in both the metadata and the count table.
in_both = set(meta.index) & set(counts.columns)
# Select with a boolean mask instead of passing the set to .loc: sets are
# deprecated as indexers (and rejected by pandas >= 2.0), and the mask keeps
# the rows in a deterministic order before sorting.
meta = meta.loc[meta.index.isin(in_both)].sort_values(['BabyN', 'age_at_collection'])
print(meta.shape)

(645, 70)


  meta = meta.loc[in_both].sort_values(['BabyN', 'age_at_collection'])


In [5]:
def _subset_visit(meta_df, counts_df, visit_code, min_prevalence=.2):
    """Return the metadata and prevalence-filtered counts for one visit.

    Parameters
    ----------
    meta_df : pandas.DataFrame
        Sample metadata indexed by sample ID, with a ``VisitCode`` column.
    counts_df : pandas.DataFrame
        Count table with OTUs as rows and sample IDs as columns.
    visit_code : str
        Value of ``VisitCode`` to select (e.g. ``'V5'``).
    min_prevalence : float, default 0.2
        An OTU is kept only if it has a count > 0 in strictly more than this
        fraction of the visit's samples.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The metadata rows for the visit, and the counts for those samples
        (columns in the same order as the metadata rows) restricted to the
        OTUs that pass the prevalence filter.
    """
    visit_meta = meta_df.query("VisitCode == @visit_code")
    visit_counts = counts_df[visit_meta.index]
    n_detected = (visit_counts > 0).sum(axis=1)
    keep = n_detected > visit_counts.shape[1] * min_prevalence
    return visit_meta, visit_counts.loc[keep]


meta_v5, counts_v5 = _subset_visit(meta, counts, 'V5')
meta_v6, counts_v6 = _subset_visit(meta, counts, 'V6')
meta_v7, counts_v7 = _subset_visit(meta, counts, 'V7')
meta_v9, counts_v9 = _subset_visit(meta, counts, 'V9')

Some samples have DTAPHib titers but no PCV titers. For the analysis of PCV titers specifically, we therefore re-filter the data to keep only the samples that have PCV titers.

In [6]:
# Restrict the metadata to samples that have a PCV median titer.
meta_PCV = meta.loc[meta['median_mmNorm_PCV'].notna()]
meta_PCV.head()  # NOTE(review): not the cell's last expression, so nothing is displayed

# For each visit below: select that visit's samples, then keep the OTUs with a
# count > 0 in more than 20% of those samples.

# V5
meta_PCV_v5 = meta_PCV.query("VisitCode == 'V5'")
counts_PCV_v5 = counts.loc[:, meta_PCV_v5.index]
counts_PCV_v5 = counts_PCV_v5[counts_PCV_v5.gt(0).sum(axis=1) > .2 * counts_PCV_v5.shape[1]]

# V6
meta_PCV_v6 = meta_PCV.query("VisitCode == 'V6'")
counts_PCV_v6 = counts.loc[:, meta_PCV_v6.index]
counts_PCV_v6 = counts_PCV_v6[counts_PCV_v6.gt(0).sum(axis=1) > .2 * counts_PCV_v6.shape[1]]

# V7
meta_PCV_v7 = meta_PCV.query("VisitCode == 'V7'")
counts_PCV_v7 = counts.loc[:, meta_PCV_v7.index]
counts_PCV_v7 = counts_PCV_v7[counts_PCV_v7.gt(0).sum(axis=1) > .2 * counts_PCV_v7.shape[1]]

# V9
meta_PCV_v9 = meta_PCV.query("VisitCode == 'V9'")
counts_PCV_v9 = counts.loc[:, meta_PCV_v9.index]
counts_PCV_v9 = counts_PCV_v9[counts_PCV_v9.gt(0).sum(axis=1) > .2 * counts_PCV_v9.shape[1]]

## Correlations with median titer values

We will use Spearman's rank correlation coefficient (rho) as our correlation metric and use OTU abundances from the 2 month (V5), 4 month (V6), 6 month (V7) and 1 year (V9) time points. 2, 4 and 6 months are when vaccinations are given and 1 year is when titers were measured.

In [7]:
# Spearman correlation of every V5 OTU with the overall median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v5_correlations = (
    counts_v5.T
    .apply(spearmanr, b=meta_v5['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v5_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0037,-0.316588,0.026674,0.368315
Otu0006,0.309796,0.030297,0.368315
Otu0035,-0.307117,0.031833,0.368315
Otu0079,-0.300463,0.035933,0.368315
Otu0009,-0.280625,0.050807,0.4054


In [8]:
# Spearman correlation of every V6 OTU with the overall median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v6_correlations = (
    counts_v6.T
    .apply(spearmanr, b=meta_v6['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v6_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0009,-0.284144,0.052911,0.831504
Otu0002,-0.260329,0.077181,0.831504
Otu0050,-0.25428,0.084565,0.831504
Otu0015,-0.247202,0.093894,0.831504
Otu0011,-0.241853,0.101457,0.831504


In [9]:
# Spearman correlation of every V7 OTU with the overall median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v7_correlations = (
    counts_v7.T
    .apply(spearmanr, b=meta_v7['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v7_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0049,0.325078,0.021253,0.73654
Otu0068,-0.288976,0.041817,0.73654
Otu0006,-0.278456,0.050219,0.73654
Otu0039,-0.255738,0.073046,0.77184
Otu0004,-0.217258,0.129632,0.77184


In [10]:
# Spearman correlation of every V9 OTU with the overall median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v9_correlations = (
    counts_v9.T
    .apply(spearmanr, b=meta_v9['median_mmNorm'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v9_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0067,-0.252197,0.065804,0.919694
Otu0123,0.212457,0.122983,0.919694
Otu0040,0.208318,0.130627,0.919694
Otu0090,0.202293,0.142383,0.919694
Otu0014,0.199677,0.147727,0.919694


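Across all tests nothing is significant after multiple testing correction. OTU 140 is significant raw at V6 and V7 but not strong enough to dig into further. The most raw significant results are at 1 year. Is this another potential indicator that titer is affected by factors present when it is measured?

**TODO:** re-check this summary against the tables above. At the 0.05 level the displayed results show raw-significant OTUs only at V5 (four) and V7 (two), none at V6 or V9 (1 year), and OTU 140 does not appear among the top hits at any time point.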

## Correlations with median titer group values

Now we will split the titers into DTAPHib and PCV and test for significant correlations separately.

In [11]:
# Spearman correlation of every V5 OTU with the DTAPHib median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v5_DTAPHib_correlations = (
    counts_v5.T
    .apply(spearmanr, b=meta_v5['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v5_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0039,-0.279394,0.051874,0.987112
Otu0029,0.25739,0.074191,0.987112
Otu0068,-0.219554,0.129578,0.987112
Otu0001,0.213858,0.140081,0.987112
Otu0025,-0.198642,0.171226,0.987112


In [12]:
# Spearman correlation of every V5 OTU (PCV-titer samples only) with the PCV
# median titer, with adjusted p-values added and rows ordered by raw p-value.
v5_PCV_correlations = (
    counts_PCV_v5.T
    .apply(spearmanr, b=meta_PCV_v5['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v5_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0009,-0.381482,0.00684,0.180739
Otu0079,-0.37031,0.008817,0.180739
Otu0035,-0.316148,0.026897,0.208397
Otu0021,-0.308617,0.030965,0.208397
Otu0008,-0.305447,0.032824,0.208397


In [13]:
# Spearman correlation of every V6 OTU with the DTAPHib median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v6_DTAPHib_correlations = (
    counts_v6.T
    .apply(spearmanr, b=meta_v6['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v6_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0002,-0.342277,0.018524,0.77051
Otu0079,-0.272942,0.063421,0.77051
Otu0063,0.232247,0.116206,0.77051
Otu0001,0.224774,0.128766,0.77051
Otu0026,-0.22013,0.137066,0.77051


In [14]:
# Spearman correlation of every V6 OTU (PCV-titer samples only) with the PCV
# median titer, with adjusted p-values added and rows ordered by raw p-value.
v6_PCV_correlations = (
    counts_PCV_v6.T
    .apply(spearmanr, b=meta_PCV_v6['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v6_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0009,-0.320176,0.028234,0.85747
Otu0021,-0.282048,0.054763,0.85747
Otu0008,-0.27126,0.065134,0.85747
Otu0011,-0.252378,0.086997,0.85747
Otu0035,-0.246389,0.095015,0.85747


In [15]:
# Spearman correlation of every V7 OTU with the DTAPHib median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v7_DTAPHib_correlations = (
    counts_v7.T
    .apply(spearmanr, b=meta_v7['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v7_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0044,0.266439,0.061439,0.997358
Otu0034,-0.253045,0.076225,0.997358
Otu0072,-0.234675,0.100916,0.997358
Otu0036,-0.220739,0.123451,0.997358
Otu0050,-0.208518,0.146173,0.997358


In [16]:

# Spearman correlation of every V7 OTU (PCV-titer samples only) with the PCV
# median titer, with adjusted p-values added and rows ordered by raw p-value.
v7_PCV_correlations = (
    counts_PCV_v7.T
    .apply(spearmanr, b=meta_PCV_v7['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v7_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0049,0.392223,0.004845,0.210327
Otu0006,-0.357307,0.010855,0.210327
Otu0004,-0.344334,0.01434,0.210327
Otu0002,-0.232003,0.104971,0.740988
Otu0039,-0.228355,0.110705,0.740988


In [17]:
# Spearman correlation of every V9 OTU with the DTAPHib median titer, with
# adjusted p-values added and rows ordered by raw p-value.
v9_DTAPHib_correlations = (
    counts_v9.T
    .apply(spearmanr, b=meta_v9['median_mmNorm_DTAPHib'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v9_DTAPHib_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0093,-0.346945,0.010163,0.741886
Otu0008,-0.287901,0.034768,0.968702
Otu0002,-0.27483,0.044301,0.968702
Otu0095,-0.240592,0.079696,0.968702
Otu0003,0.211718,0.124323,0.968702


In [18]:
# Spearman correlation of every V9 OTU (PCV-titer samples only) with the PCV
# median titer, with adjusted p-values added and rows ordered by raw p-value.
v9_PCV_correlations = (
    counts_PCV_v9.T
    .apply(spearmanr, b=meta_PCV_v9['median_mmNorm_PCV'])
    .T
    .set_axis(['rho', 'p_value'], axis=1)
    .assign(p_adj=lambda df: p_adjust(df['p_value']))
    .sort_values('p_value')
)
v9_PCV_correlations.head()

Unnamed: 0,rho,p_value,p_adj
Otu0014,0.356955,0.008059,0.588275
Otu0138,0.304657,0.025096,0.916009
Otu0090,0.281898,0.03891,0.94681
Otu0067,-0.264554,0.053214,0.954329
Otu0006,-0.23597,0.085833,0.954329


Still nothing significant after multiple testing correction. Still most raw significance at 1 year (V9) but nothing consistent enough to get excited about despite the low p-values.