- Filter out low abundance pathways
- Knock out human pathways


In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from biom import load_table
from skbio.stats.composition import ancom, closure
from statsmodels.sandbox.stats.multicomp import multipletests
import warnings
import seaborn as sns
from scipy.stats import kruskal
warnings.filterwarnings("ignore")
%matplotlib inline

ImportError: No module named composition

These tables were derived from 10k rarefied OTU tables of fecal samples.  
The OTU table was then normalized by copy number, and metagenomic abundances were predicted from the normalized OTU table.

In [None]:
# Pathway-level BIOM table for the fecal samples.
table = load_table('data/ag_10k_fecal_pathway3.biom')

# Two tab-separated tables indexed by their first column: the sample metadata
# and the table whose index is later used to subset that metadata.
metadata, singles = (
    pd.read_table(path, index_col=0)
    for path in ('data/ag_fecal.txt', 'data/single_ids_10k.txt')
)

Now we'll declare a set of pathways as artifacts.  Just because a pathway has a match in the KEGG database doesn't mean that it is a true match.  Some of the KEGG entries below correspond to human genes.

In [2]:
 # Pathway names treated as artifacts and excluded from the analysis.
 # Each name is listed exactly once (the earlier literal repeated
 # 'Cardiac muscle contraction' and 'Type I diabetes mellitus'), sorted
 # alphabetically so additions are easy to review.
 artifacts = {
     'Adipocytokine signaling pathway',
     'African trypanosomiasis',
     'Amyotrophic lateral sclerosis',
     'Amyotrophic lateral sclerosis (ALS)',
     'Antigen processing and presentation',
     'Bladder cancer',
     'Cardiac muscle contraction',
     "Huntington's disease",
     'Hypertrophic cardiomyopathy (HCM)',
     'MAPK signaling pathway - yeast',
     'NOD-like receptor signaling pathway',
     'Photosynthesis',
     'Plant-pathogen interaction',
     'Prion diseases',
     'Progesterone-mediated oocyte maturation',
     'Prostate cancer',
     'RIG-I-like receptor signaling pathway',
     'Retinol metabolism',
     'Ribosome biogenesis in eukaryotes',
     'Salmonella infection',
     'Type I diabetes mellitus',
     'Type II diabetes mellitus',
     'Vibrio cholerae pathogenic cycle',
 }

In [4]:
# Thresholds used by the observation filters below.
MIN_TOTAL_COUNT = 1000   # a pathway must have more than this many counts in total
MIN_PREVALENCE = 0.25    # ... and be non-zero in more than this fraction of samples


def read_filter(val, id_, md):
    """Return True when the summed values of one observation exceed MIN_TOTAL_COUNT.

    The (values, id, metadata) signature is the callback shape expected by
    ``table.filter``; only ``val`` is used.
    """
    return np.sum(val) > MIN_TOTAL_COUNT


def sparse_filter(val, id_, md):
    """Return True when more than MIN_PREVALENCE of the values are non-zero.

    The comparison is written as a product instead of ``count / len`` so the
    result does not collapse to 0 under Python 2 integer division.
    """
    return np.count_nonzero(val) > MIN_PREVALENCE * len(val)


def pathway_filter(val, id_, md):
    """Return True when the observation id is not one of the declared artifacts."""
    return id_ not in artifacts


# Apply the filters in the original order: artifacts first, then abundance,
# then prevalence.
for keep in (pathway_filter, read_filter, sparse_filter):
    table = table.filter(keep, axis='observation')
 

We'll also convert all of the abundances to proportions.  Zeros will be handled by adding 1 to everything.

In [5]:
# Dense count matrix, transposed so that rows are samples and columns are
# observations (pathways), matching the index/columns assigned below.
mat = table.matrix_data.toarray().T

sample_ids = table.ids(axis='sample')
pathway_ids = table.ids(axis='observation')

# Add 1 to every entry before closure so that no zeros remain.
df = pd.DataFrame(closure(mat + 1), index=sample_ids, columns=pathway_ids)

We'll only isolate the single samples, to avoid any dependence issues when running statistical tests.

Since the `TYPES_OF_PLANTS` question was dropped in later studies, we'll only look at the participants who answered this question.

In [6]:
# Restrict the metadata to the IDs listed in `singles`.
metadata = metadata.loc[singles.index, :]

# Keep only rows with a usable TYPES_OF_PLANTS answer: neither the literal
# 'Unknown' nor a missing value.
plants = metadata['TYPES_OF_PLANTS']
answered = (plants != 'Unknown') & plants.notnull()
metadata = metadata.loc[answered, :]

In [7]:
# Align the abundance table and the metadata on the samples present in both.
# The shared IDs are computed once with Index.intersection and reused, so both
# frames end up with the same rows in the same, reproducible order (building
# two separate lists from Python sets gave an arbitrary order for each frame).
shared_ids = df.index.intersection(metadata.index)
df = df.loc[shared_ids, :]
metadata = metadata.loc[shared_ids, :]

In [8]:
# ANCOM call left disabled; the Kruskal-Wallis tests further down are what
# actually populate `res`.
# res = ancom(df+1, metadata['TYPES_OF_PLANTS'])

In [11]:
# (number of samples kept, number of pathways kept) for the aligned table.
df.shape

(2170, 241)

In [12]:
# Sample IDs belonging to each TYPES_OF_PLANTS category. Computed once here
# rather than once per pathway.
plant_groups = list(metadata.groupby('TYPES_OF_PLANTS').groups.values())


def kruskal_by_plant_group(abundances):
    """Run a Kruskal-Wallis H-test on one pathway across the plant categories.

    Parameters
    ----------
    abundances : pd.Series
        Values of a single pathway, indexed by sample ID.

    Returns
    -------
    The result of ``scipy.stats.kruskal``; element 1 is the p-value.
    """
    return kruskal(*[abundances.loc[ids] for ids in plant_groups])


# One test result per pathway, indexed by pathway name. The Series is built
# explicitly because DataFrame.apply may expand tuple-like return values into
# a DataFrame (depending on the pandas version), which would break the
# p-value extraction below.
res = pd.Series([kruskal_by_plant_group(df[col]) for col in df.columns],
                index=df.columns)

# Correct the p-values for multiple testing and keep the rejected hypotheses.
raw_pvalues = [result[1] for result in res]
reject, pvalues, _, _ = multipletests(raw_pvalues, alpha=0.05)
res = res[reject]

We'll extract only the pathways found significant by the Kruskal-Wallis tests above (after multiple-testing correction).  Then we'll log transform the data and subtract out the means.  It'll make it easier to visualize on the heatmaps.

In [13]:
# Number of pathways whose null hypothesis was rejected.
reject.sum()

20

In [14]:
# Keep only the pathways flagged by `reject`, take logs, and centre every
# pathway on its mean across samples.
log_selected = np.log(df.loc[:, reject])
centered = log_selected - log_selected.mean(axis=0)

# Attach the plant-diversity category of each sample by index.
plant_column = pd.DataFrame(metadata['TYPES_OF_PLANTS'])
df = pd.merge(centered, plant_column, left_index=True, right_index=True)

In [15]:
# Columns are the retained pathways plus the TYPES_OF_PLANTS column.
df.shape

(2170, 21)

In [16]:
# Mean of every retained pathway column within each TYPES_OF_PLANTS category.
df.groupby('TYPES_OF_PLANTS').mean() 

Unnamed: 0_level_0,Aminoacyl-tRNA biosynthesis,Ascorbate and aldarate metabolism,Bacterial chemotaxis,Bacterial motility proteins,Bacterial toxins,Biotin metabolism,Chromosome,Cytoskeleton proteins,D-Alanine metabolism,Flagellar assembly,Germination,"Glycine, serine and threonine metabolism",Inorganic ion transport and metabolism,Lipoic acid metabolism,Other ion-coupled transporters,Others,Phosphatidylinositol signaling system,Photosynthesis proteins,Protein folding and associated processing,Ribosome Biogenesis
TYPES_OF_PLANTS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
11 to 20,-0.000387,-0.009587,0.023501,0.028655,-0.008415,-0.001378,0.001221,0.002638,-6e-05,0.0337,0.011252,-0.000968,-0.00212,-0.02438,0.000305,-0.000658,-0.005353,0.000695,-0.001112,0.00037
21 to 30,0.003942,-0.000177,-0.008451,-0.013596,0.004539,0.005482,-0.000189,0.008643,-0.005923,-0.012746,0.031977,-0.000497,-0.013011,0.036866,-0.005666,-0.00274,0.00227,0.0047,-0.00503,-0.000293
6 to 10,-0.005159,0.022369,-0.032154,-0.034554,0.007026,0.006511,-0.00275,-0.009553,-0.004716,-0.041401,-0.043055,0.003493,0.01544,0.031331,0.001692,0.004006,0.011701,-0.012548,0.009234,-0.003075
Less than 5,-0.019087,0.070182,-0.095169,-0.091275,0.022948,0.016557,-0.014488,-0.047938,-0.006582,-0.134589,-0.193176,0.007717,0.062967,0.085175,0.024058,0.018772,0.035473,-0.032266,0.023069,-0.016097
More than 30,0.014307,-0.055197,0.068589,0.066969,-0.013256,-0.023903,0.010157,0.025045,0.018973,0.090565,0.110754,-0.007281,-0.038023,-0.097006,-0.009024,-0.011801,-0.030608,0.030269,-0.018696,0.013347


In [17]:
# Per-category mean divided by per-category standard deviation, per pathway.
grps = df.groupby('TYPES_OF_PLANTS')
group_mean = grps.mean()
group_std = grps.std()
subgrps = group_mean.div(group_std)

In [18]:
# Category labels in increasing order of plant diversity; also reused as the
# hue order of the boxplot.
labs = ['Less than 5', '6 to 10', '11 to 20', '21 to 30', 'More than 30']

# Put the categories in that order, then transpose so pathways are rows and
# categories are columns. `reindex` replaces `reindex_axis`, which was
# deprecated and later removed from pandas.
subgrps = subgrps.reindex(labs).T

In [19]:
# Rows are pathways; columns are the plant categories in the order of `labs`.
subgrps

TYPES_OF_PLANTS,Less than 5,6 to 10,11 to 20,21 to 30,More than 30
Aminoacyl-tRNA biosynthesis,-0.185367,-0.050826,-0.003977,0.04591,0.147931
Ascorbate and aldarate metabolism,0.260098,0.094207,-0.037934,-0.000765,-0.221269
Bacterial chemotaxis,-0.225383,-0.086868,0.069084,-0.024551,0.21307
Bacterial motility proteins,-0.1876,-0.079662,0.072802,-0.03358,0.18071
Bacterial toxins,0.103803,0.039746,-0.055175,0.030213,-0.089864
Biotin metabolism,0.101184,0.059939,-0.011948,0.04972,-0.22248
Chromosome,-0.23579,-0.046194,0.020967,-0.003742,0.183208
Cytoskeleton proteins,-0.234723,-0.055171,0.015215,0.060708,0.163233
D-Alanine metabolism,-0.055234,-0.047119,-0.000617,-0.071222,0.221367
Flagellar assembly,-0.21939,-0.0813,0.071896,-0.027098,0.208004


In [None]:
fig, ax = plt.subplots()

# Use colour limits symmetric around zero so that zero sits at the midpoint
# of the diverging colormap and the two halves encode sign consistently.
# nanmax ignores cells left empty by a category with no samples.
limit = np.nanmax(np.abs(subgrps.values))
heatmap = ax.pcolor(subgrps.values, cmap=plt.cm.RdBu, vmin=-limit, vmax=limit)

# Colour scale so the figure can be read on its own.
cbar = fig.colorbar(heatmap, ax=ax)
cbar.set_label('Group mean / group std')

# put the major ticks at the middle of each cell
_ = ax.set_xticks(np.arange(subgrps.shape[1])+0.5, minor=False)
_ = ax.set_yticks(np.arange(subgrps.shape[0])+0.5, minor=False)

_ = ax.set_xticklabels(subgrps.columns, minor=False)
_ = ax.set_yticklabels(subgrps.index, minor=False)
_ = ax.set_xlabel('TYPES_OF_PLANTS')

Another way to visualize this is through boxplots.  This will give us a better idea about the variance between groups.

In [None]:
# Log-proportions for every sample and pathway, rebuilt from the raw counts
# (1 is added to every entry before closure so no zeros reach the log).
log_abundance = pd.DataFrame(np.log(closure(mat+1)),
                             index=table.ids(axis='sample'),
                             columns=table.ids(axis='observation'))

# Keep only the pathways flagged by `reject`. This restores the selection that
# had been commented out (it referred to an undefined `sig`); without it every
# pathway would be drawn in the boxplot.
log_abundance = log_abundance.loc[:, reject]

# Attach each sample's plant category by index, then reshape to long form
# (one row per sample/pathway pair) for seaborn.
df = pd.merge(log_abundance, pd.DataFrame(metadata['TYPES_OF_PLANTS']),
              left_index=True, right_index=True)
mdf = pd.melt(df, id_vars='TYPES_OF_PLANTS', var_name='Pathway', value_name='Abundance')

If the colors need to be changed, a dictionary can be passed into the palette parameter in the `boxplot` function.

In [None]:
# Global font settings for the figure. 'normal' is not a font family (it is a
# weight/style value), so matplotlib could not resolve it; 'sans-serif' names
# a valid generic family.
font = {'family': 'sans-serif',
        'size': 28}

matplotlib.rc('font', **font)
fig, ax = plt.subplots(figsize=(20, 20))

# Horizontal notched boxplots: one row per pathway, one box per plant
# category, with categories ordered as in `labs`.
_ = sns.boxplot(hue='TYPES_OF_PLANTS', x='Abundance', y='Pathway', data=mdf,
                hue_order=labs,
                orient='h', ax=ax, notch=True)

# Assign the return values so the Text repr is not echoed as cell output.
_ = ax.set_xlabel('Log % Abundance', fontsize=24)
_ = ax.set_ylabel('Level 3 KEGG Pathway', fontsize=24)
In [None]:
# Shape of the log-abundance frame after merging in TYPES_OF_PLANTS.
df.shape