- Filter out low abundance pathways
- Knock out human pathways


In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from biom import load_table
from skbio.stats.composition import ancom, closure
from statsmodels.sandbox.stats.multicomp import multipletests
import warnings
import seaborn as sns
from scipy.stats import kruskal
warnings.filterwarnings("ignore")
%matplotlib inline

These tables were obtained through 10k rarified OTU tables on fecal samples.  
The otu table was then normalized by copy number and metagenomic abundances were predicted from the normalized OTU table

In [2]:
table = load_table('data/ag_10k_fecal_pathway3.biom')
metadata = pd.read_table('data/ag_fecal.txt', index_col=0)
singles = pd.read_table('data/single_ids_10k.txt', index_col=0)

Now, we'll declare a set of pathways as artifacts.  Just because they are determined to have a match in the KEGG database, doesn't mean that they are a true match.  Some of the KEGGs below are human genes.

In [3]:
 artifacts = {  'Hypertrophic cardiomyopathy (HCM)',              
                'Antigen processing and presentation',
                'Bladder cancer','Amyotrophic lateral sclerosis (ALS)',
                'Photosynthesis', 'Adipocytokine signaling pathway',
                'MAPK signaling pathway - yeast','Prostate cancer',
                'NOD-like receptor signaling pathway',
                'Progesterone-mediated oocyte maturation',
                'Type I diabetes mellitus', 
                'Plant-pathogen interaction',
                'Ribosome biogenesis in eukaryotes',
                'Type II diabetes mellitus',
                'RIG-I-like receptor signaling pathway',
                'Cardiac muscle contraction',
                'Prion diseases',
                'Amyotrophic lateral sclerosis',
                'African trypanosomiasis',r"Huntington's disease",
                'Vibrio cholerae pathogenic cycle',
                'Retinol metabolism',
                'Salmonella infection',
                'Cardiac muscle contraction',
                'Type I diabetes mellitus'} 

In [4]:
read_filter = lambda val, id_, md: sum(val) > 1000 
sparse_filter = lambda val, id_, md: np.count_nonzero(val) / len(val) > 0.25 
pathway_filter = lambda val, id_, md: id_ not in artifacts 
table = table.filter(pathway_filter, axis='observation')
table = table.filter(read_filter, axis='observation')
table = table.filter(sparse_filter, axis='observation') 
 

We'll also convert all of the abundances to proportions.  Zeros will be handled by adding 1 to everything.

In [5]:
mat = np.array(table.matrix_data.todense()).T
df = pd.DataFrame(closure(mat+1), 
                  index=table.ids(axis='sample'), 
                  columns=table.ids(axis='observation'))

We'll only isolate the single samples, to avoid any dependence issues when running statistical tests.

Since the `TYPES_OF_PLANTS` was dropped in later studies, we'll only look at the participants that answered this question.

In [6]:
metadata = metadata.loc[singles.index, :]
metadata = metadata.loc[metadata['TYPES_OF_PLANTS'] != 'Unknown', :]
metadata = metadata.loc[pd.notnull(metadata['TYPES_OF_PLANTS']), :]

In [7]:
df = df.loc[list(set(df.index) & set(metadata.index)), :]
metadata = metadata.loc[list(set(df.index) & set(metadata.index)), :]

In [8]:
# res = ancom(df+1, metadata['TYPES_OF_PLANTS'])

In [9]:
f = lambda x : kruskal(*[x.loc[i] for i in metadata.groupby('TYPES_OF_PLANTS').groups.values()])
res = df.apply(f, axis=0)
reject, pvalues, _, _ = multipletests([x[1] for x in res], alpha=0.05)
res = res[reject]


We'll extract only the significant OTUs detected by ANCOM.  Then we'll log transform the data and subtract out the means.  It'll make it easier to visualize on the heatmaps.

In [10]:
sig = res[res['reject']==True]
df = df.loc[:, reject]
df = np.log(df)
df = df - df.mean(axis=0)
df = pd.merge(df, pd.DataFrame(metadata['TYPES_OF_PLANTS']), left_index=True, right_index=True)

KeyError: 'reject'

In [None]:
df.groupby('TYPES_OF_PLANTS').mean() 

In [None]:
grps = df.groupby('TYPES_OF_PLANTS')
subgrps = (grps.mean() / grps.std())[sig.index]

In [None]:
labs = ['Less than 5', '6 to 10', '11 to 20', '21 to 30', 'More than 30']
subgrps = subgrps.reindex_axis(labs)
subgrps = subgrps.T

In [None]:
subgrps

In [None]:
fig, ax = plt.subplots()
heatmap = ax.pcolor(subgrps, cmap=plt.cm.RdBu)

# put the major ticks at the middle of each cell
_ = ax.set_xticks(np.arange(subgrps.shape[1])+0.5, minor=False)
_ = ax.set_yticks(np.arange(subgrps.shape[0])+0.5, minor=False)

_ = ax.set_xticklabels(subgrps.columns, minor=False)
_ = ax.set_yticklabels(subgrps.index, minor=False)

Another way to visualize this is through boxplots.  This will give us a better idea about the variance between groups.

In [11]:
df = pd.DataFrame(np.log(closure(mat+1)), 
                  index=table.ids(axis='sample'), 
                  columns=table.ids(axis='observation'))
#df = df.loc[:, sig.index]
df = pd.merge(df, pd.DataFrame(metadata['TYPES_OF_PLANTS']), left_index=True, right_index=True)
mdf = pd.melt(df, id_vars='TYPES_OF_PLANTS', var_name='Pathway', value_name='Abundance')

NameError: name 'sig' is not defined

If the colors need to be changed, a dictionary can be passed into the palette parameter in the `boxplot` function.

In [None]:
font = {'family' : 'normal',
        'size'   : 28}

matplotlib.rc('font', **font)
fig, ax = plt.subplots(figsize=(20, 20))
_ = sns.boxplot(hue='TYPES_OF_PLANTS', x='Abundance', y='Pathway', data=mdf, 
                hue_order=labs,
                orient='h', ax=ax, notch=True)
ax.set_xlabel('Log % Abundance', fontsize=24)
ax.set_ylabel('Level 3 KEGG Pathway', fontsize=24)