# Decomp community

In [1]:
# qiime imports
import qiime2
from qiime2 import Artifact, Metadata

# General Tool Imports
import numpy as np
import pandas as pd
import collections
from pickle import load, dump
from IPython.display import display
import warnings

# Plotting Imports
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
from statannotations.Annotator import Annotator

import itertools
import scipy
import skbio

from skbio.stats import subsample_counts
from skbio import OrdinationResults
from statsmodels.sandbox.stats.multicomp import multipletests

%matplotlib inline

In [2]:
# The full AGP mapping file is huge, so only the columns needed for this analysis
# are read with pandas instead of loading everything through qiime2:
# samples = Metadata.load('56935_56935_analysis_mapping.txt').to_dataframe()
samples = pd.read_csv(
    '56935_56935_analysis_mapping.txt',
    sep='\t',
    # Name the ID column explicitly: combined with `usecols`, a positional
    # index_col only works while '#SampleID' is the first selected column in the file.
    index_col='#SampleID',
    # Keep sample IDs as text so numeric-looking IDs are not parsed as numbers.
    dtype={'#SampleID': str},
    usecols=[
        '#SampleID', 'sample_type', 'ibs', 'diabetes', 'lifestage', 'host_weight', 'ibd', 'cancer',
        'kidney_disease', 'clinical_condition', 'alzheimers', 'host_height',
        'autoimmune', 'sex', 'antibiotic_history', 'host_age', 'lung_disease', 'sibo',
        'cdiff', 'host_body_mass_index', 'age_cat', 'cardiovascular_disease',
        'thyroid', 'diet_type',
    ],
)

In [3]:
# Number of samples in each reported age category (before any filtering).
samples.age_cat.value_counts().to_frame()

Unnamed: 0,age_cat
30s,5693
50s,5681
60s,5339
40s,5299
not applicable,5022
20s,3603
70+,1882
child,1116
teen,683
baby,360


In [4]:
# Number of samples per body site / sample type; missing values get their own row.
samples['sample_type'].value_counts(dropna=False).to_frame()

Unnamed: 0,sample_type
feces,26557
control blank,3643
mouth,2230
skin of hand,610
skin of head,576
stool,216
nares,202
water,147
skin of torso,121
mucus,95


In [5]:
# Per-sample read totals; the first CSV column becomes the index.
reads = pd.read_csv(
    'AGP-sample-frequency-detail.csv',
    index_col=0,
)

In [6]:
# Keep only the metadata rows whose sample has more than 1000 reads
# (the read totals live in the column named '0').
samples = samples.loc[
    samples.index.isin(reads.loc[reads['0'].gt(1000)].index)
].copy()

In [7]:
# (rows, columns) of the metadata left after the read-depth filter.
samples.shape

(29478, 23)

### Filter down data

In [8]:
# Build the 'age_group' label used for the comparisons below.
# Young adults are taken from the categorical age bins.
samples.loc[samples.age_cat.isin(['20s','30s']), 'age_group'] = '20-30s'
# Blanks get their own label so they are kept as a separate group.
samples.loc[samples.sample_type=='control blank', 'age_group'] = 'control blank'
# host_age contains placeholder strings; rows with the known ones are dropped first.
age_df = samples.drop(samples.loc[samples['host_age'].isin(['not applicable', 'LabControl test', 'nan'])].index)
# Coerce rather than astype(float): any other non-numeric placeholder becomes NaN
# (and so never passes the > 65 test) instead of raising ValueError.
host_age_years = pd.to_numeric(age_df['host_age'], errors='coerce')
# Strictly older than 65. Assigned last, so it overrides a label set above
# for any sample that matches both conditions.
samples.loc[samples.index.isin(host_age_years[host_age_years > 65.0].index), 'age_group'] = '65+'

In [9]:
# Number of samples per assigned age group (unlabelled samples are not counted).
samples.age_group.value_counts().to_frame()

Unnamed: 0,age_group
20-30s,8679
65+,4259
control blank,552


In [10]:
# Collapse every 'skin of ...' site into a single 'skin' sample type;
# rows with a missing sample_type are treated as non-matching.
samples.loc[
    samples.sample_type.str.contains('skin of ', na=False),
    'sample_type',
] = 'skin'

In [11]:
# Restrict the metadata to the age groups and sample types compared below.
# NOTE(review): samples labelled 'stool' are not in the sample-type list and are
# therefore dropped here - confirm this is intended.
samples = samples.loc[
    samples.age_group.isin(['20-30s', '65+', 'control blank'])
    & samples.sample_type.isin(['feces', 'mouth', 'skin', 'control blank'])
].copy()

In [12]:
# Sample count for every (age group, sample type) combination.
(
    samples
    .groupby(['age_group', 'sample_type'])
    .agg({'sample_type': 'count'})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_type
age_group,sample_type,Unnamed: 2_level_1
20-30s,feces,6956
20-30s,mouth,736
20-30s,skin,609
65+,feces,3566
65+,mouth,361
65+,skin,169
control blank,control blank,552


In [15]:
# Export the filtered metadata for ANCOM-BC, leaving out the control blanks.
# Disabled: uncomment the line below to (re)write 'metadata_AGP_filtered.txt'.
# samples.loc[samples['age_group']!='control blank'].to_csv('metadata_AGP_filtered.txt', sep='\t')

### get data

In [14]:
# Load the relative-frequency feature table as a DataFrame
# (samples in rows, features in columns).
data = (
    Artifact
    .load('170788_relative_frequency_table.qza')
    .view(pd.DataFrame)
)

In [15]:
# (rows, columns) of the relative-frequency table.
data.shape

(31914, 1105456)

### get decomposer data

In [15]:
# Load the PMI decomposer ASV table (tab-separated, first column as index).
pmi_decom = pd.read_csv(
    '../ASVs_repseq.txt',
    sep='\t',
    index_col=0,
)

In [16]:
# Index the decomposer table by its 'asv' column.
# Guarded so the cell can be re-run: once 'asv' is the index it is no longer a
# column, and calling set_index('asv') a second time would raise KeyError.
if pmi_decom.index.name != 'asv':
    pmi_decom = pmi_decom.set_index('asv')

In [17]:
# ASVs present in both the American Gut table and the PMI decomposer list.
# Sorted so that everything derived from `overlap` has the same column order on
# every run; the iteration order of a set of strings is not stable between kernels.
overlap = sorted(set(pmi_decom.index).intersection(data.columns))

In [18]:
# Number of PMI decomposer ASVs that also occur in the American Gut table.
len(overlap)

25

In [19]:
# Keep only the feature columns that are PMI decomposer ASVs.
data_pmi = data.loc[:, overlap].copy()

In [20]:
# Keep only the samples still present in the filtered metadata, i.e. those with
# > 1000 reads that also fall in the selected age groups and sample types.
data_pmi = data_pmi.loc[data_pmi.index.isin(samples.index)].copy()

In [21]:
# Mean relative abundance of each decomposer ASV per (age group, sample type).
# Despite its name, this variable holds means, not sums.
age_groups_sum = (
    data_pmi
    .groupby([samples['age_group'], samples['sample_type']])
    .mean()
)

In [22]:
# Roll the per-ASV group means up to genus level by summing the ASVs of each genus.
age_groups_sum.transpose().groupby(pmi_decom['genus']).sum()

age_group,20-30s,20-30s,20-30s,65+,65+,65+,control blank
sample_type,feces,mouth,skin,feces,mouth,skin,control blank
genus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Acinetobacter,0.001686661,0.00321,0.01119698,0.001961287,0.008575,0.02927339,0.005628907
Bacteroides,7.237315e-08,0.0,0.0,3.509862e-08,0.0,0.0,0.0
Ignatzschineria,2.423888e-08,0.0,7.224659e-06,1.956917e-07,0.0,2.834928e-05,1.116316e-06
Oblitimonas,8.059469e-09,0.0,9.333608e-07,0.0,0.0,0.0002131614,0.0
Peptoniphilus,0.0004423425,0.00056,0.002166594,9.420406e-05,0.001072,0.001830159,0.0009481306
Savagea,0.0,0.0,9.670374e-07,8.490047e-08,0.0,7.828484e-07,0.0
Vagococcus,3.397592e-06,0.0,2.449248e-05,3.132048e-06,0.0,6.310735e-05,2.08195e-06
Wohlfahrtiimonas,2.637813e-07,0.0,0.0,0.0,0.0,5.641531e-06,8.473312e-07


### All data summary

In [None]:
# mean abundance for each sample group
# type_summary = data[overlap].groupby(samples.sample_type).mean()

In [None]:
# group by asv genus - add up abundance
# type_summary.T.groupby(pmi_decom['genus']).sum()

In [None]:
# group samples by age category - get mean abundance
# NOTE(review): `data_gut` is not defined anywhere in the visible notebook; define it
# (or switch to `data`) before re-enabling this line.
# age_summary = data_gut[overlap].groupby(samples.age_cat).mean()

In [None]:
# group asvs by genus - add up abundance
# age_summary.T.groupby(pmi_decom['genus']).sum()

Age: mean relative abundance across the samples in each group.   
Decomposer: relative abundance summed across the ASVs assigned to each genus.

### how many samples are these decomposers found in?

In [24]:
# Number of samples in which each ASV is detected (relative abundance > 0),
# per (age group, sample type), shown next to the ASV's genus and species.
pd.concat(
    [
        pmi_decom.loc[pmi_decom.index.isin(overlap), ['genus', 'species']],
        data_pmi.groupby([samples['age_group'], samples['sample_type']])
                .apply(lambda grp: grp.gt(0).sum())
                .T,
    ],
    axis=1,
)

Unnamed: 0,genus,species,"(20-30s, feces)","(20-30s, mouth)","(20-30s, skin)","(65+, feces)","(65+, mouth)","(65+, skin)","(control blank, control blank)"
TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGTGCGTAGGCGGCTTTTTAAGTCGGATGTGAAATCCCCGAGCTTAACTTGGGAATTGCATTCGATACTGGGAAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,255,30,262,121,18,99,112
TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGTGCGTAGGCGGCTTTTTAAGTCGGATGTGAAATCCCTGAGCTTAACTTAGGAATTGCATTCGATACTGGGAAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,215,35,424,124,36,111,55
TACAGAGGGTGCAAGCGTTAATCGGATTTACTGGGCGTAAAGCGTACGTAGGCGGCTAATTAAGTCGGATGTGAAATCCCCGAGCTCAACTTGGGAATTGCATTCGATACTGGTTAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,0,0,22,0,0,0,0
TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGTACGTAGGCGGCTTTTTAAGTCGGATGTGAAATCCCTGAGCTTAACTTAGGAATTGCATTCGATACTGGGAAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,2,0,8,1,0,3,2
TACAGAGGGTGCAAGCGTTAATCGGATTTACTGGGCGTAAAGCGCGCGTAGGCGGCTAATTAAGTCAAATGTGAAATCCCCGAGCTTAACTTGGGAATTGCATTCGATACTGGTTAGCTAGAGTGTGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,95,16,209,44,12,99,57
TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCACGCAGGCGGCCTTCTAAGTCTGATGTGAAATCCCACGGCTTAACCGTGGAAGGTCATTGGAAACTGGAAGGCTTGAGGATAGAAGAGGAAAGTGGAATTCCACG,Savagea,uncultured bacterium,0,0,2,0,0,1,0
TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCACGCAGGCGGCCTTTTAAGTCTGATGTGAAATCCCACGGCTTAACCGTGGAAGGTCATTGGAAACTGGAAGGCTTGAGGATAGAAGAGGAAAGTGGAATTCCACG,Savagea,uncultured bacterium,0,0,0,1,0,0,0
TACGAAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGTAGGTGGTTTGTTAAGTTGGAAGTGAAAGCCCCGGGCTCAACCTGGGAATTGCTTTCAAAACTAGCAGGCTAGAGTACAGTAGAGGGTAGTGGAATTTCCTG,Oblitimonas,Oblitimonas alkaliphila,1,0,1,0,0,3,0
TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCCTCGGCTCAACCGAGGAAGGTCATTGGAAACTGGGAGACTTGAGTGCAGAAGAGGAGAGTGGAATTCCATG,Vagococcus,,1,0,5,0,0,1,0
TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCCTCGGCTCAACCGAGGAAGGTCATTGGAAACTGGAGGACTTGAGTGCAGAAGAGGAGAGTGGAATTCCATG,Vagococcus,,7,0,3,1,0,1,1


In [27]:
# Percentage of samples in which each ASV is detected, per (age group, sample type):
# samples with abundance > 0 divided by the samples in the group, times 100.
pd.concat(
    [
        pmi_decom.loc[pmi_decom.index.isin(overlap), ['genus', 'species']],
        (
            data_pmi.groupby([samples['age_group'], samples['sample_type']])
                    .apply(lambda grp: grp.gt(0).sum())
                    .T
            / data_pmi.groupby([samples['age_group'], samples['sample_type']]).count().T
            * 100
        ),
    ],
    axis=1,
)

Unnamed: 0,genus,species,"(20-30s, feces)","(20-30s, mouth)","(20-30s, skin)","(65+, feces)","(65+, mouth)","(65+, skin)","(control blank, control blank)"
TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGTGCGTAGGCGGCTTTTTAAGTCGGATGTGAAATCCCCGAGCTTAACTTGGGAATTGCATTCGATACTGGGAAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,3.6659,4.076087,43.021346,3.393158,4.98615,58.579882,20.289855
TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGTGCGTAGGCGGCTTTTTAAGTCGGATGTGAAATCCCTGAGCTTAACTTAGGAATTGCATTCGATACTGGGAAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,3.090857,4.755435,69.622332,3.477285,9.972299,65.680473,9.963768
TACAGAGGGTGCAAGCGTTAATCGGATTTACTGGGCGTAAAGCGTACGTAGGCGGCTAATTAAGTCGGATGTGAAATCCCCGAGCTCAACTTGGGAATTGCATTCGATACTGGTTAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,0.0,0.0,3.612479,0.0,0.0,0.0,0.0
TACAGAGGGTGCGAGCGTTAATCGGATTTACTGGGCGTAAAGCGTACGTAGGCGGCTTTTTAAGTCGGATGTGAAATCCCTGAGCTTAACTTAGGAATTGCATTCGATACTGGGAAGCTAGAGTATGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,0.028752,0.0,1.313629,0.028043,0.0,1.775148,0.362319
TACAGAGGGTGCAAGCGTTAATCGGATTTACTGGGCGTAAAGCGCGCGTAGGCGGCTAATTAAGTCAAATGTGAAATCCCCGAGCTTAACTTGGGAATTGCATTCGATACTGGTTAGCTAGAGTGTGGGAGAGGATGGTAGAATTCCAGG,Acinetobacter,,1.365727,2.173913,34.318555,1.233875,3.3241,58.579882,10.326087
TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCACGCAGGCGGCCTTCTAAGTCTGATGTGAAATCCCACGGCTTAACCGTGGAAGGTCATTGGAAACTGGAAGGCTTGAGGATAGAAGAGGAAAGTGGAATTCCACG,Savagea,uncultured bacterium,0.0,0.0,0.328407,0.0,0.0,0.591716,0.0
TACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCACGCAGGCGGCCTTTTAAGTCTGATGTGAAATCCCACGGCTTAACCGTGGAAGGTCATTGGAAACTGGAAGGCTTGAGGATAGAAGAGGAAAGTGGAATTCCACG,Savagea,uncultured bacterium,0.0,0.0,0.0,0.028043,0.0,0.0,0.0
TACGAAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGTAGGTGGTTTGTTAAGTTGGAAGTGAAAGCCCCGGGCTCAACCTGGGAATTGCTTTCAAAACTAGCAGGCTAGAGTACAGTAGAGGGTAGTGGAATTTCCTG,Oblitimonas,Oblitimonas alkaliphila,0.014376,0.0,0.164204,0.0,0.0,1.775148,0.0
TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCCTCGGCTCAACCGAGGAAGGTCATTGGAAACTGGGAGACTTGAGTGCAGAAGAGGAGAGTGGAATTCCATG,Vagococcus,,0.014376,0.0,0.821018,0.0,0.0,0.591716,0.0
TACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCCTCGGCTCAACCGAGGAAGGTCATTGGAAACTGGAGGACTTGAGTGCAGAAGAGGAGAGTGGAATTCCATG,Vagococcus,,0.100633,0.0,0.492611,0.028043,0.0,0.591716,0.181159


## Export Filtered Sample Dataframe for AncomBC

use filtered sample metadata to filter frequency table:
  
qiime feature-table filter-samples \
  --i-table table.qza \
  --m-metadata-file samples-to-keep.tsv \
  --o-filtered-table id-filtered-table.qza
  
OR    

qiime feature-table filter-samples \
  --i-table table.qza \
  --m-metadata-file sample-metadata.tsv \
  --p-where "[subject]='subject-1'" \
  --o-filtered-table subject-1-filtered-table.qza

Make new data tables for:
- skin
- mouth  

Then run ANCOM-BC on `age_group`.