# Decomp community

In [1]:
# qiime imports
import qiime2
from qiime2 import Artifact, Metadata

# General Tool Imports
import numpy as np
import pandas as pd
import collections
from pickle import load, dump
from IPython.display import display
import warnings

# Plotting Imports
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
from statannotations.Annotator import Annotator

import itertools
import scipy
import skbio

from skbio.stats import subsample_counts
from skbio import OrdinationResults
from statsmodels.sandbox.stats.multicomp import multipletests

%matplotlib inline

In [10]:
# Load the sample metadata mapping file (tab-separated; first column becomes the index).
# The table is large, so avoid displaying it in full.
samples = pd.read_csv(
    '57196_57196_analysis_mapping.txt',
    sep='\t',
    index_col=0,
)

In [11]:
# (n_rows, n_columns) of the sample metadata table
samples.shape

(951, 416)

In [43]:
# Number of samples per environmental package. Missing packages are kept as their own
# group (dropna=False); the value is the count of non-null `center_project_name` entries.
# NOTE(review): the recorded counts below sum to 724, which matches the filtered
# `samples.shape` shown later rather than the 951 rows loaded above, so this cell
# appears to have been last executed after the filtering cells.
(
    samples
    .groupby('environmental_package', dropna=False)[['center_project_name']]
    .count()
)

Unnamed: 0_level_0,center_project_name
environmental_package,Unnamed: 1_level_1
host-associated,368
microbial mat/biofilm,2
misc environment,15
not applicable,38
plant-associated,26
sediment,105
soil,117
wastewater/sludge,16
water,37


In [45]:
# Number of samples per EMP500 project title. Missing titles are kept as their own
# group (dropna=False); the value is the count of non-null `center_project_name` entries.
(
    samples
    .groupby('emp500_title', dropna=False)[['center_project_name']]
    .count()
)

Unnamed: 0_level_0,center_project_name
emp500_title,Unnamed: 1_level_1
active layer soil,10
animas watershed,10
australian algae,40
bioreactors for wastewater and anammox,16
biota oil (b2014.1 and b2016.1),1
captive aquatic mammal feces,4
captive bird feces,15
captive herp feces,15
captive terrestrial mammal feces,20
centralia soil,8


In [17]:
# Per-sample frequency detail; the first CSV column becomes the index.
# Presumably the index holds sample ids (it is matched against `samples.index` below).
reads = pd.read_csv(
    'emp_sample-frequency-detail.csv',
    index_col=0,
)

In [18]:
# Keep only the samples whose value in column '0' of `reads` (the read total) exceeds 1000.
has_enough_reads = reads['0'] > 1000
deep_sample_ids = reads.loc[has_enough_reads].index
samples = samples.loc[samples.index.isin(deep_sample_ids)].copy()

In [22]:
# Remove decomp samples: drop every row whose `project_name` contains 'decomp'.
# `na=False` makes a missing project name count as "not a decomp sample", so those
# rows are kept and the mask is strictly boolean. Without it, a NaN in the mask
# makes `~` / `.loc` raise instead of filtering.
is_decomp = samples['project_name'].str.contains('decomp', na=False)
samples = samples.loc[~is_decomp].copy()

In [23]:
# (n_rows, n_columns) of the metadata table after the read-depth and decomp filters
samples.shape

(724, 416)

In [15]:
# Load the QIIME 2 artifact and view it as a DataFrame.
# Rows are matched against `samples.index` and columns against ASV ids further down.
emp_data = (
    Artifact
    .load('171856_relative_frequency_table.qza')
    .view(pd.DataFrame)
)

In [20]:
# (n_rows, n_columns) of the feature table before restricting it to the kept samples
emp_data.shape

(928, 432075)

In [24]:
# Restrict the feature table to the rows whose index is still present in `samples`.
is_kept_sample = emp_data.index.isin(samples.index)
emp_data = emp_data.loc[is_kept_sample]

In [31]:
# Load the PMI decomposer ASV table (tab-separated; first column becomes the index).
pmi_decom = pd.read_csv(
    '../ASVs_repseq.txt',
    sep='\t',
    index_col=0,
)

In [32]:
# ASVs present in both the PMI decomposer table and the EMP feature table columns.
# Sorted so the resulting column order is identical on every run; plain set iteration
# order for strings can differ between interpreter sessions.
overlap = sorted(set(pmi_decom['asv']).intersection(emp_data.columns))

In [33]:
# number of ASVs shared between the PMI decomposer table and the feature table
len(overlap)

22

In [35]:
# Index the decomposer table by ASV so it aligns with the feature-table column labels.
# Guarded so that re-running this cell is a no-op: after the first run 'asv' is the
# index rather than a column, and an unguarded set_index('asv') raises KeyError.
if 'asv' in pmi_decom.columns:
    pmi_decom = pmi_decom.set_index('asv')

### by environment

In [37]:
# Mean abundance of each overlapping ASV within every environmental package.
# The grouping Series is aligned to the feature table by index.
type_summary = (
    emp_data.loc[:, overlap]
    .groupby(samples['environmental_package'])
    .mean()
)

In [41]:
# Collapse ASVs to genus: transpose so ASVs are rows, sum the per-package means of all
# ASVs that share a genus, then transpose back (packages as rows, genera as columns).
(
    type_summary
    .transpose()
    .groupby(pmi_decom['genus'])
    .sum()
    .transpose()
)

genus,Acinetobacter,Bacteroides,Ignatzschineria,Oblitimonas,Peptoniphilus,Savagea,Vagococcus,Wohlfahrtiimonas
environmental_package,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
host-associated,0.004023,2e-06,0.0,9.46101e-08,7e-06,0.0,1.474414e-05,0.0
microbial mat/biofilm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
misc environment,0.015969,0.0,0.0,0.0,0.0,0.0,0.0,0.0
not applicable,0.006233,0.0,0.0,0.0,0.0,0.0,0.0,0.0
plant-associated,0.002935,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sediment,0.000122,0.0,0.0,0.0,0.0,0.0,8.096584e-07,0.0
soil,1.5e-05,0.0,7.831712e-07,0.0,0.0,0.0,0.0,0.0
wastewater/sludge,0.000313,0.0,0.0,0.0,0.0,0.0,2.517402e-05,0.0
water,0.000148,0.0,0.0,0.0,2e-06,0.0,0.0,0.0


### by project - might be more descriptive

In [39]:
# Mean abundance of each overlapping ASV within every EMP500 project title.
# The grouping Series is aligned to the feature table by index.
project_summary = (
    emp_data.loc[:, overlap]
    .groupby(samples['emp500_title'])
    .mean()
)

In [40]:
# Collapse ASVs to genus: transpose so ASVs are rows, sum the per-project means of all
# ASVs that share a genus, then transpose back (projects as rows, genera as columns).
(
    project_summary
    .transpose()
    .groupby(pmi_decom['genus'])
    .sum()
    .transpose()
)

genus,Acinetobacter,Bacteroides,Ignatzschineria,Oblitimonas,Peptoniphilus,Savagea,Vagococcus,Wohlfahrtiimonas
emp500_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
active layer soil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
animas watershed,0.000413,0.0,0.0,0.0,0.0,0.0,0.0,0.0
australian algae,4e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bioreactors for wastewater and anammox,0.000313,0.0,0.0,0.0,0.0,0.0,2.517402e-05,0.0
biota oil (b2014.1 and b2016.1),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
captive aquatic mammal feces,4.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
captive bird feces,0.004095,0.0,0.0,0.0,3e-06,0.0,1.564007e-05,0.0
captive herp feces,0.017873,0.0,0.0,0.0,0.0,0.0,1.344267e-05,0.0
captive terrestrial mammal feces,0.005866,0.0,0.0,0.0,0.0,0.0,0.0002337323,0.0
centralia soil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### How many samples is each ASV found in?

In [51]:
# Number of samples in which each overlapping ASV is present (value > 0), per project,
# written out next to the ASV's genus/species labels.
# NOTE(review): the output goes to a hard-coded absolute path on one user's machine and
# is tab-separated despite the .csv extension; consider a relative, configurable path.
asv_taxonomy = pmi_decom.loc[pmi_decom.index.isin(overlap), ['genus', 'species']]
samples_with_asv = (
    emp_data[overlap]
    .groupby(samples['emp500_title'])
    .apply(lambda project_rows: (project_rows > 0).sum())
    .T
)
(
    pd.concat([asv_taxonomy, samples_with_asv], axis=1)
    .T
    .to_csv('/Users/victorianieciecki/Desktop/alsdjf.csv', sep='\t')
)

age - average count across the group   
decomposer - sum across the genus