# Decomp community

In [42]:
# qiime imports
import qiime2
from qiime2 import Artifact, Metadata

# General Tool Imports
import numpy as np
import pandas as pd
import collections
from pickle import load, dump
from IPython.display import display
import warnings

# Plotting Imports
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
# from statannotations.Annotator import Annotator

import itertools
import scipy
import skbio

from skbio.stats import subsample_counts
from skbio import OrdinationResults
from statsmodels.sandbox.stats.multicomp import multipletests

%matplotlib inline

In [43]:
# read the SRA run table as QIIME 2 metadata, then convert it to a DataFrame
sample_metadata = Metadata.load('SraRunTable.txt')
samples = sample_metadata.to_dataframe()

In [44]:
# keep only the samples annotated as active decay (copy so later edits don't touch `samples`)
is_active_decay = samples['decomp_stage'].eq('active_decay')
decay = samples.loc[is_active_decay].copy()

In [45]:
# load the feature table artifact and view it as a DataFrame
table_artifact = Artifact.load('table_r1.qza')
data = table_artifact.view(pd.DataFrame)

In [46]:
# total reads per row of the feature table, attached to `decay` by index alignment
reads_per_sample = data.sum(axis=1)
decay = decay.assign(reads=reads_per_sample)

In [47]:
# show the per-sample read totals as a one-column table
decay['reads'].to_frame()

Unnamed: 0_level_0,reads
sample-id,Unnamed: 1_level_1
0COWd025.30,36379.0
0COWd024.30,3423.0
0COWd023.30,76118.0
0COWd027.30,58195.0
0COWd026.30,60343.0
0COWd028.30,52440.0
0COWd029.30,40682.0
0COWd030.30,86762.0
0COWd031.30,58603.0
0COWd032.30,7857.0


**TODO:** I'm not sure what the `.20` vs `.30` suffix on the sample IDs means, or which set of samples we should use.

In [48]:
# keep only samples with more than 1000 reads
passes_depth = decay['reads'].gt(1000)
decay = decay.loc[passes_depth].copy()

In [49]:
# number of non-null 'Run' entries per decomposition stage (NaN stages kept as their own group)
runs_by_stage = decay.groupby(['decomp_stage'], dropna=False)['Run']
runs_by_stage.count().to_frame()

Unnamed: 0_level_0,Run
decomp_stage,Unnamed: 1_level_1
active_decay,42


In [50]:
# restrict the feature table to the samples that remain in `decay` (> 1000 reads)
in_decay = data.index.isin(decay.index)
data = data.loc[in_decay]

In [51]:
# drop columns whose reads sum to zero over the remaining samples
column_totals = data.sum(axis=0)
data = data.loc[:, column_totals.ne(0)]

In [52]:
# load the representative sequences and expose them as a DataFrame
rep_seqs_artifact = Artifact.load('rep_seqs_r1.qza')
seqs = rep_seqs_artifact.view(qiime2.Metadata).to_dataframe()

In [53]:
# (rows, columns) of the representative-sequence table as loaded
seqs.shape

(6544, 1)

In [54]:
# keep only sequences whose ID is still a column of the filtered feature table
is_observed = seqs.index.isin(data.columns)
seqs = seqs.loc[is_observed]

In [55]:
# (rows, columns) after keeping only sequences whose IDs are columns of `data`
seqs.shape

(2426, 1)

### Import Decomposers

In [56]:
# tab-separated decomposer table; the first column becomes the index
pmi_decom = pd.read_csv(
    '../ASVs_repseq.txt',
    sep='\t',
    index_col=0,
)

In [57]:
# number of non-null '150_asv' entries per genus
asvs_by_genus = pmi_decom.groupby('genus')['150_asv']
asvs_by_genus.count().to_frame()

Unnamed: 0_level_0,150_asv
genus,Unnamed: 1_level_1
Acinetobacter,5
Bacteroides,9
Ignatzschineria,4
Oblitimonas,1
Peptoniphilus,5
Savagea,3
Vagococcus,5
Wohlfahrtiimonas,3


In [58]:
# pmi decomposers found in cow data
# Index.intersection(sort=False) keeps the IDs in pmi_decom's row order, so the
# list (and the column order of anything indexed with it) is the same on every
# run; list(set(...)) ordering varies between kernel sessions.
cow_decomp = pmi_decom.index.intersection(seqs.index, sort=False).tolist()

In [59]:
# report how many decomposer ASVs are shared with the cow data
n_shared_asvs = len(cow_decomp)
print("{} ASVs were found in cow data".format(n_shared_asvs))

26 ASVs were found in cow data


## Collapse reads data table on 150 bp ASVs
Sum the reads of ASVs that share the same 150 bp sequence.

In [60]:
# make relative-abundance (ra) table: divide every row by its own total
# vectorised division replaces the per-row Python lambda; same values, no
# Python-level loop over rows
ra = data.div(data.sum(axis=1), axis=0)

In [61]:
## mean relative abundance of each shared decomposer ASV within each decomp stage
decomp_ra = ra.loc[:, cow_decomp]
ra_cow = decomp_ra.groupby(decay['decomp_stage']).mean()

In [62]:
## per-ASV means summed within each genus
asv_means = ra_cow.T
asv_means.groupby(pmi_decom['genus']).sum()

decomp_stage,active_decay
genus,Unnamed: 1_level_1
Acinetobacter,8e-06
Bacteroides,0.008243
Ignatzschineria,0.036005
Oblitimonas,3.1e-05
Peptoniphilus,0.00062
Savagea,0.047552
Vagococcus,0.001299
Wohlfahrtiimonas,0.001433


### How many samples are these decomposer ASVs found in?

In [63]:
# number of samples each ASV is found in, next to its genus/species annotation
decomp_taxa = pmi_decom.loc[pmi_decom.index.isin(cow_decomp), ['genus', 'species']]
n_samples_present = (
    ra[cow_decomp]
    .groupby(decay['decomp_stage'])
    .apply(lambda x: (x > 0).sum())
    .T
)
pd.concat([decomp_taxa, n_samples_present], axis=1)

Unnamed: 0,genus,species,active_decay
9acc238746a1f2aa7745a0b5720c4eac,Acinetobacter,,1
6e8986f8088b452f964e7968bb8bca87,Acinetobacter,,3
174b6959ecfedfee56c9daf6ffa45d2b,Savagea,uncultured bacterium,36
7567be5b5c8b7a16dbb1a84f6b46d965,Savagea,uncultured bacterium,2
8a8c381201ca5a905366fdf242c076db,Savagea,uncultured bacterium,2
837e0d796b199b8c9b462d97ad3c5599,Oblitimonas,Oblitimonas alkaliphila,6
b08c80963ce4a0df0518836d2da209ce,Vagococcus,,1
8e87132c368c4f56dd114b1cab5f59a6,Vagococcus,,15
cceb21819b328ad472f3e5fa20b3cbd9,Vagococcus,,9
92460066faddd83314cbc2348bf4fd29,Vagococcus,,12


In [64]:
## percent of samples each ASV is found in, next to its genus/species annotation
decomp_annot = pmi_decom.loc[pmi_decom.index.isin(cow_decomp), ['genus', 'species']]
counts_by_stage = data[cow_decomp].groupby(decay['decomp_stage'])
n_present = counts_by_stage.apply(lambda x: (x > 0).sum()).T
n_total = counts_by_stage.count().T
pd.concat([decomp_annot, n_present / n_total * 100], axis=1)

Unnamed: 0,genus,species,active_decay
9acc238746a1f2aa7745a0b5720c4eac,Acinetobacter,,2.380952
6e8986f8088b452f964e7968bb8bca87,Acinetobacter,,7.142857
174b6959ecfedfee56c9daf6ffa45d2b,Savagea,uncultured bacterium,85.714286
7567be5b5c8b7a16dbb1a84f6b46d965,Savagea,uncultured bacterium,4.761905
8a8c381201ca5a905366fdf242c076db,Savagea,uncultured bacterium,4.761905
837e0d796b199b8c9b462d97ad3c5599,Oblitimonas,Oblitimonas alkaliphila,14.285714
b08c80963ce4a0df0518836d2da209ce,Vagococcus,,2.380952
8e87132c368c4f56dd114b1cab5f59a6,Vagococcus,,35.714286
cceb21819b328ad472f3e5fa20b3cbd9,Vagococcus,,21.428571
92460066faddd83314cbc2348bf4fd29,Vagococcus,,28.571429


### Look for other Wohlfahrtiimonas ASVs

In [65]:
# load the taxonomy artifact and view it as a DataFrame
taxonomy_artifact = Artifact.load('taxonomy_cow.qza')
taxa = taxonomy_artifact.view(pd.DataFrame)

In [66]:
# preview the taxonomy table loaded from taxonomy_cow.qza
taxa

Unnamed: 0_level_0,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1ee2e27913ab9da1f90335edc08cdd41,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.999993574820448
a4e3c4670582850046c5b8861bb2b55d,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9949952317699843
31ebb433ebf0b3e83259d826875b4438,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9999999504926806
491d5271c1aae8cf1c7838972148c36f,d__Bacteria; p__Firmicutes; c__Bacilli; o__Bac...,0.9405262599343626
3e45cb330ef8321eb9c6da765103f82e,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.9997986751531613
...,...,...
272fd5bb7d45eea7228a18bcb514bcc8,d__Bacteria; p__Actinobacteriota; c__Actinobac...,0.991352683918789
c79f4725cd4d3aee747768baec21ab77,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9893301558754087
1986030e3c5cd5df6d869054bc96e536,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.8804667917601529
3f12521ca90170cca7506c92e8ad8979,d__Bacteria; p__Firmicutes; c__Clostridia; o__...,0.7758298232771055


In [67]:
# attach each feature's sequence from `seqs` as an 'asv' column (aligned on the index)
taxa = taxa.assign(asv=seqs['Sequence'])

In [71]:
# taxonomy strings of every feature whose Taxon contains 'wohlf' (case-insensitive)
matches_wohlf = taxa['Taxon'].str.contains('wohlf', case=False)
taxa.loc[matches_wohlf, 'Taxon'].values

array(['d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ignatzschineria; s__Ignatzschineria_sp.',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ignatzschineria',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Wohlfahrtiimonas; s__Wohlfahrtiimonas_chitiniclastica',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Koukoulia; s__Koukoulia_aurantiaca',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ignatzschineria; s__Ignatzschineria_sp.',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ig

In [94]:
# every feature whose taxonomy string contains 'Wohlf' (case-insensitive)
has_wohlf = taxa['Taxon'].str.contains('Wohlf', case=False)
wohl = taxa.loc[has_wohlf].copy()

In [95]:
# taxonomy strings currently held in `wohl`
wohl.Taxon.values

array(['d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ignatzschineria; s__Ignatzschineria_sp.',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ignatzschineria',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Wohlfahrtiimonas; s__Wohlfahrtiimonas_chitiniclastica',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Koukoulia; s__Koukoulia_aurantiaca',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ignatzschineria; s__Ignatzschineria_sp.',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Ig

In [96]:
# drop every row whose taxonomy string contains 'g__Ignat' (g__Ignatzschineria)
is_ignatzschineria = wohl['Taxon'].str.contains('g__Ignat')
wohl = wohl.loc[~is_ignatzschineria]

In [97]:
# taxonomy strings left in `wohl` after the g__Ignat rows were removed
wohl.Taxon.values

array(['d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Wohlfahrtiimonas; s__Wohlfahrtiimonas_chitiniclastica',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Koukoulia; s__Koukoulia_aurantiaca',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Wohlfahrtiimonas; s__Wohlfahrtiimonas_chitiniclastica',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Cardiobacteriales; f__Wohlfahrtiimonadaceae; g__Koukoulia; s__Koukoulia_aurantiaca'],
      dtype=object)

In [98]:
# all wohl in dataset
# Index.intersection(sort=False) keeps the IDs in the feature table's column
# order, so the list is identical on every run; list(set(...)) ordering varies
# between kernel sessions.
cow_wohl = data.columns.intersection(wohl.index, sort=False).tolist()

In [99]:
# get new wohl asvs - remove those found in pmi decomposers
# sorted() gives a stable order regardless of how cow_wohl was built, so the
# printed list and the column order of data[other_wohl] are reproducible.
other_wohl = sorted(set(cow_wohl).difference(pmi_decom.index))

In [100]:
# list the ASV IDs collected in `other_wohl`
other_wohl

['2852607842f0cae1ef1c11dc0de80c3d',
 '6a8bf8de9e5fd208b17866cb3323520a',
 'd2fe79722e176a4eea6e323ee5dc2358',
 '67056d02c33331ac5e790f715d5dfdee']

In [101]:
# sanity check: rows of pmi_decom indexed by any ID in other_wohl
# (expected to be empty, since other_wohl was built by excluding pmi_decom's index)
in_other_wohl = pmi_decom.index.isin(other_wohl)
pmi_decom[in_other_wohl]

Unnamed: 0_level_0,taxonomy,150_asv,100_bp,kingdom,phylum,class,order,family,genus,species
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [102]:
# total reads of each ASV in other_wohl, summed over all retained samples
data.loc[:, other_wohl].sum(axis=0)

2852607842f0cae1ef1c11dc0de80c3d      28.0
6a8bf8de9e5fd208b17866cb3323520a    3961.0
d2fe79722e176a4eea6e323ee5dc2358     254.0
67056d02c33331ac5e790f715d5dfdee      82.0
dtype: float64

In [103]:
## mean relative abundance of each ASV in other_wohl within each decomp stage
wohl_ra = ra.loc[:, other_wohl]
ra_wohl = wohl_ra.groupby(decay['decomp_stage']).mean()

In [104]:
# sum the per-ASV means into a single value per decomp stage
stage_totals = ra_wohl.sum(axis=1)
stage_totals.to_frame(name='Wohlfahrtiimonadaceae')

Unnamed: 0_level_0,Wohlfahrtiimonadaceae
decomp_stage,Unnamed: 1_level_1
active_decay,0.001488


In [105]:
# length of the first representative sequence
# .iloc makes the positional lookup explicit: `seqs` is indexed by feature ID,
# so a bare [0] relied on the deprecated positional fallback of Series.__getitem__
len(seqs['Sequence'].iloc[0])

150