# The goal of this notebook is only to generate the diversity metrics for the Arivale and the American Gut cohorts

In [1]:
import pandas as pd
import qiime2 as q2

## Generate diversity metrics for the Arivale cohort

In [2]:
asvs = pd.read_csv(".../arivale/microbiome/16S_processed/asvs.csv")
asvs.head()

Unnamed: 0,id,hash,count
0,22001612560016|GFM-1079-001,c2d3fc09212e226b3a2c3398a1af9436,1285
1,22001612560023|GFM-1079-001,c2d3fc09212e226b3a2c3398a1af9436,2431
2,22001612560062|GFM-1079-001,c2d3fc09212e226b3a2c3398a1af9436,4908
3,22001612560065|GFM-1079-001,c2d3fc09212e226b3a2c3398a1af9436,3944
4,22001612560067|GFM-1079-001,c2d3fc09212e226b3a2c3398a1af9436,5990


In [3]:
# convert asvs to wide format
asvs_wide = asvs.pivot_table(index="id", columns="hash", values="count", fill_value=0, aggfunc="sum")
asvs_wide.head()

hash,00002d83dec0ceabeb1fe4135a2b6ab8,00018b0d514b0f1a47ce15eaa4d247f6,0001eda93a1e6802360bf98488c24869,0002120f916215b077ef0a08d66d2716,00030c11306e21d37b6746c3bde4bf04,00031aa1f9af3c685c27f1f9cb698264,000496f456d876909e556b995d280025,0004b081705699c3c278c0b6d630f14a,00051aaf814992134ca85d5cbcc588c8,0005993ad01aab8760c90b7f99cc6d0e,...,fff8d0f53285381dcd098a6ad052cb75,fff8e7e1b7930fe1667b0fee51cef795,fff9867a3f784827a9c99a2765d05c60,fff9934d86277762cbaa6418cd456f6d,fffa3392a7ab7ae875772f1f004fe140,fffa9d186997056849b930a68dd8b121,fffbae09f8a62b3498565b4fe8838846,fffc51faa053251845837ffc43231b30,fffe4057d41c6ef3b16a370498eb01a9,ffff9eed68c63035d7f56c61d7603794
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22001612560009|GFM-1079-007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22001612560012|GFM-1079-011,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22001612560014|GFM-1079-016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22001612560015|GFM-1079-015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22001612560016|GFM-1079-001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
#confirm number of reads per sample was conserved
all(asvs_wide.sum(axis=1) == asvs.groupby("id")["count"].sum())

True

In [5]:
table = q2.Artifact.import_data("FeatureTable[Frequency]", asvs_wide)
table

<artifact: FeatureTable[Frequency] uuid: 51525ccd-a6da-474b-a299-0434b0c4a93a>

In [None]:
table.save(".../arivale_cohort/arivale_table.qza")

In [None]:
# From the feature table, I need to visualize the data so I can rarefy to the minimum 
! qiime feature-table summarize \
    --i-table .../arivale_cohort/arivale_table.qza \
    --o-visualization .../arivale_cohort/summary.qzv


In [None]:
# rarefy the table 13700, which is rouhgly the sample minimum
! qiime feature-table rarefy --i-table .../arivale_cohort/arivale_table.qza \
    --p-sampling-depth 13703 \
    --o-rarefied-table .../arivale_cohort/rare13703_table.qza


In [None]:
# Get simpson diversity
! qiime diversity alpha \
--i-table .../arivale_cohort/rare13703_table.qza \
--p-metric simpson \
--o-alpha-diversity .../arivale_cohort/simpson_diversity.qza

In [10]:
# In the command line, unzip the simpson_diversity.qza file so we can read in the tsv
simpson = pd.read_csv('.../arivale_cohort/simpson/data/alpha-diversity.tsv', sep = '\t')
simpson.head()


Unnamed: 0.1,Unnamed: 0,simpson
0,22001612560009|GFM-1079-007,0.982643
1,22001612560012|GFM-1079-011,0.965961
2,22001612560014|GFM-1079-016,0.963845
3,22001612560015|GFM-1079-015,0.947365
4,22001612560016|GFM-1079-001,0.983761


## Now do the same for the American Gut cohort

In [11]:
# Save the location of the biom file
american_gut = '.../american_gut_cohort/american_gut/deblur_125nt_no_blooms.biom/'

In [12]:
table = q2.Artifact.import_data("FeatureTable[Frequency]", american_gut)
table

<artifact: FeatureTable[Frequency] uuid: f44bf2ef-eb2c-4125-8389-84aa29672552>

In [13]:
table.save(".../american_gut_cohort/table.qza")

'/proj/gibbons/kramos/emergence_trials/to_publish/american_gut_cohort/table.qza'

In [None]:
# From the feature table, I need to visualize the data so I can rarefy to the minimum 
! qiime feature-table summarize \
    --i-table .../american_gut_cohort/table.qza \
    --o-visualization .../american_gut_cohort/summary.qzv

In [None]:
# rarefy the table 1250
! qiime feature-table rarefy \
    --i-table .../american_gut_cohort/table.qza \
    --p-sampling-depth 1250 \
    --o-rarefied-table .../american_gut_cohort/rare1250_amgut_table.qza



In [None]:
! qiime diversity alpha \
    --i-table .../american_gut_cohort/rare1250_amgut_table.qza \
    --p-metric simpson \
    --o-alpha-diversity .../american_gut_cohort/simpson.qza

In [17]:
# In the command line, unzip the simpson.qza so we can read in the tsv and take a look at it
simpson = pd.read_csv('.../american_gut_cohort/simpson/data/alpha-diversity.tsv', sep = '\t')
simpson.head()
                      

Unnamed: 0.1,Unnamed: 0,simpson
0,10317.000012326,0.940233
1,10317.000054127,0.928506
2,10317.000030973,0.896672
3,10317.00006916,0.922163
4,10317.000059092,0.656485
