### Calculating vertebrate gut microbiome alpha diversity for the Groussin and the Song datasets. Then run regressions on all vertebrate datasets

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.simplefilter('ignore')
import qiime2 as q2

### Load the Groussin data

In [None]:
# Root directory holding the Groussin_data, Flo_data and output folders referenced in this notebook
GenPath = '...'

In [2]:
# Read the tab-separated Groussin OTU count table
otus = pd.read_csv(f'{GenPath}/Groussin_data/otuTable_44444uniqSeqs.txt', sep='\t')


In [3]:
# Move the 'sample' column into the index so the remaining columns are the OTUs.
# Guarded on the index name so that re-running this cell is a no-op instead of
# raising KeyError (after the first run 'sample' is no longer a column).
if otus.index.name != 'sample':
    otus = otus.set_index('sample')
otus.head()

Unnamed: 0_level_0,BlackRhino1_1137430*size=2*,ZebraSTL1_1300880*size=1*,Giraffe2_1140929*size=1*,Gazelle3_1140525*size=9*,Giraffe2_1145120*size=2*,Okapi2_1097870*size=1*,Gazelle3_1138247*size=1*,Gazelle3_1120629*size=1*,Gazelle3_1215385*size=4*,ZebraSTL1_1162298*size=3*,...,Chimp1_1231977*size=3*,Chimp1_1245120*size=1*,Orang1_1037687*size=1*,Orang1_1245894*size=3*,Chimp1_1113691*size=1*,GorillaSTL_1193612*size=1*,GorillaSTL_1260380*size=2*,RTLemur_1302590*size=1*,Chimp1_1282036*size=1*,GorillaSTL_1115335*size=1*
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AfElphSD3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Armadillo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BaboonSTL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BigHornSD,0,0,0,1,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
BlackBr2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Import the OTU table as a FeatureTable[Frequency] artifact and save it to disk.
# The '/' after GenPath matters: the qiime CLI cells read
# <root>/Groussin_data/diversity/feature_table.qza, and every other path in this
# notebook is built as GenPath + '/...'.
table = q2.Artifact.import_data("FeatureTable[Frequency]", otus)
table.save(GenPath+'/Groussin_data/diversity/feature_table.qza')


### Rarefy the otu table

In [None]:
# Take my otu table that I've already imported and summarize it 
!qiime feature-table summarize \
    --i-table .../Groussin_data/diversity/feature_table.qza \
    --o-visualization .../Groussin_data/diversity/FTsummary.qzv


In [None]:
# Visualized the summary, will be continuing with rarefying to 1266
!qiime feature-table rarefy \
    --i-table .../Groussin_data/diversity/feature_table.qza \
    --p-sampling-depth 1266 \
    --o-rarefied-table .../Groussin_data/diversity/rarefiedFT.qza 

In [None]:
! qiime diversity alpha \
    --i-table .../Groussin_data/diversity/rarefiedFT.qza \
    --p-metric simpson \
    --o-alpha-diversity .../Groussin_data/diversity/simpson.qza


### Repeat this process for the Song et al. data provided by Florent Mazel, which has already been filtered according to the paper **Transmission mode and dispersal traits correlated with host specificity in mammalian gut microbes**

In [8]:
# Load the space-delimited Song et al. sample metadata and preview it
song_metadata = pd.read_csv(f'{GenPath}/Flo_data/Song_Metadata.txt', sep=' ')
song_metadata.head()


Unnamed: 0,SampleID,studyID,deblurred_seqs,pd_5k,shannon_5k,sample_type,preservative,sex,healthy,captive_wild,...,initial_fastq_name,dada2_output_names,Other,duplicated_sample,is_sub_species,Species_name_Corresponding,Time_Tree_curated,Order,Family,Diet
1,X11212.AC673,11212,43494,34.21504159,6.546511187,fecal,etoh,unknown,unknown,wild,...,11212.AC673.gz,11212.AC673.gz_F_filt.fastq,_F_filt.fastq,False,False,unknown,Alouatta_caraya,Primates,Atelidae,herbivore
2,X11212.AC674,11212,40022,39.34137459,6.747977045,fecal,etoh,unknown,unknown,wild,...,11212.AC674.gz,11212.AC674.gz_F_filt.fastq,_F_filt.fastq,False,False,unknown,Alouatta_caraya,Primates,Atelidae,herbivore
3,X11212.AC679,11212,17199,43.65041219,6.803884218,fecal,etoh,unknown,unknown,wild,...,11212.AC679.gz,11212.AC679.gz_F_filt.fastq,_F_filt.fastq,False,False,unknown,Alouatta_caraya,Primates,Atelidae,herbivore
4,X11212.AC680,11212,25815,33.68887383,6.790575961,fecal,etoh,unknown,unknown,wild,...,11212.AC680.gz,11212.AC680.gz_F_filt.fastq,_F_filt.fastq,False,False,unknown,Alouatta_caraya,Primates,Atelidae,herbivore
5,X11212.AC683,11212,26953,40.57833243,6.736080096,fecal,etoh,unknown,unknown,wild,...,11212.AC683.gz,11212.AC683.gz_F_filt.fastq,_F_filt.fastq,False,False,unknown,Alouatta_caraya,Primates,Atelidae,herbivore


In [9]:
# Load the filtered ASV count table (space-delimited) and transpose it,
# so that each row is a sample and each column an ASV
song_asvs = pd.read_csv(f'{GenPath}/Flo_data/Song_ASVs_counts_filt.txt', sep=' ').T
song_asvs.head()

Unnamed: 0,ASV_1,ASV_2,ASV_3,ASV_4,ASV_5,ASV_6,ASV_7,ASV_8,ASV_9,ASV_10,...,ASV_52655,ASV_52657,ASV_52709,ASV_52727,ASV_52912,ASV_53147,ASV_53156,ASV_53158,ASV_53160,ASV_53161
11212.AC673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11212.AC674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11212.AC679,57,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11212.AC680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11212.AC683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Wrap the Song ASV table as a FeatureTable[Frequency] artifact and write it to disk
# for the qiime CLI cells that follow
table = q2.Artifact.import_data('FeatureTable[Frequency]', song_asvs)
table.save(f'{GenPath}/Flo_data/diversity/song_table.qza')

In [None]:
!qiime feature-table summarize \
    --i-table .../Flo_data/diversity/song_table.qza \
    --o-visualization .../Flo_data/diversity/song_summary.qzv

In [None]:
!qiime feature-table rarefy \
    --i-table ...Flo_data/diversity/song_table.qza \
    --p-sampling-depth 5035 \
    --o-rarefied-table /proj.../Flo_data/diversity/song_table_rare.qza \
    

In [None]:
! qiime diversity alpha \
--i-table .../Flo_data/diversity/song_table_rare.qza \
--p-metric simpson \
--o-alpha-diversity .../Flo_data/diversity/song_simpson.qza


In [14]:
# Read the per-sample Simpson values and name the ID column 'SampleID'.
# NOTE(review): this expects song_simpson.qza to have been extracted to
# song_simpson/ outside the notebook — confirm.
song_simpson = (
    pd.read_csv(f'{GenPath}/Flo_data/diversity/song_simpson/data/alpha-diversity.tsv', sep='\t')
    .rename(columns={'Unnamed: 0': 'SampleID'})
)
song_simpson.head()

Unnamed: 0,SampleID,simpson
0,11212.AC673,0.973291
1,11212.AC674,0.972535
2,11212.AC679,0.97237
3,11212.AC680,0.973233
4,11212.AC683,0.975682


### In our dataset, the sampleIDs in the ASV table don't have 'X' in front, so we will be adding that back in so we can merge with the metadata later

In [15]:
# Prefix the IDs with 'X' so they match the SampleID format used in the metadata.
# IDs that already start with 'X' are left untouched, so re-running this cell
# does not produce 'XX…' IDs that would then fail to merge.
# NOTE(review): assumes no raw ASV-table ID legitimately begins with 'X' — confirm.
already_prefixed = song_simpson['SampleID'].str.startswith('X', na=False)
song_simpson['SampleID'] = song_simpson['SampleID'].where(
    already_prefixed, 'X' + song_simpson['SampleID']
)
song_simpson.head()

Unnamed: 0,SampleID,simpson
0,X11212.AC673,0.973291
1,X11212.AC674,0.972535
2,X11212.AC679,0.97237
3,X11212.AC680,0.973233
4,X11212.AC683,0.975682


In [16]:
# Re-read the metadata, keeping only the sample ID and body-mass columns needed for the merge
song_metadata = (
    pd.read_csv(f'{GenPath}/Flo_data/Song_Metadata.txt', sep=' ')
    .loc[:, ['SampleID', 'ET.BodyMass.Value']]
)
song_metadata.head()

Unnamed: 0,SampleID,ET.BodyMass.Value
1,X11212.AC673,5862.46
2,X11212.AC674,5862.46
3,X11212.AC679,5862.46
4,X11212.AC680,5862.46
5,X11212.AC683,5862.46


In [17]:
# Attach body mass to each sample's Simpson value
# (pandas' default inner join: only SampleIDs present in both frames are kept)
song_diversity = song_simpson.merge(song_metadata, on='SampleID')
song_diversity.head()


Unnamed: 0,SampleID,simpson,ET.BodyMass.Value
0,X11212.AC673,0.973291,5862.46
1,X11212.AC674,0.972535,5862.46
2,X11212.AC679,0.97237,5862.46
3,X11212.AC680,0.973233,5862.46
4,X11212.AC683,0.975682,5862.46


In [18]:
# Convert body masses from grams to kilograms.
# Vectorized division on the whole column; no per-row lambda is needed.
song_diversity['body_mass_kg'] = song_diversity['ET.BodyMass.Value'] / 1000

In [19]:
# Preview the first five rows, now including body_mass_kg
song_diversity.head(n=5)

Unnamed: 0,SampleID,simpson,ET.BodyMass.Value,body_mass_kg
0,X11212.AC673,0.973291,5862.46,5.86246
1,X11212.AC674,0.972535,5862.46,5.86246
2,X11212.AC679,0.97237,5862.46,5.86246
3,X11212.AC680,0.973233,5862.46,5.86246
4,X11212.AC683,0.975682,5862.46,5.86246


### Groussin data wrangling

In [20]:
# Load the Simpson diversity values generated for the Groussin data
# (note: artifacts were unzipped in the terminal outside of this nb).
# The '/' after GenPath keeps this path consistent with the other GenPath + '/...' paths.
grous_simpson = (
    pd.read_csv(GenPath+'/Groussin_data/diversity/simpson/data/alpha-diversity.tsv', sep='\t')
    .rename(columns={'Unnamed: 0': 'SampleID'})
)
grous_simpson.head()

Unnamed: 0,SampleID,simpson
0,AfElphSD3,0.994633
1,Armadillo,0.991125
2,BaboonSTL,0.954364
3,BigHornSD,0.996792
4,BlackBr2,0.465413


In [21]:
# Build a SampleID -> body mass (kg) table: keep the two relevant columns, rename the
# mass column, then drop rows with missing values and exact duplicate rows
grous_masses = (
    pd.read_csv(f'{GenPath}/Groussin_data/Groussin_MammallianGutsMassTable.csv')
    .loc[:, ['SampleID', 'Mean_Mass(kg)']]
    .rename(columns={'Mean_Mass(kg)': 'body_mass_kg'})
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

grous_masses

Unnamed: 0,SampleID,body_mass_kg
0,AfElphSD4,4035.0
1,Armadillo,1.2
2,BaboonSTL,18.0
3,BigHornSD,67.13
4,BlackBr2,175.0
5,BlackLemur,2.04
6,BlackRhino1,986.575
7,BushDog1,5.5
8,Callimicos,0.557
9,Capybara,55.0


In [22]:
# Inner join on SampleID: only samples present in BOTH tables are kept.
# Report samples that have a Simpson value but no body-mass row, because the join would
# otherwise drop them silently (e.g. 'AfElphSD3' in grous_simpson vs 'AfElphSD4' in grous_masses).
grous_unmatched = sorted(set(grous_simpson['SampleID']) - set(grous_masses['SampleID']))
if grous_unmatched:
    print(f"{len(grous_unmatched)} Groussin sample(s) dropped (no body mass): {grous_unmatched}")
grous_diversity = pd.merge(grous_simpson, grous_masses, on='SampleID', how='inner')
grous_diversity

Unnamed: 0,SampleID,simpson,body_mass_kg
0,Armadillo,0.991125,1.2
1,BaboonSTL,0.954364,18.0
2,BigHornSD,0.996792,67.13
3,BlackBr2,0.465413,175.0
4,BlackLemur,0.870901,2.04
5,BlackRhino1,0.990151,986.575
6,BushDog1,0.961254,5.5
7,Capybara,0.994032,55.0
8,Chimp1,0.996899,61.065
9,Colobus,0.991692,10.28125


In [23]:
# Tag every row with its source study so the two datasets stay distinguishable once combined
song_diversity = song_diversity.assign(dataset='Song et al')
grous_diversity = grous_diversity.assign(dataset='Groussin et al')

In [24]:
# Stack both datasets using the shared columns only.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it both inputs
# keep their own 0-based indexes, so labels are duplicated (and later written to the CSV).
full_diversity = pd.concat(
    [grous_diversity, song_diversity[['SampleID', 'simpson', 'body_mass_kg', 'dataset']]],
    ignore_index=True,
)
full_diversity.head()

Unnamed: 0,SampleID,simpson,body_mass_kg,dataset
0,Armadillo,0.991125,1.2,Groussin et al
1,BaboonSTL,0.954364,18.0,Groussin et al
2,BigHornSD,0.996792,67.13,Groussin et al
3,BlackBr2,0.465413,175.0,Groussin et al
4,BlackLemur,0.870901,2.04,Groussin et al


### QIIME 2 reports the Simpson index as $1 - D$, but I need the inverse Simpson index $1/D$. Since $\text{QiimeSimp} = 1 - D$, it follows that $D = 1 - \text{QiimeSimp}$, so the inverse Simpson index is $\text{InvSimp} = 1/D = 1/(1 - \text{QiimeSimp})$

In [25]:
# Inverse Simpson: the 'simpson' column holds 1 - D, so 1/D = 1 / (1 - simpson).
# Computed column-wise (vectorized) rather than with a per-row lambda; a value of
# exactly 1.0 would yield inf here instead of raising ZeroDivisionError.
full_diversity['inv_simpson'] = 1 / (1 - full_diversity['simpson'])
full_diversity.head()

Unnamed: 0,SampleID,simpson,body_mass_kg,dataset,inv_simpson
0,Armadillo,0.991125,1.2,Groussin et al,112.679696
1,BaboonSTL,0.954364,18.0,Groussin et al,21.912337
2,BigHornSD,0.996792,67.13,Groussin et al,311.69895
3,BlackBr2,0.465413,175.0,Groussin et al,1.870604
4,BlackLemur,0.870901,2.04,Groussin et al,7.746001


In [26]:
# Number of distinct sample IDs across both datasets
full_diversity['SampleID'].nunique()

1403

In [27]:
# Write the combined diversity table (row index included) to the output folder
full_diversity.to_csv(f'{GenPath}/output/Song_Groussin_diversity.csv')