In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import rpy2.robjects as robjects

* Use `renv::restore()` to install packages recorded in the lockfile.

by .GlobalEnv when processing object ‘plot0’

by .GlobalEnv when processing object ‘p4’



In [2]:
metadata = pd.read_csv("diabimmune_metadata.csv")

In [3]:
metadata = metadata.transpose()

In [4]:
metadata.columns = metadata.iloc[0]
metadata.drop(["Unnamed: 0"], inplace=True)

In [5]:
metadata.head()

Unnamed: 0,subjectID,SampleID,age_at_collection,collection_month,delivery,gest_time,gender,country,Exclusive_breast_feeding,Breast_feeding_end,...,allergy_dog,allergy_birch,allergy_timothy,gid_wgs,mgx_reads,mgx_pool,mgx_reads_filtered,read_count_16S,sequencing_PDO_16S,gid_16s
5,E002338,3101193,304,10,vaginal,285,Female,FIN,False,False,...,False,False,False,G80541,28876314,Plate 7,28.568962,15870,PDO-4356,G75694
6,E002338,3107294,493,16,vaginal,285,Female,FIN,False,True,...,False,False,False,G80537,29615868,Plate 7,29.542272,13609,PDO-4356,G75855
7,E002338,3113022,852,28,vaginal,285,Female,FIN,False,True,...,False,False,False,G80322,30478184,Plate 8,30.41078,34620,PDO-4157,G73882
8,E002338,3107293,399,13,vaginal,285,Female,FIN,False,False,...,False,False,False,G80513,28503488,Plate 7,28.360762,7337,PDO-4356,G75849
15,E002338,3101190,212,7,vaginal,285,Female,FIN,False,False,...,False,False,False,,23928698,Diab Plate 9,23.824676,18931,PDO-4356,G75788


In [6]:
# map gid_16s to gid_wgs
# keys are 16S ids, values are mgx ids

id_mapping = pd.Series(metadata.gid_wgs.values,index=metadata.gid_16s).to_dict()

## Importing metagenomics data

### Genus 

In [7]:
# read in mgx data
mgx_genus = pd.read_csv("diabimmune_mgx_genus.csv")
mgx_genus.rename(columns={'taxname': 'taxa'}, inplace=True)

In [8]:
# mgx_genus = mgx[mgx['taxa'].str.contains("\|g__")] # keep genera
# mgx_genus = mgx_genus[~mgx_genus['taxa'].str.contains("\|s__")] # keep species
# mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|g__").str[-1]
# mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|s__").str[0]#

In [9]:
# remove taxa that are unclassified or have no name
# "_unclassified"
# "_noname"
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("_unclassified")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("_noname")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("virus")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("Candidatus")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("candidate")]

In [10]:
mgx_genus = mgx_genus.groupby(['taxa']).sum().reset_index() 

In [11]:
mgx_genus.head()

Unnamed: 0,taxa,G69146,G69147,G69148,G69149,G69150,G69151,G69152,G69153,G69154,...,G80617,G80619,G80620,G80621,G80623,G80624,G86980,G86986,G90143,G90147
0,Absiella,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Acidaminococcus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acinetobacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Actinobaculum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001091
4,Actinomyces,0.0,0.0,0.0,0.0,0.0,0.0,4.5e-05,0.0,8.3e-05,...,0.0,0.0,0.0,0.0,0.0,0.000766,0.0,0.0,1.1e-05,5.1e-05


### Families

In [12]:
mgx_family = pd.read_csv("diabimmune_mgx_family.csv")

In [13]:
mgx_family.head()

Unnamed: 0,taxname,G69146,G69147,G69148,G69149,G69150,G69151,G69152,G69153,G69154,...,G80617,G80619,G80620,G80621,G80623,G80624,G86980,G86986,G90143,G90147
0,Enterobacteriaceae,0.46608,0.32781,0.012988,0.045543,0.020078,0.042352,0.055992,0.0,0.003681,...,0.196305,0.021126,0.037293,0.75199,4e-05,0.001803,0.498086,0.0,0.006707,0.023033
1,Akkermansiaceae,0.242453,0.00563,0.0,0.0,0.0,0.0,0.079431,0.052597,0.003702,...,0.0,0.0,0.0,0.0,2.2e-05,0.883796,0.186946,0.0,0.002018,0.573473
2,Bacteroidaceae,0.066054,0.0,0.097642,0.22818,0.0,0.496944,0.0,0.586511,0.498754,...,0.615055,0.000142,0.001016,0.017285,0.53886,0.054789,0.075458,0.138963,0.829793,2.7e-05
3,Lachnospiraceae,0.060958,0.343531,0.457044,0.130974,0.698545,0.0,0.049892,0.08194,0.009887,...,0.0,0.062178,0.822903,0.001328,0.152402,0.011259,0.074158,0.031986,0.093438,0.000593
4,Veillonellaceae,0.058309,0.00877,0.015439,0.02237,0.002777,0.230625,0.195566,0.00018,0.019004,...,0.120381,0.089666,0.060263,0.151497,0.002519,1e-05,0.03285,0.0,0.00598,0.342829


### Looking at species in metagenomic data

In [14]:
# mgx_species = mgx[mgx['taxa'].str.contains("\|s__")] # keep families genera
# mgx_species = mgx_species[~mgx_species['taxa'].str.contains("\|t__")] # remove genera

# mgx_species["taxa"] = mgx_species['taxa'].str.split("\|s__").str[-1]

In [15]:
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("_unclassified")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("_noname")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("virus")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("Candidatus")]
# mgx_species = mgx_species[~mgx_species.taxa.str.contains("candidate")]

In [16]:
# mgx_species = mgx_species.groupby(['taxa']).sum().reset_index() 

In [17]:
mgx_species = pd.read_csv("diabimmune_mgx_species.csv")
# mgx_genus.rename(columns={'taxname': 'taxa'}, inplace=True)

In [18]:
mgx_species.head()

Unnamed: 0,taxname,G69146,G69147,G69148,G69149,G69150,G69151,G69152,G69153,G69154,...,G80617,G80619,G80620,G80621,G80623,G80624,G86980,G86986,G90143,G90147
0,Escherichia_coli,0.284918,0.32781,0.004385,0.045543,0.020078,0.042352,0.050934,0.0,0.003681,...,0.196305,5.6e-05,0.036164,0.617818,4e-05,0.000136,0.319663,0.0,0.006707,0.0
1,Akkermansia_muciniphila,0.242453,0.00563,0.0,0.0,0.0,0.0,0.079431,0.052597,0.003702,...,0.0,0.0,0.0,0.0,2.2e-05,0.883796,0.186946,0.0,0.002018,0.573473
2,Klebsiella_pneumoniae,0.133182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001958,0.0,0.0,0.132023,0.0,0.0,0.015799
3,Dialister_invisus,0.057785,0.0,0.015439,7.8e-05,0.002291,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1e-05,0.032804,0.0,0.001019,0.0
4,Bifidobacterium_longum,0.041493,0.0,0.01271,0.11927,0.005591,0.171838,0.186589,0.009908,0.050695,...,0.0,0.011101,0.0002,0.000713,0.167309,0.006347,0.05674,0.344709,0.0,0.02162


In [19]:
len(mgx_species)

511

## Importing amplicon data

### Genus

In [20]:
# read in 16S data

amp_genus = pd.read_csv("karalia_dada2_genera.csv")
amp_genus.rename(columns={'genus': 'taxa'}, inplace=True)

In [21]:
amp_genus.head()

Unnamed: 0,taxa,G63225,G63227,G63228,G63229,G63230,G63231,G63232,G63234,G63236,...,G76485,G76486,G76487,G76488,G76489,G76490,G76491,G76492,G76493,G76494
0,Bifidobacterium,0.004997,0.010805,0.007308,0.000231,0.003715,0.007665,0.104364,0.027547,0.014981,...,0.230535,0.041893,0.248718,0.003288,0.235009,0.237299,0.0,0.860111,0.6875,0.012104
1,Bacteroides,0.436124,0.343189,0.070793,0.059532,0.304362,0.183758,0.0,0.01948,0.15032,...,0.308592,0.151767,0.36005,0.040434,0.324846,0.480689,0.0,0.0,0.0,0.000242
2,UNCLASSIFIED,0.146095,0.031704,0.195331,0.224378,0.147593,0.510983,0.054241,0.201836,0.310966,...,0.322257,0.640546,0.136235,0.693582,0.13944,0.240539,0.47373,0.048752,0.031179,0.764585
3,Faecalibacterium,0.036886,0.016238,0.113969,0.035317,0.230022,0.039644,0.000433,0.227238,0.007121,...,0.0,0.0,0.025008,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Prevotella,0.0,0.0,0.07075,0.0,0.0,9.2e-05,0.00156,0.0,0.024074,...,0.000688,0.003179,0.0,0.0,0.0,0.005573,0.000861,0.0,0.0,0.0


In [22]:
# remove taxa that are unclassified or have no name
# "_unclassified"
# "_noname"
amp_genus = amp_genus[~amp_genus.taxa.str.contains("_unclassified")]
amp_genus = amp_genus[~amp_genus.taxa.str.contains("_noname")]
amp_genus = amp_genus[~amp_genus.taxa.str.contains("virus")]
amp_genus = amp_genus[~amp_genus.taxa.str.contains("Candidatus")]
amp_genus = amp_genus[~amp_genus.taxa.str.contains(r'[0-9]')]
amp_genus = amp_genus[~amp_genus.taxa.str.contains("group")]
amp_genus['taxa'].replace('', np.nan, inplace=True)
amp_genus.dropna(subset=['taxa'], inplace=True)

In [23]:
amp_genus.taxa = amp_genus.taxa = amp_genus.taxa.str.strip('[]')

In [24]:
# remove digits
amp_genus.taxa = amp_genus[~amp_genus.taxa.str.contains(r'\d')]

In [25]:
amp_genus = amp_genus.groupby(['taxa']).sum().reset_index() 

In [26]:
amp_genus = pd.DataFrame(amp_genus).rename(columns=id_mapping)

In [27]:
amp_genus = amp_genus.loc[:, amp_genus.columns.notnull()]

In [28]:
amp_genus.head()

Unnamed: 0,taxa,G78487,G78488,G78489,G78490,G78492,G78493,G78494,G78495,G78496,...,G80605,G80594,G80611,G80617,G80623,G80556,G80563,G80570,G80576,G80582
0,Abiotrophia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Acetanaerobacterium,0.0,0.000187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acetobacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Achromobacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Acidaminococcus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
amp_genus.shape

(302, 842)

In [30]:
mgx_genus.shape

(198, 801)

### amplicon family

In [31]:
# amp_family = amp[amp['taxa'].str.contains("\|f__")] # keep families genera
# amp_family = amp_family[~amp_family['taxa'].str.contains("\|g__")] # remove genera

# amp_family["taxa"] = amp_family['taxa'].str.split("\|f__").str[-1]

In [32]:
# amp_family = amp_family[~amp_family.taxa.str.contains("_unclassified")]
# amp_family = amp_family[~amp_family.taxa.str.contains("_noname")]
# amp_family = amp_family[~amp_family.taxa.str.contains("virus")]
# amp_family = amp_family[~amp_family.taxa.str.contains("Candidatus")]
# amp_family = amp_family[~amp_family.taxa.str.contains(r'[0-9]')]
# amp_family = amp_family[~amp_family.taxa.str.contains("group")]
# amp_family['taxa'].replace('', np.nan, inplace=True)
# amp_family.dropna(subset=['taxa'], inplace=True)

In [33]:
# amp_family = amp_family.groupby(['taxa']).sum().reset_index() 

In [34]:
# amp_family.sort_values(by = "taxa", ascending=True, inplace=True)

In [35]:
amp_family = pd.read_csv("karalia_dada2_family.csv")
amp_family.rename(columns={'family': 'taxa'}, inplace=True)

In [36]:
amp_family.head()

Unnamed: 0,taxa,G63225,G63227,G63228,G63229,G63230,G63231,G63232,G63234,G63236,...,G76485,G76486,G76487,G76488,G76489,G76490,G76491,G76492,G76493,G76494
0,Bifidobacteriaceae,0.004997,0.010805,0.007308,0.000231,0.003715,0.007665,0.104364,0.027547,0.014981,...,0.230535,0.041893,0.248718,0.003288,0.235009,0.237299,0.0,0.860111,0.6875,0.012104
1,Bacteroidaceae,0.436124,0.343189,0.070793,0.059532,0.304362,0.183758,0.0,0.01948,0.15032,...,0.308592,0.151767,0.36005,0.040434,0.324846,0.480689,0.0,0.0,0.0,0.000242
2,Enterobacteriaceae,0.001728,0.000386,0.0,0.200328,0.000362,0.020244,0.00104,0.000343,0.284879,...,0.320881,0.587339,0.096265,0.590793,0.121032,0.237558,0.431094,0.048752,0.031179,0.736747
3,Ruminococcaceae,0.137221,0.04334,0.217802,0.036878,0.311303,0.081439,0.002709,0.293401,0.014379,...,0.0,0.002151,0.113843,0.0,0.000772,0.0,0.0,0.0,0.0,0.0
4,Prevotellaceae,5.8e-05,0.006946,0.07075,0.0,0.0,0.014278,0.00156,0.0,0.024074,...,0.000688,0.003179,0.0,0.0,0.0,0.005573,0.000861,0.0,0.0,0.0


In [37]:
# ratio of unclassified genera for 16S profiling
# sum(amplicon.taxa.str.contains("_unclassified| unidentified| uncultured| group"))/amplicon.shape[0]

In [38]:
# amplicon.to_csv('16S_abundance.csv')

### amplicon species

In [39]:
amp_species = pd.read_csv("karalia_dada2_species.csv")
amp_species.rename(columns={'species': 'taxa'}, inplace=True)

In [40]:
amp_species["taxa"] = amp_species['taxa'].str.replace('s__', '')

In [41]:
amp_species.head()

Unnamed: 0,taxa,G63225,G63227,G63228,G63229,G63230,G63231,G63232,G63234,G63236,...,G76485,G76486,G76487,G76488,G76489,G76490,G76491,G76492,G76493,G76494
0,UNCLASSIFIED,0.958397,0.413988,0.716716,0.968321,0.838273,0.950505,0.680969,0.944478,0.696196,...,0.929316,0.910137,0.89683,0.905858,0.7403,0.98056,0.830749,0.993131,0.954571,1
1,Bifidobacterium_bifidum,0.0,0.0,0.001202,0.0,0.0,0.000312,0.0,0.0,0.0,...,0.049548,0.015803,0.0,0.0,0.0,0.011534,0.0,0.006869,0.045429,0
2,Akkermansia_muciniphila,0.0,0.519963,0.205563,0.0,0.0,0.0,0.0,0.00635,0.272486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,Ruminococcus_gnavus,0.000701,0.001751,0.0,0.012652,0.001983,0.0,0.309323,0.000644,0.003026,...,0.0,0.070881,0.0,0.0,0.017306,0.0,0.0,0.0,0.0,0
4,Parabacteroides_merdae,0.0,0.010123,0.011737,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.055457,0.0,0.241292,0.001944,0.0,0.0,0.0,0


### looking for the intersection of taxonomy at genus level
#### calculate number of taxa in both vs. either one

In [42]:
# calculate number of taxa in both vs. either one

In [43]:
amplicon_genera_list = set(amp_genus["taxa"])
mgx_genera_list = set(mgx_genus["taxa"])

In [44]:
intersection_genera = amplicon_genera_list.intersection(mgx_genera_list)
union_genera = amplicon_genera_list.union(mgx_genera_list)

In [45]:
len(intersection_genera)
len(amplicon_genera_list)-len(intersection_genera)
len(mgx_genera_list)-len(intersection_genera)

151

151

47

In [46]:
amplicon_genera = amplicon_genera_list.difference(intersection_genera)
mgx_genera = mgx_genera_list.difference(intersection_genera)

In [47]:
len(amplicon_genera)
len(mgx_genera)

151

47

### looking for the intersection of taxonomy at family level

In [48]:
amplicon_family_list = set(amp_family["taxa"])
mgx_family_list = set(mgx_family["taxname"])

intersection_family = amplicon_family_list.intersection(mgx_family_list)
union_family = amplicon_family_list.union(mgx_family_list)

len(intersection_family)
len(amplicon_family_list)-len(intersection_family)
len(mgx_family_list)-len(intersection_family)

63

68

9

In [49]:
amplicon_family = amplicon_family_list.difference(intersection_family)
mgx_family = mgx_family_list.difference(intersection_family)

len(amplicon_family)
len(mgx_family)

68

9

### looking for the intersection of taxonomy at species level

In [50]:
mgx_species.head()

Unnamed: 0,taxname,G69146,G69147,G69148,G69149,G69150,G69151,G69152,G69153,G69154,...,G80617,G80619,G80620,G80621,G80623,G80624,G86980,G86986,G90143,G90147
0,Escherichia_coli,0.284918,0.32781,0.004385,0.045543,0.020078,0.042352,0.050934,0.0,0.003681,...,0.196305,5.6e-05,0.036164,0.617818,4e-05,0.000136,0.319663,0.0,0.006707,0.0
1,Akkermansia_muciniphila,0.242453,0.00563,0.0,0.0,0.0,0.0,0.079431,0.052597,0.003702,...,0.0,0.0,0.0,0.0,2.2e-05,0.883796,0.186946,0.0,0.002018,0.573473
2,Klebsiella_pneumoniae,0.133182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001958,0.0,0.0,0.132023,0.0,0.0,0.015799
3,Dialister_invisus,0.057785,0.0,0.015439,7.8e-05,0.002291,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1e-05,0.032804,0.0,0.001019,0.0
4,Bifidobacterium_longum,0.041493,0.0,0.01271,0.11927,0.005591,0.171838,0.186589,0.009908,0.050695,...,0.0,0.011101,0.0002,0.000713,0.167309,0.006347,0.05674,0.344709,0.0,0.02162


In [51]:
amp_species.head()

Unnamed: 0,taxa,G63225,G63227,G63228,G63229,G63230,G63231,G63232,G63234,G63236,...,G76485,G76486,G76487,G76488,G76489,G76490,G76491,G76492,G76493,G76494
0,UNCLASSIFIED,0.958397,0.413988,0.716716,0.968321,0.838273,0.950505,0.680969,0.944478,0.696196,...,0.929316,0.910137,0.89683,0.905858,0.7403,0.98056,0.830749,0.993131,0.954571,1
1,Bifidobacterium_bifidum,0.0,0.0,0.001202,0.0,0.0,0.000312,0.0,0.0,0.0,...,0.049548,0.015803,0.0,0.0,0.0,0.011534,0.0,0.006869,0.045429,0
2,Akkermansia_muciniphila,0.0,0.519963,0.205563,0.0,0.0,0.0,0.0,0.00635,0.272486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,Ruminococcus_gnavus,0.000701,0.001751,0.0,0.012652,0.001983,0.0,0.309323,0.000644,0.003026,...,0.0,0.070881,0.0,0.0,0.017306,0.0,0.0,0.0,0.0,0
4,Parabacteroides_merdae,0.0,0.010123,0.011737,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.055457,0.0,0.241292,0.001944,0.0,0.0,0.0,0


In [52]:
amplicon_species_list = set(amp_species["taxa"])
mgx_species_list = set(mgx_species["taxname"])

intersection_species = amplicon_species_list.intersection(mgx_species_list)
union_species = amplicon_species_list.union(mgx_species_list)

len(intersection_species)
len(amplicon_species_list)-len(intersection_species)
len(mgx_species_list)-len(intersection_species)

127

111

384

In [53]:
amplicon_species = amplicon_species_list.difference(intersection_species)
mgx_species = mgx_species_list.difference(intersection_species)

len(amplicon_species)
len(mgx_species)

111

384

### creating long-form dataframe with data from both sequencing methods

In [54]:
mgx_genus.head()

Unnamed: 0,taxa,G69146,G69147,G69148,G69149,G69150,G69151,G69152,G69153,G69154,...,G80617,G80619,G80620,G80621,G80623,G80624,G86980,G86986,G90143,G90147
0,Absiella,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Acidaminococcus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acinetobacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Actinobaculum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001091
4,Actinomyces,0.0,0.0,0.0,0.0,0.0,0.0,4.5e-05,0.0,8.3e-05,...,0.0,0.0,0.0,0.0,0.0,0.000766,0.0,0.0,1.1e-05,5.1e-05


In [55]:
amp_genus.head()

Unnamed: 0,taxa,G78487,G78488,G78489,G78490,G78492,G78493,G78494,G78495,G78496,...,G80605,G80594,G80611,G80617,G80623,G80556,G80563,G80570,G80576,G80582
0,Abiotrophia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Acetanaerobacterium,0.0,0.000187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acetobacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Achromobacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Acidaminococcus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# only keeping samples we have both mgx and 16S profiles for
samples_intersect = set(mgx_genus.columns.values).intersection(set(amp_genus.columns.values))

In [57]:
# remove mothers
children_intersect = [id for id in samples_intersect if not id.startswith("M")]

In [58]:
len(amp_genus.columns.values)

842

In [59]:
mgx_genus.columns.values

array(['taxa', 'G69146', 'G69147', 'G69148', 'G69149', 'G69150', 'G69151',
       'G69152', 'G69153', 'G69154', 'G69155', 'G69156', 'G69157',
       'G69158', 'G69159', 'G69160', 'G69161', 'G69162', 'G69163',
       'G69164', 'G69165', 'G69166', 'G69167', 'G69168', 'G69169',
       'G69170', 'G69171', 'G69172', 'G69173', 'G69174', 'G69175',
       'G69176', 'G69177', 'G69178', 'G69179', 'G69180', 'G69181',
       'G69182', 'G69183', 'G69184', 'G69185', 'G69186', 'G69187',
       'G69188', 'G69189', 'G69190', 'G69191', 'G69192', 'G69193',
       'G69194', 'G69195', 'G69196', 'G69197', 'G69198', 'G69199',
       'G69201', 'G69202', 'G69203', 'G69204', 'G69205', 'G69206',
       'G69207', 'G69208', 'G69209', 'G69210', 'G69211', 'G69212',
       'G69213', 'G69214', 'G69215', 'G69216', 'G69217', 'G69218',
       'G69219', 'G69221', 'G69222', 'G69223', 'G69224', 'G69225',
       'G69226', 'G69227', 'G69228', 'G69229', 'G69230', 'G69231',
       'G69232', 'G69233', 'G69234', 'G69235', 'G69236

In [60]:
samples_intersect

{'G69146',
 'G69147',
 'G69148',
 'G69149',
 'G69150',
 'G69152',
 'G69153',
 'G69154',
 'G69155',
 'G69156',
 'G69157',
 'G69158',
 'G69159',
 'G69160',
 'G69161',
 'G69162',
 'G69163',
 'G69164',
 'G69166',
 'G69167',
 'G69170',
 'G69171',
 'G69172',
 'G69173',
 'G69174',
 'G69175',
 'G69176',
 'G69177',
 'G69178',
 'G69179',
 'G69180',
 'G69181',
 'G69182',
 'G69183',
 'G69184',
 'G69185',
 'G69186',
 'G69187',
 'G69188',
 'G69189',
 'G69190',
 'G69191',
 'G69192',
 'G69193',
 'G69194',
 'G69195',
 'G69196',
 'G69197',
 'G69198',
 'G69199',
 'G69201',
 'G69202',
 'G69203',
 'G69204',
 'G69205',
 'G69206',
 'G69207',
 'G69208',
 'G69209',
 'G69210',
 'G69211',
 'G69212',
 'G69213',
 'G69214',
 'G69216',
 'G69217',
 'G69218',
 'G69219',
 'G69221',
 'G69222',
 'G69223',
 'G69224',
 'G69225',
 'G69226',
 'G69227',
 'G69228',
 'G69229',
 'G69230',
 'G69231',
 'G69232',
 'G69233',
 'G69234',
 'G69235',
 'G69236',
 'G69238',
 'G78487',
 'G78488',
 'G78489',
 'G78490',
 'G78492',
 'G78493',

In [61]:
len(samples_intersect)

776

In [93]:
metadata.head()

Unnamed: 0,subjectID,SampleID,age_at_collection,collection_month,delivery,gest_time,gender,country,Exclusive_breast_feeding,Breast_feeding_end,...,allergy_dog,allergy_birch,allergy_timothy,gid_wgs,mgx_reads,mgx_pool,mgx_reads_filtered,read_count_16S,sequencing_PDO_16S,gid_16s
5,E002338,3101193,304,10,vaginal,285,Female,FIN,False,False,...,False,False,False,G80541,28876314,Plate 7,28.568962,15870,PDO-4356,G75694
6,E002338,3107294,493,16,vaginal,285,Female,FIN,False,True,...,False,False,False,G80537,29615868,Plate 7,29.542272,13609,PDO-4356,G75855
7,E002338,3113022,852,28,vaginal,285,Female,FIN,False,True,...,False,False,False,G80322,30478184,Plate 8,30.41078,34620,PDO-4157,G73882
8,E002338,3107293,399,13,vaginal,285,Female,FIN,False,False,...,False,False,False,G80513,28503488,Plate 7,28.360762,7337,PDO-4356,G75849
15,E002338,3101190,212,7,vaginal,285,Female,FIN,False,False,...,False,False,False,,23928698,Diab Plate 9,23.824676,18931,PDO-4356,G75788


In [94]:
metadata_filtered = metadata[metadata['gid_wgs'].isin(samples_intersect)]

In [100]:
metadata_filtered["age_months"] = pd.to_numeric(metadata_filtered["age_at_collection"])/30

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [103]:
def apply_dev_stage(row):
    if row < 15:
        return "less than 15 months"
    elif row <= 30:
        return "15 to 30 months"
    else:
        return "older than 30 months"

In [105]:
metadata_filtered['dev_stage'] = metadata_filtered["age_months"].apply(apply_dev_stage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [106]:
metadata_filtered.head()

Unnamed: 0,subjectID,SampleID,age_at_collection,collection_month,delivery,gest_time,gender,country,Exclusive_breast_feeding,Breast_feeding_end,...,allergy_timothy,gid_wgs,mgx_reads,mgx_pool,mgx_reads_filtered,read_count_16S,sequencing_PDO_16S,gid_16s,age_months,dev_stage
5,E002338,3101193,304,10,vaginal,285,Female,FIN,False,False,...,False,G80541,28876314,Plate 7,28.568962,15870,PDO-4356,G75694,10.133333,less than 15 months
6,E002338,3107294,493,16,vaginal,285,Female,FIN,False,True,...,False,G80537,29615868,Plate 7,29.542272,13609,PDO-4356,G75855,16.433333,15 to 30 months
7,E002338,3113022,852,28,vaginal,285,Female,FIN,False,True,...,False,G80322,30478184,Plate 8,30.41078,34620,PDO-4157,G73882,28.4,15 to 30 months
8,E002338,3107293,399,13,vaginal,285,Female,FIN,False,False,...,False,G80513,28503488,Plate 7,28.360762,7337,PDO-4356,G75849,13.3,less than 15 months
19,E002338,3000144,58,2,vaginal,285,Female,FIN,False,False,...,False,G80490,18044614,Plate 7,17.982269,9209,PDO-4356,G75872,1.933333,less than 15 months


In [107]:
metadata_filtered.groupby('dev_stage').count()

Unnamed: 0,subjectID,SampleID,age_at_collection,collection_month,delivery,gest_time,gender,country,Exclusive_breast_feeding,Breast_feeding_end,...,allergy_birch,allergy_timothy,gid_wgs,mgx_reads,mgx_pool,mgx_reads_filtered,read_count_16S,sequencing_PDO_16S,gid_16s,age_months
dev_stage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15 to 30 months,340,340,340,340,334,334,340,340,340,340,...,314,314,340,340,340,340,340,340,340,340
less than 15 months,389,389,389,389,385,385,389,389,389,389,...,364,364,389,389,389,385,389,389,389,389
older than 30 months,46,46,46,46,45,45,46,46,46,46,...,44,44,46,46,46,46,46,46,46,46


In [63]:
mgx = mgx_genus[children_intersect]
amplicon = amp_genus[children_intersect]

In [64]:
# confirm that we now have the same samples
(amp_genus.columns.values) == (mgx_genus.columns.values)

  


False

In [65]:
amplicon_melt = pd.melt(amplicon, id_vars=["taxa"], var_name = "sampleid", value_name = "amplicon_abund")

In [66]:
mgx_melt = pd.melt(mgx, id_vars=["taxa"], var_name = "sampleid", value_name = "mgx_abund")

In [67]:
merged_taxa = pd.merge(amplicon_melt, mgx_melt, on = ["sampleid", "taxa"], how = "outer")

In [68]:
merged_taxa.to_csv('taxa_abundance_comparison.csv')

In [69]:
merged_taxa["abs_diff"] = abs(merged_taxa["amplicon_abund"] - merged_taxa["mgx_abund"])
merged_taxa["tot_diff"] = (merged_taxa["amplicon_abund"] - merged_taxa["mgx_abund"])

In [70]:
merged_taxa.sample(10)

Unnamed: 0,taxa,sampleid,amplicon_abund,mgx_abund,abs_diff,tot_diff
141285,Rothia,G78638,0.000195,0.0,0.000195,0.000195
235159,Leclercia,G80309,,0.0,,
166209,Enterorhabdus,G78511,0.0,0.0,0.0,0.0
31158,Brevibacterium,G80267,0.0,,,
74615,Anaerocolumna,G78611,0.0,0.0,0.0,0.0
119096,Enterococcus,G78914,0.0,0.0,0.0,0.0
216400,Libanicoccus,G78796,0.0,,,
92270,Lactobacillus,G80591,0.0,0.0,0.0,0.0
257983,Bavariicoccus,G80296,,0.0,,
12003,Phascolarctobacterium,G80557,0.0,0.0,0.0,0.0


In [71]:
merged_taxa.fillna(0)

Unnamed: 0,taxa,sampleid,amplicon_abund,mgx_abund,abs_diff,tot_diff
0,Abiotrophia,G69213,0.000000,0.000000,0.000000,0.000000
1,Acetanaerobacterium,G69213,0.000000,0.000000,0.000000,0.000000
2,Acetobacter,G69213,0.000000,0.000000,0.000000,0.000000
3,Achromobacter,G69213,0.000000,0.000000,0.000000,0.000000
4,Acidaminococcus,G69213,0.000000,0.000000,0.000000,0.000000
5,Acinetobacter,G69213,0.000035,0.000000,0.000035,0.000035
6,Actinobacillus,G69213,0.000000,0.000000,0.000000,0.000000
7,Actinomyces,G69213,0.000000,0.000000,0.000000,0.000000
8,Actinotignum,G69213,0.000000,0.000000,0.000000,0.000000
9,Adlercreutzia,G69213,0.000000,0.000017,0.000017,-0.000017


In [72]:
amplicon_avg_abund = merged_taxa.groupby("taxa")["amplicon_abund"].mean()
mgx_avg_abund = merged_taxa.groupby("taxa")["mgx_abund"].mean()
taxa_list = sorted(set(merged_taxa["taxa"]))

In [73]:
mean_taxa_abund = pd.DataFrame(
    (zip(taxa_list, amplicon_avg_abund, mgx_avg_abund)),  
    columns = ['taxa','amp_avg_abund', 'mgx_avg_abund'])

In [74]:
mean_taxa_abund["abs_diff"] = abs(mean_taxa_abund["amp_avg_abund"] - mean_taxa_abund["mgx_avg_abund"])
mean_taxa_abund["total_diff"] = mean_taxa_abund["amp_avg_abund"] - mean_taxa_abund["mgx_avg_abund"]

In [75]:
mean_taxa_abund.sort_values("abs_diff", axis = 0, ascending = True, 
                 inplace = True, na_position ='last')

In [76]:
mean_taxa_abund.sample(10)

Unnamed: 0,taxa,amp_avg_abund,mgx_avg_abund,abs_diff,total_diff
146,Fretibacterium,3.6e-08,2.458065e-07,2.098065e-07,-2.098065e-07
194,Legionella,0.0,,,
131,Erysipelothrix,,1.566645e-05,,
284,Rhodococcus,1.48129e-07,,,
105,Dechloromonas,0.0,,,
120,Eikenella,3.213077e-06,4.806452e-07,2.732432e-06,2.732432e-06
23,Allisonella,0.0001130362,2.307484e-05,8.996133e-05,8.996133e-05
208,Mesorhizobium,1.03871e-07,,,
274,Psychrobacillus,0.0,,,
67,Buttiauxella,,5.984516e-07,,


In [77]:
mean_taxa_abund.to_csv('taxa_difference.csv')

## making giant dataframe of abundances

In [78]:
amp_trans = amplicon.set_index("taxa").transpose()

In [79]:
amp_trans.reset_index(level=0, inplace=True)

In [80]:
amp_trans.rename(columns = {'index':'sampleid'}, inplace = True) 

In [81]:
amp_trans["uid"] = amp_trans["sampleid"].astype(str)+'-amp'

In [82]:
amp_trans["method"] = "amp"

In [83]:
amp_trans.head()

taxa,sampleid,Abiotrophia,Acetanaerobacterium,Acetobacter,Achromobacter,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Actinotignum,...,Veillonellaceae,Verticiella,Victivallaceae,Victivallis,Vulcaniibacterium,Weissella,Xanthomonas,Zoogloea,uid,method
0,G69213,0.0,0.0,0.0,0.0,0.0,3.5e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G69213-amp,amp
1,G80594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G80594-amp,amp
2,G78901,0.0,0.0,0.0,0.0,0.0,0.000338,0.0,0.000879,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G78901-amp,amp
3,G80543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G80543-amp,amp
4,G80417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001079,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G80417-amp,amp


In [84]:
mgx_trans = mgx.set_index("taxa").transpose()

In [85]:
mgx_trans.reset_index(level=0, inplace=True)

In [86]:
mgx_trans.rename(columns = {'index':'sampleid'}, inplace = True)

In [87]:
mgx_trans["uid"] = mgx_trans["sampleid"].astype(str)+'-mgx'

In [88]:
mgx_trans["method"] = "mgx"

In [89]:
mgx_trans.head()

taxa,sampleid,Absiella,Acidaminococcus,Acinetobacter,Actinobaculum,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,Aeromonas,...,Turicimonas,Tyzzerella,UNCLASSIFIED,Varibaculum,Veillonella,Victivallis,Weissella,Yersinia,uid,method
0,G69213,0.0,0.0,0.0,0.0,0.0,0.0,1.7e-05,0.0,0.0,...,0.0,0.0,0.012562,0.0,0.0,0.0,0.0,0.0,G69213-mgx,mgx
1,G80594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000101,0.0,...,0.0,0.0,0.0,0.0,0.010218,0.0,0.0,0.0,G80594-mgx,mgx
2,G78901,0.0,0.0,0.0,0.0,0.000284,0.0,0.0,0.00027,0.0,...,0.0,0.0,0.0,0.0,0.000364,0.0,0.0,0.0,G78901-mgx,mgx
3,G80543,0.0,0.0,0.001198,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.007696,0.0,0.0,0.0,G80543-mgx,mgx
4,G80417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,G80417-mgx,mgx


In [90]:
concat_df = pd.concat([mgx_trans,amp_trans], sort=True).reset_index(drop = True)

In [91]:
len(concat_df)

1550

In [92]:
age = pd.read_csv("~/Documents/thesis/analysis/metadatawide.csv")

FileNotFoundError: [Errno 2] File b'/Users/danielle/Documents/thesis/analysis/metadatawide.csv' does not exist: b'/Users/danielle/Documents/thesis/analysis/metadatawide.csv'

In [None]:
age = age[['sample','childAgeMonths']]

In [None]:
age["sample"] = age["sample"].str.replace("_",'-')
age.rename(columns = {'sample':'sampleid'}, inplace = True) 

In [None]:
# make age dictionary
agedict = {str(s): {} for s in age["sampleid"]}
for index, row in age.iterrows():
    age_months = row["childAgeMonths"]
    agedict[row["sampleid"]]= age_months

In [None]:
concat_df["AgeMonths"]= concat_df["sampleid"].map(agedict)

In [None]:
cols_to_order = ['uid', 'sampleid',"method", "AgeMonths"]
new_columns = cols_to_order + (concat_df.columns.drop(cols_to_order).tolist())
concat_df = concat_df[new_columns]

In [None]:
concat_df.sample(15)

In [None]:
concat_df.to_csv('transposed_mgxamp_df.csv')

In [None]:
pwd()