In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt

import os
import glob
import re

In [3]:
# read in mgx data
original = pd.read_csv("diabimmune_mgx_genus.csv"\)
original.rename(columns={'ID': 'taxa'}, inplace=True)

In [4]:
mgx_genus = original[original['taxa'].str.contains("\|g__")] # keep genera
mgx_genus = mgx_genus[~mgx_genus['taxa'].str.contains("\|s__")] # keep species
mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|g__").str[-1]
mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|s__").str[0]#

KeyError: 'taxa'

In [4]:
# remove taxa that are unclassified or have no name
# "_unclassified"
# "_noname"
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("_unclassified")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("_noname")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("virus")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("Candidatus")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("candidate")]

In [5]:
mgx_genus.head()

Unnamed: 0,taxa,G69146,G69147,G69148,G69149,G69150,G69152,G69153,G69154,G69155,...,G80612,G80613,G80614,G80615,G80616,G80619,G80620,G80621,G80623,G80624
5,Methanobrevibacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,Granulicella,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,Actinobaculum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,Actinomyces,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03404,0.0,...,0.0,0.00114,0.14391,0.0,0.0,0.0,0.0,0.0,0.0,0.01833
52,Varibaculum,0.0,0.0,0.0,0.0,0.0,0.00198,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
mgx_genus = mgx_genus.groupby(['taxa']).sum().T

In [7]:
mgx_genus = mgx_genus.reset_index()

In [8]:
mgx_genus.rename(columns={"index": "sampleid"}, inplace=True)

In [9]:
mgx_genus.head()

taxa,sampleid,Abiotrophia,Acidaminococcus,Acinetobacter,Actinobacillus,Actinobaculum,Actinomyces,Adlercreutzia,Aeromonas,Aggregatibacter,...,Subdoligranulum,Succinatimonas,Sutterella,Tannerella,Turicibacter,Varibaculum,Variovorax,Veillonella,Weissella,Yersinia
0,G69146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.26124,0.0,0.0,0.0,0.0,0.0,0.0,0.07238,0.0,0.0
1,G69147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.44759,0.0,0.0,0.0,0.0,0.0,0.0,0.98064,0.0,0.0
2,G69148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.51395,0.0,0.0,0.0,0.0,0.0,0.0,0.07409,0.0,0.0
3,G69149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00741,0.0,0.0
4,G69150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.4115,0.0,0.0,0.0,0.0,0.0,0.0,0.07214,0.0,0.0


In [10]:
mgx_genus["uid"] = mgx_genus["sampleid"] + "_original"

In [11]:
# read in subsampled data
subsample = pd.read_csv("merged_metaphlan.tsv", sep="\t", 
                        skiprows=[0])
subsample.rename(columns={'clade_name': 'taxa'}, inplace=True) 

In [12]:
subsample.drop(['NCBI_tax_id'], axis=1, inplace=True)

In [13]:
subsample.head()

Unnamed: 0,taxa,G90147_750k_profile,G90147_500k_profile,G90147_250k_profile,G90147_100k_profile,G90147_1000k_profile,G90147_10000k_profile,G90143_750k_profile,G90143_500k_profile,G90143_250k_profile,...,G69146_750k_profile,G69146_500k_profile,G69146_250k_profile,G69146_100k_profile,G69146_1000k_profile,G69146_10000k_profile,G65860_750k_profile,G65860_250k_profile,G65860_100k_profile,G65860_1000k_profile
0,UNKNOWN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,k__Archaea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,k__Archaea|p__Euryarchaeota,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,k__Archaea|p__Euryarchaeota|c__Methanobacteria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,k__Archaea|p__Euryarchaeota|c__Methanobacteria...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
subsample_genus = subsample[subsample['taxa'].str.contains("\|g__")] # keep genera
subsample_genus = subsample_genus[~subsample_genus['taxa'].str.contains("\|s__")] # keep species
subsample_genus["taxa"] = subsample_genus['taxa'].str.split("\|g__").str[-1]
subsample_genus["taxa"] = subsample_genus['taxa'].str.split("\|s__").str[0]#

In [15]:
# remove taxa that are unclassified or have no name
# "_unclassified"
# "_noname"
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("_unclassified")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("_noname")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("virus")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("Candidatus")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("candidate")]

In [16]:
subsample_genus = subsample_genus.groupby(['taxa']).sum().T

In [17]:
subsample_genus = subsample_genus.reset_index()

In [18]:
subsample_genus.rename(columns={"index": "uid"}, inplace=True)

In [19]:
split_list = subsample_genus["uid"].str.split('_')
subsample_genus["sampleid"] = [string[0] for string in split_list]
subsample_genus["read_depth"] = [string[1] for string in split_list]
subsample_genus['read_depth'] = subsample_genus['read_depth'].map(lambda x: x.lstrip('+-').rstrip('k')) # remove k

In [20]:
subsample_genus["sampleid"] = subsample_genus["sampleid"].astype(str)

In [21]:
subsample_genus.head()

taxa,uid,Absiella,Acidaminococcus,Acinetobacter,Actinobaculum,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,Aeromonas,...,Turicibacter,Turicimonas,Tyzzerella,Varibaculum,Veillonella,Victivallis,Weissella,Yersinia,sampleid,read_depth
0,G90147_750k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,34.21486,0.0,0.0,0.0,G90147,750
1,G90147_500k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,32.98802,0.0,0.0,0.0,G90147,500
2,G90147_250k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,34.80456,0.0,0.0,0.0,G90147,250
3,G90147_100k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,37.79919,0.0,0.0,0.0,G90147,100
4,G90147_1000k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,34.64051,0.0,0.0,0.0,G90147,1000


### Adding metadata

In [22]:
# adding age metadata
metadata = pd.read_csv("~/Documents/thesis/paper-diabimmune-method-comparison/diabimmune_metadata.csv", index_col = 0).T

In [23]:
metadata.rename(columns={"gid_wgs": "sampleid", 'mgx_reads':"read_depth"}, inplace=True)

In [24]:
metadata["age_at_collection"] = metadata['age_at_collection'].astype(str).astype(int)

In [25]:
metadata["sampleid"] = metadata["sampleid"].astype(str)

In [26]:
metadata.head()

Unnamed: 0,subjectID,SampleID,age_at_collection,collection_month,delivery,gest_time,gender,country,Exclusive_breast_feeding,Breast_feeding_end,...,allergy_dog,allergy_birch,allergy_timothy,sampleid,read_depth,mgx_pool,mgx_reads_filtered,read_count_16S,sequencing_PDO_16S,gid_16s
5,E002338,3101193,304,10,vaginal,285,Female,FIN,False,False,...,False,False,False,G80541,28876314,Plate 7,28.568962,15870,PDO-4356,G75694
6,E002338,3107294,493,16,vaginal,285,Female,FIN,False,True,...,False,False,False,G80537,29615868,Plate 7,29.542272,13609,PDO-4356,G75855
7,E002338,3113022,852,28,vaginal,285,Female,FIN,False,True,...,False,False,False,G80322,30478184,Plate 8,30.41078,34620,PDO-4157,G73882
8,E002338,3107293,399,13,vaginal,285,Female,FIN,False,False,...,False,False,False,G80513,28503488,Plate 7,28.360762,7337,PDO-4356,G75849
15,E002338,3101190,212,7,vaginal,285,Female,FIN,False,False,...,False,False,False,,23928698,Diab Plate 9,23.824676,18931,PDO-4356,G75788


In [27]:
age_dict = pd.Series(metadata.age_at_collection.values,index=metadata.sampleid).to_dict()

In [28]:
read_depth_dict = pd.Series(metadata.read_depth.values,index=metadata.sampleid).to_dict()

In [30]:
mgx_genus["AgeMonths"]= mgx_genus["sampleid"].map(age_dict)
mgx_genus["read_depth"] = mgx_genus["sampleid"].map(read_depth_dict)
mgx_genus["sampling_cat"] = "original depth"

In [31]:
mgx_genus.head()

taxa,sampleid,Abiotrophia,Acidaminococcus,Acinetobacter,Actinobacillus,Actinobaculum,Actinomyces,Adlercreutzia,Aeromonas,Aggregatibacter,...,Turicibacter,Varibaculum,Variovorax,Veillonella,Weissella,Yersinia,uid,AgeMonths,read_depth,sampling_cat
0,G69146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07238,0.0,0.0,G69146_original,686,18119742,original depth
1,G69147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.98064,0.0,0.0,G69147_original,173,13088576,original depth
2,G69148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07409,0.0,0.0,G69148_original,531,10470666,original depth
3,G69149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00741,0.0,0.0,G69149_original,347,13000628,original depth
4,G69150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07214,0.0,0.0,G69150_original,342,14763702,original depth


In [32]:
subsample_genus["AgeMonths"]= subsample_genus["sampleid"].map(age_dict)
subsample_genus["sampling_cat"] = subsample_genus["read_depth"]

In [33]:
subsample_genus.sample(5)

taxa,uid,Absiella,Acidaminococcus,Acinetobacter,Actinobaculum,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,Aeromonas,...,Tyzzerella,Varibaculum,Veillonella,Victivallis,Weissella,Yersinia,sampleid,read_depth,AgeMonths,sampling_cat
1377,G80363_500k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,G80363,500,614.0,500
748,G80483_10000k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04568,0.0,...,0.0,0.0,0.24643,0.0,0.0,0.0,G80483,10000,121.0,10000
3922,G78527_500k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,G78527,500,1075.0,500
1199,G80394_100k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,G80394,100,344.0,100
574,G80515_10000k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20.5737,0.0,0.0,0.0,G80515,10000,215.0,10000


In [35]:
subsample_genus = subsample_genus[subsample_genus['AgeMonths'].notna()]

In [36]:
concat_df = pd.concat([mgx_genus,subsample_genus], sort=True).reset_index(drop = True)

In [37]:
concat_df = concat_df.fillna(0) 

In [38]:
concat_df.sample(10)

Unnamed: 0,Abiotrophia,Absiella,Acidaminococcus,Acinetobacter,Actinobacillus,Actinobaculum,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,...,Varibaculum,Variovorax,Veillonella,Victivallis,Weissella,Yersinia,read_depth,sampleid,sampling_cat,uid
2302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,250,G80335,250,G80335_250k_profile
1942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,250,G80397,250,G80397_250k_profile
4721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,250,G78520,250,G78520_250k_profile
1546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.02058,0.0,0.0,10000,G80476,10000,G80476_10000k_profile
778,0.0,0.0,0.0,0.0,0.04346,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01983,0.0,0.0,0.0,35148444,G80615,original depth,G80615_original
2147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,100,G80362,100,G80362_100k_profile
4071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10000,G78638,10000,G78638_10000k_profile
3183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,70.23909,0.0,0.0,0.0,250,G78834,250,G78834_250k_profile
168,0.0,0.0,0.0,0.0,0.02005,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.97425,0.0,0.0,0.0,26334876,G78580,original depth,G78580_original
2831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1000,G78898,1000,G78898_1000k_profile


In [43]:
bins = [0, 450, 900, np.inf]
names = ['less than 15 months', '15 to 30 months', 'older than 30 months']

concat_df['dev_stage'] = pd.cut(concat_df['AgeMonths'], bins, labels=names)

In [44]:
cols_to_order = ['uid', 'sampleid', 'read_depth', 'AgeMonths', 'dev_stage', 'sampling_cat']
new_columns = cols_to_order + (concat_df.columns.drop(cols_to_order).tolist())
concat_df = concat_df[new_columns]

In [45]:
concat_df.sample(15)

Unnamed: 0,uid,sampleid,read_depth,AgeMonths,dev_stage,sampling_cat,Abiotrophia,Absiella,Acidaminococcus,Acinetobacter,...,Terrisporobacter,Turicibacter,Turicimonas,Tyzzerella,Varibaculum,Variovorax,Veillonella,Victivallis,Weissella,Yersinia
445,G78914_original,G78914,22647210,846.0,15 to 30 months,original depth,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.2813,0.0,0.0,0.0
1406,G80501_100k_profile,G80501,100,114.0,less than 15 months,100,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4175,G78619_1000k_profile,G78619,1000,474.0,15 to 30 months,1000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05567,0.0,0.0,0.0
2663,G80273_100k_profile,G80273,100,840.0,15 to 30 months,100,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4763,G78512_750k_profile,G78512,750,598.0,15 to 30 months,750,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.83959,0.0,0.0,0.0
3935,G78666_500k_profile,G78666,500,500.0,15 to 30 months,500,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3663,G78728_1000k_profile,G78728,1000,706.0,15 to 30 months,1000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.423,0.0,0.0,0.0
2564,G80290_750k_profile,G80290,750,757.0,15 to 30 months,750,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4259,G78605_10000k_profile,G78605,10000,580.0,15 to 30 months,10000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00586,0.0,0.0,0.0,0.66958,0.0,0.0,0.0
1329,G80516_1000k_profile,G80516,1000,44.0,less than 15 months,1000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
concat_df.to_csv('diabimmune_subsampled_df.csv', index = False)