In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt

import os
import glob
import re

In [2]:
# read in mgx data
original = pd.read_csv("diabimmune_karelia_metaphlan_table.txt", sep="\t")
original.rename(columns={'ID': 'taxa'}, inplace=True)

In [3]:
mgx_genus = original[original['taxa'].str.contains("\|g__")] # keep genera
mgx_genus = mgx_genus[~mgx_genus['taxa'].str.contains("\|s__")] # keep species
mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|g__").str[-1]
mgx_genus["taxa"] = mgx_genus['taxa'].str.split("\|s__").str[0]#

In [4]:
# remove taxa that are unclassified or have no name
# "_unclassified"
# "_noname"
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("_unclassified")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("_noname")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("virus")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("Candidatus")]
mgx_genus = mgx_genus[~mgx_genus.taxa.str.contains("candidate")]

In [5]:
mgx_genus.head()

Unnamed: 0,taxa,G69146,G69147,G69148,G69149,G69150,G69152,G69153,G69154,G69155,...,G80612,G80613,G80614,G80615,G80616,G80619,G80620,G80621,G80623,G80624
5,Methanobrevibacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,Granulicella,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,Actinobaculum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,Actinomyces,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03404,0.0,...,0.0,0.00114,0.14391,0.0,0.0,0.0,0.0,0.0,0.0,0.01833
52,Varibaculum,0.0,0.0,0.0,0.0,0.0,0.00198,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
mgx_genus = mgx_genus.groupby(['taxa']).sum().T

In [7]:
mgx_genus = mgx_genus.reset_index()

In [8]:
mgx_genus.rename(columns={"index": "sampleid"}, inplace=True)

In [9]:
mgx_genus.head()

taxa,sampleid,Abiotrophia,Acidaminococcus,Acinetobacter,Actinobacillus,Actinobaculum,Actinomyces,Adlercreutzia,Aeromonas,Aggregatibacter,...,Subdoligranulum,Succinatimonas,Sutterella,Tannerella,Turicibacter,Varibaculum,Variovorax,Veillonella,Weissella,Yersinia
0,G69146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.26124,0.0,0.0,0.0,0.0,0.0,0.0,0.07238,0.0,0.0
1,G69147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.44759,0.0,0.0,0.0,0.0,0.0,0.0,0.98064,0.0,0.0
2,G69148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.51395,0.0,0.0,0.0,0.0,0.0,0.0,0.07409,0.0,0.0
3,G69149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00741,0.0,0.0
4,G69150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.4115,0.0,0.0,0.0,0.0,0.0,0.0,0.07214,0.0,0.0


In [10]:
len(mgx_genus.sampleid.unique())

785

In [11]:
# read in subsampled data
subsample = pd.read_csv("merged_metaphlan.tsv", sep="\t", 
                        skiprows=[0])
subsample.rename(columns={'clade_name': 'taxa'}, inplace=True) 

In [12]:
subsample.drop(['NCBI_tax_id'], axis=1, inplace=True)

In [13]:
subsample.head()

Unnamed: 0,taxa,G90147_750k_profile,G90147_500k_profile,G90147_250k_profile,G90147_100k_profile,G90147_1000k_profile,G90147_10000k_profile,G90143_750k_profile,G90143_500k_profile,G90143_250k_profile,...,G69146_750k_profile,G69146_500k_profile,G69146_250k_profile,G69146_100k_profile,G69146_1000k_profile,G69146_10000k_profile,G65860_750k_profile,G65860_250k_profile,G65860_100k_profile,G65860_1000k_profile
0,UNKNOWN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,k__Archaea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,k__Archaea|p__Euryarchaeota,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,k__Archaea|p__Euryarchaeota|c__Methanobacteria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,k__Archaea|p__Euryarchaeota|c__Methanobacteria...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
subsample_genus = subsample[subsample['taxa'].str.contains("\|g__")] # keep genera
subsample_genus = subsample_genus[~subsample_genus['taxa'].str.contains("\|s__")] # keep species
subsample_genus["taxa"] = subsample_genus['taxa'].str.split("\|g__").str[-1]
subsample_genus["taxa"] = subsample_genus['taxa'].str.split("\|s__").str[0]#

In [15]:
# remove taxa that are unclassified or have no name
# "_unclassified"
# "_noname"
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("_unclassified")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("_noname")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("virus")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("Candidatus")]
subsample_genus = subsample_genus[~subsample_genus.taxa.str.contains("candidate")]

In [16]:
subsample_genus = subsample_genus.groupby(['taxa']).sum().T

In [17]:
subsample_genus = subsample_genus.reset_index()

In [18]:
subsample_genus.rename(columns={"index": "uid"}, inplace=True)

In [19]:
split_list = subsample_genus["uid"].str.split('_')
subsample_genus["sampleid"] = [string[0] for string in split_list]
subsample_genus["read_depth"] = [string[1] for string in split_list]
subsample_genus['read_depth'] = subsample_genus['read_depth'].map(lambda x: x.lstrip('+-').rstrip('k')) # remove k


In [98]:
subsample_genus["sampleid"] = subsample_genus["sampleid"].astype(str)

In [99]:
subsample_genus.head()

taxa,uid,Absiella,Acidaminococcus,Acinetobacter,Actinobaculum,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,Aeromonas,...,Turicibacter,Turicimonas,Tyzzerella,Varibaculum,Veillonella,Victivallis,Weissella,Yersinia,sampleid,read_depth
0,G90147_750k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,34.21486,0.0,0.0,0.0,G90147,750
1,G90147_500k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,32.98802,0.0,0.0,0.0,G90147,500
2,G90147_250k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,34.80456,0.0,0.0,0.0,G90147,250
3,G90147_100k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,37.79919,0.0,0.0,0.0,G90147,100
4,G90147_1000k_profile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,34.64051,0.0,0.0,0.0,G90147,1000


### Adding metadata

In [21]:
# adding age metadata
metadata = pd.read_csv("~/Documents/thesis/paper-diabimmune-method-comparison/diabimmune_metadata.csv", index_col = 0).T

In [22]:
metadata.rename(columns={"gid_16s": "sampleid", 'mgx_reads':"read_depth"}, inplace=True)

In [23]:
metadata["age_at_collection"] = metadata['age_at_collection'].astype(str).astype(int)

In [97]:
metadata["sampleid"] = metadata["sampleid"].astype(str)

In [24]:
# add dev_stage categories

bins = [0, 450, 900, np.inf]
names = ['less than 15 months', '15 to 30 months', 'older than 30 months']

metadata['dev_stage'] = pd.cut(metadata['age_at_collection'], bins, labels=names)

In [29]:
metadata.head()

Unnamed: 0,subjectID,SampleID,age_at_collection,collection_month,delivery,gest_time,gender,country,Exclusive_breast_feeding,Breast_feeding_end,...,allergy_birch,allergy_timothy,gid_wgs,read_depth,mgx_pool,mgx_reads_filtered,read_count_16S,sequencing_PDO_16S,sampleid,dev_stage
5,E002338,3101193,304,10,vaginal,285,Female,FIN,False,False,...,False,False,G80541,28876314,Plate 7,28.568962,15870,PDO-4356,G75694,less than 15 months
6,E002338,3107294,493,16,vaginal,285,Female,FIN,False,True,...,False,False,G80537,29615868,Plate 7,29.542272,13609,PDO-4356,G75855,15 to 30 months
7,E002338,3113022,852,28,vaginal,285,Female,FIN,False,True,...,False,False,G80322,30478184,Plate 8,30.41078,34620,PDO-4157,G73882,15 to 30 months
8,E002338,3107293,399,13,vaginal,285,Female,FIN,False,False,...,False,False,G80513,28503488,Plate 7,28.360762,7337,PDO-4356,G75849,less than 15 months
15,E002338,3101190,212,7,vaginal,285,Female,FIN,False,False,...,False,False,,23928698,Diab Plate 9,23.824676,18931,PDO-4356,G75788,less than 15 months


In [100]:
age_dict = pd.Series(metadata.age_at_collection.values,index=metadata.sampleid).to_dict()

In [67]:
'G69256' in metadata.sampleid.unique()

False

In [87]:
len(mgx_genus.sampleid.unique())

785

In [105]:
no_id = [microbe for microbe in list(subsample_genus.sampleid.unique()) if microbe not in list(metadata.sampleid.unique())]

In [106]:
len(no_id)

811

In [107]:
len(subsample_genus.sampleid.unique())

812

In [39]:
len(subsample_genus.sampleid.unique())

812

In [108]:
# make read_Depth dictionary
depth_dict = {str(s): {} for s in metadata["sampleid"]}
for index, row in metadata.iterrows():
    age_months = row["read_depth"]
    agedict[row["sampleid"]]= age_months

In [109]:
# dev_stage dictionary
dev_stage_dict = {str(s): {} for s in metadata["dev_stage"]}
for index, row in metadata.iterrows():
    stage = row["dev_stage"]
    dev_stage_dict[row["sampleid"]] = stage

In [110]:
mgx_genus["AgeMonths"]= mgx_genus["sampleid"].map(age_dict)
mgx_genus["dev_stage"]= mgx_genus["sampleid"].map(dev_stage_dict)
mgx_genus["sampling_cat"] = "original depth"

In [111]:
mgx_genus.head()

taxa,sampleid,Abiotrophia,Acidaminococcus,Acinetobacter,Actinobacillus,Actinobaculum,Actinomyces,Adlercreutzia,Aeromonas,Aggregatibacter,...,Tannerella,Turicibacter,Varibaculum,Variovorax,Veillonella,Weissella,Yersinia,AgeMonths,dev_stage,sampling_cat
0,G69146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.07238,0.0,0.0,,,original depth
1,G69147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.98064,0.0,0.0,,,original depth
2,G69148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.07409,0.0,0.0,,,original depth
3,G69149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00741,0.0,0.0,,,original depth
4,G69150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.07214,0.0,0.0,,,original depth


In [None]:
subsample_reshaped["AgeMonths"]= subsample_reshaped["sampleid"].map(age_dict)
subsample_reshaped["dev_stage"]= subsample_reshaped["sampleid"].map(dev_stage_dict)
subsample_reshaped["sampling_cat"] = subsample_reshaped["read_depth"]

In [None]:
subsample_reshaped.head(15)

In [None]:
concat_df = pd.concat([mgx_genus,subsample_reshaped], sort=True).reset_index(drop = True)

In [None]:
concat_df = concat_df.fillna(0) 

In [None]:
cols_to_order = ['uid', 'sampleid', 'read_depth', 'AgeMonths', 'dev_stage', 'sampling_cat']
new_columns = cols_to_order + (concat_df.columns.drop(cols_to_order).tolist())
concat_df = concat_df[new_columns]

In [None]:
concat_df.head(15)

In [None]:
concat_df.to_csv('diabimmune_subsampled_df.csv', index = False)