In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import matplotlib.pyplot as plt

import os
import glob

In [2]:
# read in original data

path = '/Users/danielle/Documents/thesis/subsampled_analysis/original_resampling/*'                     
all_files = [name for name in glob.glob(path)]   

In [3]:
df_from_each_file = []

for f in all_files:
    df = pd.read_csv(f, sep = '\t') # read in dataframe 
    
    f = f.replace("_",'-')
    
    id_1 = f.split('original-resampling/')[1] # add sample id from filename 
    id_2 = id_1.split('-S')[0]
    df["sampleid"] = id_2
    
    df.rename(columns = {'#SampleID':'taxa', 'Metaphlan2_Analysis':'abundance'}, inplace = True) 
    
    df = df[df['taxa'].str.contains("\|s__")] # only keep genera
    df = df[~df['taxa'].str.contains("\|t__")] 
    df["taxa"] = df['taxa'].str.split("\|g__").str[-1]
    df["taxa"] = df['taxa'].str.split("\|s__").str[0]
    
    # remove unclassified
    df = df[~df.taxa.str.contains("_unclassified")]
    df = df[~df.taxa.str.contains("_noname")]
    
    # convert to relative abundance
    df["abundance"] = df["abundance"]/100.0
    
    df_from_each_file.append(df)

In [4]:
original = pd.concat(df_from_each_file, ignore_index=True) # concat all dataframes together

In [5]:
original_reshaped = original.pivot_table(index = "sampleid", values="abundance", columns = "taxa") # pivot

In [6]:
original_reshaped.head()

taxa,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Akkermansia,Alistipes,Alloprevotella,Anaerococcus,Anaerofustis,...,Ruminococcus,Shigella,Slackia,Staphylococcus,Streptococcus,Subdoligranulum,Sutterella,Turicibacter,Varibaculum,Veillonella
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0005-3F-1A,,,,,0.000395,,0.011947,,0.000122,,...,0.021263,,,,0.000731,0.038609,0.007346,0.000521,,
C0016-3F-1A,,,,,0.000187,0.0025,0.00995,,,,...,0.012197,,,,0.003032,0.06576,0.008949,,,0.00043
C0017-2F-1A,,,,,,0.024144,0.015922,,,,...,0.003829,,,,0.001998,0.022731,0.019241,,,0.000475
C0029-6F-1A,,,,0.000123,0.002727,0.023825,0.029313,,,,...,0.027507,,,,,0.049503,,,,
C0032-9F-1A,,,,,0.002953,,0.017328,,,,...,0.063848,,,,0.021758,0.080632,,,,


In [7]:
original_reshaped.columns.name = None

In [8]:
original_reshaped = original_reshaped.rename_axis(None, axis=1).reset_index()

In [9]:
original_reshaped = original_reshaped.fillna(0)  # fill in missing values with 0

In [10]:
original_reshaped["uid"] = original_reshaped["sampleid"].astype(str)+'-original'# add unique identifier

In [11]:
original_reshaped.head(15)

Unnamed: 0,sampleid,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Akkermansia,Alistipes,Alloprevotella,Anaerococcus,...,Shigella,Slackia,Staphylococcus,Streptococcus,Subdoligranulum,Sutterella,Turicibacter,Varibaculum,Veillonella,uid
0,C0005-3F-1A,0.0,0.0,0.0,0.0,0.000395,0.0,0.011947,0.0,0.000122,...,0.0,0.0,0.0,0.000731,0.038609,0.007346,0.000521,0.0,0.0,C0005-3F-1A-original
1,C0016-3F-1A,0.0,0.0,0.0,0.0,0.000187,0.0025,0.00995,0.0,0.0,...,0.0,0.0,0.0,0.003032,0.06576,0.008949,0.0,0.0,0.00043,C0016-3F-1A-original
2,C0017-2F-1A,0.0,0.0,0.0,0.0,0.0,0.024144,0.015922,0.0,0.0,...,0.0,0.0,0.0,0.001998,0.022731,0.019241,0.0,0.0,0.000475,C0017-2F-1A-original
3,C0029-6F-1A,0.0,0.0,0.0,0.000123,0.002727,0.023825,0.029313,0.0,0.0,...,0.0,0.0,0.0,0.0,0.049503,0.0,0.0,0.0,0.0,C0029-6F-1A-original
4,C0032-9F-1A,0.0,0.0,0.0,0.0,0.002953,0.0,0.017328,0.0,0.0,...,0.0,0.0,0.0,0.021758,0.080632,0.0,0.0,0.0,0.0,C0032-9F-1A-original
5,C0043-7F-1A,0.0,0.0,0.0,0.0,0.000359,0.003434,0.009745,0.0,0.0,...,0.0,0.0,0.0,0.013986,0.025354,0.0,0.0,0.0,0.000952,C0043-7F-1A-original
6,C0047-7F-1A,0.0,0.0,0.0,0.0,0.000337,0.001211,0.00708,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01048,0.010724,0.0,0.0,0.0,C0047-7F-1A-original
7,C0052-5F-1A,0.0,0.0,0.0,0.0,0.00045,7.6e-05,0.006235,0.0,0.0,...,0.0,0.0,0.0,0.002575,0.05411,0.0,0.0,0.0,0.00021,C0052-5F-1A-original
8,C0053-6F-1A,0.0,0.0,0.0,0.0,0.000239,0.0,0.000207,0.0,0.0,...,0.0,0.0,0.0,0.00777,0.00502,0.0,0.0,0.0,0.003054,C0053-6F-1A-original
9,C0055-3F-1A,0.0,0.0,0.0,0.0,0.002635,0.0,0.044574,0.0,0.0,...,0.0,0.010557,0.0,0.001065,0.028274,0.0,0.0,0.0,0.0,C0055-3F-1A-original


In [12]:
subsample = pd.read_csv("~/Documents/thesis/subsampled_analysis/merged_abundance_table.tsv", sep='\t')

In [13]:
subsample["ID"][8]

'k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_unclassified'

In [14]:
# only keep genera
subsample = subsample[subsample['ID'].str.contains("\|g__")]
subsample = subsample[~subsample['ID'].str.contains("\|s__")]
subsample = subsample[~subsample['ID'].str.contains("\|t__")]

In [15]:
# cleaning genera name
subsample["ID"] = subsample['ID'].str.split("\|g__").str[-1]
subsample["ID"] = subsample['ID'].str.split("\|s__").str[0]

In [16]:
subsample.head()

Unnamed: 0,ID,C0005_3F_1A_1000k_1_profile,C0005_3F_1A_1000k_2_profile,C0005_3F_1A_1000k_3_profile,C0005_3F_1A_1000k_4_profile,C0005_3F_1A_100k_1_profile,C0005_3F_1A_100k_2_profile,C0005_3F_1A_100k_3_profile,C0005_3F_1A_100k_4_profile,C0005_3F_1A_10k_1_profile,...,C0785_1F_1A_1000k_3_profile,C0785_1F_1A_1000k_4_profile,C0785_1F_1A_100k_1_profile,C0785_1F_1A_100k_2_profile,C0785_1F_1A_100k_3_profile,C0785_1F_1A_100k_4_profile,C0785_1F_1A_10k_1_profile,C0785_1F_1A_10k_2_profile,C0785_1F_1A_10k_3_profile,C0785_1F_1A_10k_4_profile
5,Methanobrevibacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,Actinomyces,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,Varibaculum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,Bifidobacterium,3.37949,3.87518,3.42203,3.48315,2.55393,0.81837,3.46041,2.23005,0.0,...,6.3615,6.66787,4.19725,0.44341,2.20783,2.71964,0.0,0.0,0.0,0.0
41,Adlercreutzia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
subsample_transposed = subsample.set_index('ID').transpose()

In [18]:
subsample_transposed.reset_index(level=0, inplace=True)

In [19]:
subsample_transposed.rename(columns = {'index':'sampleid'}, inplace = True) 

In [20]:
subsample_transposed.head()

ID,sampleid,Methanobrevibacter,Actinomyces,Varibaculum,Bifidobacterium,Adlercreutzia,Atopobium,Collinsella,Eggerthella,Slackia,...,Morganella,Proteus,Shigella,Actinobacillus,Haemophilus,Akkermansia,Mulikevirus,PhiCD119likevirus,Epsilon15likevirus,C2likevirus
0,C0005_3F_1A_1000k_1_profile,0.0,0.0,0.0,3.37949,0.0,0.0,1.46211,0.47765,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C0005_3F_1A_1000k_2_profile,0.0,0.0,0.0,3.87518,0.0,0.0,1.55445,0.48849,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C0005_3F_1A_1000k_3_profile,0.0,0.0,0.0,3.42203,0.0,0.0,1.54075,0.46666,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C0005_3F_1A_1000k_4_profile,0.0,0.0,0.0,3.48315,0.0,0.0,1.6961,0.68797,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C0005_3F_1A_100k_1_profile,0.0,0.0,0.0,2.55393,0.0,0.0,0.55399,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# melt dataframes
subsample_melt = pd.melt(subsample, id_vars=["ID"], var_name = "sampleid", value_name = "abund")

In [22]:
subsample_melt.sample(5)

Unnamed: 0,ID,sampleid,abund
20613,Ruminococcaceae_noname,C0556_2F_1A_1000k_3_profile,0.0
11627,Bacteroides,C0482_3F_1A_1000k_2_profile,24.23031
22327,Fusobacterium,C0595_1F_1A_1000k_2_profile,0.0
13432,Peptostreptococcaceae_noname,C0494_2F_1A_1000k_2_profile,0.63267
24031,Parasutterella,C0603_1F_1A_1000k_1_profile,0.0


In [23]:
subsample_melt["replicate"] = subsample_melt['sampleid'].str.split("k_").str[-1]
subsample_melt["replicate"] = subsample_melt['replicate'].str.split("_profile").str[0]

In [24]:
subsample_melt["read_depth"] = subsample_melt['sampleid'].str.split("1A_").str[-1]
subsample_melt["read_depth"] = subsample_melt['read_depth'].str.split("k_").str[0]

In [25]:
subsample_melt.to_csv('subsample_cleaned.csv')

In [26]:
# add metadata to transposed dataframe
subsample_transposed["replicate"] = subsample_transposed['sampleid'].str.split("k_").str[-1]
subsample_transposed["replicate"] = subsample_transposed['replicate'].str.split("_profile").str[0]
subsample_transposed["read_depth"] = subsample_transposed['sampleid'].str.split("1A_").str[-1]
subsample_transposed["read_depth"] =subsample_transposed['read_depth'].str.split("k_").str[0]


In [27]:
subsample_transposed['read_depth'] = (subsample_transposed['read_depth'].astype(str).astype(int))*1000.0

In [28]:
subsample_transposed.head()

ID,sampleid,Methanobrevibacter,Actinomyces,Varibaculum,Bifidobacterium,Adlercreutzia,Atopobium,Collinsella,Eggerthella,Slackia,...,Shigella,Actinobacillus,Haemophilus,Akkermansia,Mulikevirus,PhiCD119likevirus,Epsilon15likevirus,C2likevirus,replicate,read_depth
0,C0005_3F_1A_1000k_1_profile,0.0,0.0,0.0,3.37949,0.0,0.0,1.46211,0.47765,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1000000.0
1,C0005_3F_1A_1000k_2_profile,0.0,0.0,0.0,3.87518,0.0,0.0,1.55445,0.48849,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1000000.0
2,C0005_3F_1A_1000k_3_profile,0.0,0.0,0.0,3.42203,0.0,0.0,1.54075,0.46666,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1000000.0
3,C0005_3F_1A_1000k_4_profile,0.0,0.0,0.0,3.48315,0.0,0.0,1.6961,0.68797,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1000000.0
4,C0005_3F_1A_100k_1_profile,0.0,0.0,0.0,2.55393,0.0,0.0,0.55399,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,100000.0


In [29]:
# fix '_' in sampleid, sample names
subsample_transposed["sampleid"] = subsample_transposed["sampleid"].str.replace("_",'-')

In [30]:
subsample_transposed["uid"] = subsample_transposed["sampleid"]


In [31]:
subsample_transposed["sampleid"] = subsample_transposed["sampleid"].str.split("-10").str[0]

In [32]:
subsample_transposed.drop(columns=['replicate'], inplace= True)

In [33]:
subsample_transposed.head()

ID,sampleid,Methanobrevibacter,Actinomyces,Varibaculum,Bifidobacterium,Adlercreutzia,Atopobium,Collinsella,Eggerthella,Slackia,...,Shigella,Actinobacillus,Haemophilus,Akkermansia,Mulikevirus,PhiCD119likevirus,Epsilon15likevirus,C2likevirus,read_depth,uid
0,C0005-3F-1A,0.0,0.0,0.0,3.37949,0.0,0.0,1.46211,0.47765,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-1-profile
1,C0005-3F-1A,0.0,0.0,0.0,3.87518,0.0,0.0,1.55445,0.48849,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-2-profile
2,C0005-3F-1A,0.0,0.0,0.0,3.42203,0.0,0.0,1.54075,0.46666,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-3-profile
3,C0005-3F-1A,0.0,0.0,0.0,3.48315,0.0,0.0,1.6961,0.68797,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-4-profile
4,C0005-3F-1A,0.0,0.0,0.0,2.55393,0.0,0.0,0.55399,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100000.0,C0005-3F-1A-100k-1-profile


In [34]:
# mgx data
original = pd.read_csv("~/Documents/thesis/analysis/mgx_abundance.csv")

In [35]:
original_transposed = original.set_index("taxa").transpose()
original_transposed.reset_index(level=0, inplace=True)
original_transposed.rename(columns = {'index':'sampleid'}, inplace = True) 
original_transposed["uid"] = original_transposed["sampleid"].astype(str)+'-original'

In [36]:
# adding age metadata
age = pd.read_csv("~/Documents/thesis/theoretical/sorted_babies.csv", index_col = 0)

In [37]:
age.head()

Unnamed: 0,sample,subject,timepoint,correctedAgeDays,shannon,reads,AgeMonths,dev_stage,color
1,C0005_3F_1A,5,3,4505.0,3.379592,6193602.0,150.166667,older than 30 months,yellow
2,C0016_3F_1A,16,3,2987.0,3.162439,7626286.0,99.566667,older than 30 months,yellow
3,C0016_4F_1A,16,4,3398.0,3.415994,5497174.0,113.266667,older than 30 months,yellow
4,C0017_2F_1A,17,2,3836.0,3.102341,6240254.0,127.866667,older than 30 months,yellow
5,C0017_3F_1A,17,3,4247.0,3.460197,7764432.0,141.566667,older than 30 months,yellow


In [38]:
# remove shannon column, change characters in sample names 
age.drop(columns=['shannon'], inplace= True)
age["sample"] = age["sample"].str.replace("_",'-')
age.rename(columns = {'sample':'sampleid', "reads":"read_depth"}, inplace = True) 

In [39]:
age.head(15)

Unnamed: 0,sampleid,subject,timepoint,correctedAgeDays,read_depth,AgeMonths,dev_stage,color
1,C0005-3F-1A,5,3,4505.0,6193602.0,150.166667,older than 30 months,yellow
2,C0016-3F-1A,16,3,2987.0,7626286.0,99.566667,older than 30 months,yellow
3,C0016-4F-1A,16,4,3398.0,5497174.0,113.266667,older than 30 months,yellow
4,C0017-2F-1A,17,2,3836.0,6240254.0,127.866667,older than 30 months,yellow
5,C0017-3F-1A,17,3,4247.0,7764432.0,141.566667,older than 30 months,yellow
6,C0029-6F-1A,29,6,3478.0,5641016.0,115.933333,older than 30 months,yellow
7,C0032-9F-1A,32,9,3084.0,7575450.0,102.8,older than 30 months,yellow
8,C0043-7F-1A,43,7,2598.0,3498886.0,86.6,older than 30 months,yellow
9,C0043-8F-1A,43,8,3059.0,8846224.0,101.966667,older than 30 months,yellow
10,C0047-7F-1A,47,7,2917.0,6353788.0,97.233333,older than 30 months,yellow


In [40]:
# make age dictionary
agedict = {str(s): {} for s in age["sampleid"]}
for index, row in age.iterrows():
    age_months = row["AgeMonths"]
    agedict[row["sampleid"]]= age_months

In [41]:
# read_depth dictionary
readdict = {str(s): {} for s in age["sampleid"]}
for index, row in age.iterrows():
    reads = row["read_depth"]
    readdict[row["sampleid"]]= reads

In [42]:
# dev_stage dictionary
dev_stage_dict = {str(s): {} for s in age["dev_stage"]}
for index, row in age.iterrows():
    stage = row["dev_stage"]
    dev_stage_dict[row["sampleid"]] = stage

In [43]:
original_reshaped["AgeMonths"]= original_reshaped["sampleid"].map(agedict)
original_reshaped["read_depth"]= original_reshaped["sampleid"].map(readdict)
original_reshaped["dev_stage"]= original_reshaped["sampleid"].map(dev_stage_dict)
original_reshaped["sampling_cat"] = "original depth"

In [44]:
original_reshaped.head(15)

Unnamed: 0,sampleid,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Akkermansia,Alistipes,Alloprevotella,Anaerococcus,...,Subdoligranulum,Sutterella,Turicibacter,Varibaculum,Veillonella,uid,AgeMonths,read_depth,dev_stage,sampling_cat
0,C0005-3F-1A,0.0,0.0,0.0,0.0,0.000395,0.0,0.011947,0.0,0.000122,...,0.038609,0.007346,0.000521,0.0,0.0,C0005-3F-1A-original,150.166667,6193602.0,older than 30 months,original depth
1,C0016-3F-1A,0.0,0.0,0.0,0.0,0.000187,0.0025,0.00995,0.0,0.0,...,0.06576,0.008949,0.0,0.0,0.00043,C0016-3F-1A-original,99.566667,7626286.0,older than 30 months,original depth
2,C0017-2F-1A,0.0,0.0,0.0,0.0,0.0,0.024144,0.015922,0.0,0.0,...,0.022731,0.019241,0.0,0.0,0.000475,C0017-2F-1A-original,127.866667,6240254.0,older than 30 months,original depth
3,C0029-6F-1A,0.0,0.0,0.0,0.000123,0.002727,0.023825,0.029313,0.0,0.0,...,0.049503,0.0,0.0,0.0,0.0,C0029-6F-1A-original,115.933333,5641016.0,older than 30 months,original depth
4,C0032-9F-1A,0.0,0.0,0.0,0.0,0.002953,0.0,0.017328,0.0,0.0,...,0.080632,0.0,0.0,0.0,0.0,C0032-9F-1A-original,102.8,7575450.0,older than 30 months,original depth
5,C0043-7F-1A,0.0,0.0,0.0,0.0,0.000359,0.003434,0.009745,0.0,0.0,...,0.025354,0.0,0.0,0.0,0.000952,C0043-7F-1A-original,86.6,3498886.0,older than 30 months,original depth
6,C0047-7F-1A,0.0,0.0,0.0,0.0,0.000337,0.001211,0.00708,0.0,0.0,...,0.01048,0.010724,0.0,0.0,0.0,C0047-7F-1A-original,97.233333,6353788.0,older than 30 months,original depth
7,C0052-5F-1A,0.0,0.0,0.0,0.0,0.00045,7.6e-05,0.006235,0.0,0.0,...,0.05411,0.0,0.0,0.0,0.00021,C0052-5F-1A-original,94.133333,7582006.0,older than 30 months,original depth
8,C0053-6F-1A,0.0,0.0,0.0,0.0,0.000239,0.0,0.000207,0.0,0.0,...,0.00502,0.0,0.0,0.0,0.003054,C0053-6F-1A-original,72.5,7101552.0,older than 30 months,original depth
9,C0055-3F-1A,0.0,0.0,0.0,0.0,0.002635,0.0,0.044574,0.0,0.0,...,0.028274,0.0,0.0,0.0,0.0,C0055-3F-1A-original,114.733333,4922724.0,older than 30 months,original depth


In [45]:
subsample_transposed["AgeMonths"]= subsample_transposed["sampleid"].map(agedict)
subsample_transposed["dev_stage"]= subsample_transposed["sampleid"].map(dev_stage_dict)
subsample_transposed["sampling_cat"] = subsample_transposed["read_depth"]

In [46]:
subsample_transposed.head(15)

ID,sampleid,Methanobrevibacter,Actinomyces,Varibaculum,Bifidobacterium,Adlercreutzia,Atopobium,Collinsella,Eggerthella,Slackia,...,Akkermansia,Mulikevirus,PhiCD119likevirus,Epsilon15likevirus,C2likevirus,read_depth,uid,AgeMonths,dev_stage,sampling_cat
0,C0005-3F-1A,0.0,0.0,0.0,3.37949,0.0,0.0,1.46211,0.47765,0.0,...,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-1-profile,150.166667,older than 30 months,1000000.0
1,C0005-3F-1A,0.0,0.0,0.0,3.87518,0.0,0.0,1.55445,0.48849,0.0,...,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-2-profile,150.166667,older than 30 months,1000000.0
2,C0005-3F-1A,0.0,0.0,0.0,3.42203,0.0,0.0,1.54075,0.46666,0.0,...,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-3-profile,150.166667,older than 30 months,1000000.0
3,C0005-3F-1A,0.0,0.0,0.0,3.48315,0.0,0.0,1.6961,0.68797,0.0,...,0.0,0.0,0.0,0.0,0.0,1000000.0,C0005-3F-1A-1000k-4-profile,150.166667,older than 30 months,1000000.0
4,C0005-3F-1A,0.0,0.0,0.0,2.55393,0.0,0.0,0.55399,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,100000.0,C0005-3F-1A-100k-1-profile,150.166667,older than 30 months,100000.0
5,C0005-3F-1A,0.0,0.0,0.0,0.81837,0.0,0.0,1.78702,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,100000.0,C0005-3F-1A-100k-2-profile,150.166667,older than 30 months,100000.0
6,C0005-3F-1A,0.0,0.0,0.0,3.46041,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,100000.0,C0005-3F-1A-100k-3-profile,150.166667,older than 30 months,100000.0
7,C0005-3F-1A,0.0,0.0,0.0,2.23005,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,100000.0,C0005-3F-1A-100k-4-profile,150.166667,older than 30 months,100000.0
8,C0005-3F-1A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10000.0,C0005-3F-1A-10k-1-profile,150.166667,older than 30 months,10000.0
9,C0005-3F-1A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10000.0,C0005-3F-1A-10k-2-profile,150.166667,older than 30 months,10000.0


In [47]:
concat_df = pd.concat([original_reshaped,subsample_transposed], sort=True).reset_index(drop = True)

In [48]:
cols_to_order = ['uid', 'sampleid', 'read_depth', 'AgeMonths', 'dev_stage', 'sampling_cat']
new_columns = cols_to_order + (concat_df.columns.drop(cols_to_order).tolist())
concat_df = concat_df[new_columns]

In [49]:
# remove mothers
# concat_df = concat_df[~concat_df.sampleid.str.contains("M")]

In [50]:
concat_df.sample(15)

Unnamed: 0,uid,sampleid,read_depth,AgeMonths,dev_stage,sampling_cat,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,...,Shigella,Slackia,Staphylococcus,Streptococcus,Subdoligranulum,Sutterella,Sutterellaceae_unclassified,Turicibacter,Varibaculum,Veillonella
290,C0537-1F-1A-10k-1-profile,C0537-1F-1A,10000.0,14.866667,less than 15 months,10000,0.0,,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,C0029-6F-1A-10k-2-profile,C0029-6F-1A,10000.0,115.933333,older than 30 months,10000,0.0,,0.0,0.0,...,0.0,0.0,,0.0,100.0,0.0,0.0,0.0,0.0,0.0
95,C0043-7F-1A-100k-2-profile,C0043-7F-1A,100000.0,86.6,older than 30 months,100000,0.0,,0.0,0.0,...,0.0,0.0,,2.87659,0.0,0.0,0.0,0.0,0.0,0.0
328,C0557-1F-1A-10k-3-profile,C0557-1F-1A,10000.0,13.733333,less than 15 months,10000,0.0,,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
356,C0603-1F-1A-1000k-3-profile,C0603-1F-1A,1000000.0,1.133333,less than 15 months,1e+06,0.0,,0.0,0.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.03793,5.5439
157,C0420-4F-1A-100k-4-profile,C0420-4F-1A,100000.0,26.6,15 to 30 months,100000,0.0,,0.0,0.0,...,0.0,0.0,,0.0,15.07653,0.0,0.0,0.0,0.0,0.0
46,C0016-3F-1A-100k-1-profile,C0016-3F-1A,100000.0,99.566667,older than 30 months,100000,0.0,,0.0,0.0,...,0.0,0.0,,0.0,17.57007,0.0,0.0,0.0,0.0,0.0
0,C0005-3F-1A-original,C0005-3F-1A,6193602.0,150.166667,older than 30 months,original depth,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000731,0.038609,0.007346,,0.000521,0.0,0.0
165,C0476-3F-1A-1000k-4-profile,C0476-3F-1A,1000000.0,24.166667,15 to 30 months,1e+06,0.0,,0.0,0.0,...,0.0,0.0,,2.57874,2.26099,0.0,0.0,0.0,0.0,0.0
192,C0482-3F-1A-100k-3-profile,C0482-3F-1A,100000.0,25.8,15 to 30 months,100000,0.0,,0.0,0.0,...,0.0,0.0,,0.0,13.71635,0.0,0.0,0.0,0.0,0.0


In [51]:
concat_df.to_csv('subsampled_df.csv')