In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import os
import glob

In [2]:
# read in original data

# Glob pattern matching every per-sample profile table.
path = '/Users/danielle/Documents/thesis/paper-abundance-tables/profiles/*'
# glob returns matches in arbitrary, filesystem-dependent order; sort so that the
# row order of everything built from these files is the same on every run.
all_files = sorted(glob.glob(path))

In [3]:
def _load_genus_profile(profile_path):
    """Read one profile table and return its genus-level abundances.

    Parameters
    ----------
    profile_path : str
        Path to a tab-separated profile file. The first three lines are
        skipped and the columns '#clade_name' and 'relative_abundance' are read.

    Returns
    -------
    pandas.DataFrame
        Columns 'taxa' (genus name), 'sampleid' and 'abundance', one row per
        genus, with abundance divided by 100.
    """
    profile = pd.read_csv(
        profile_path,
        sep='\t',
        skiprows=[0, 1, 2],
        usecols=['#clade_name', 'relative_abundance'],
    ).rename(columns={'#clade_name': 'taxa', 'relative_abundance': 'abundance'})

    # Sample id = file name up to the first "_S".
    sample_id = os.path.basename(profile_path).split('_S')[0]

    # Keep species-level rows only (lineage contains "|s__" but no "|t__" level).
    # Raw strings: "\|" is not a valid escape sequence in a normal string literal.
    lineage = profile['taxa']
    is_species_row = lineage.str.contains(r"\|s__") & ~lineage.str.contains(r"\|t__")
    # .copy() so the column assignments below act on an independent frame.
    profile = profile[is_species_row].copy()
    profile["sampleid"] = sample_id

    # Reduce the lineage string to the genus name: the text between "|g__" and "|s__".
    profile["taxa"] = (
        profile['taxa'].str.split(r"\|g__").str[-1].str.split(r"\|s__").str[0]
    )

    # remove unclassified / placeholder genera
    for marker in ("_unclassified", "_noname", "Candidatus"):
        profile = profile[~profile["taxa"].str.contains(marker)]

    # combine together species of the same genus
    genus_abund = profile.groupby(['taxa', 'sampleid'])['abundance'].sum().reset_index()

    # convert to relative abundance (input presumably in percent -- TODO confirm)
    genus_abund["abundance"] = genus_abund["abundance"] / 100.0
    return genus_abund


# One genus-level table per profile file, in the order of `all_files`.
df_from_each_file = [_load_genus_profile(f) for f in all_files]

In [4]:
mgx = pd.concat(df_from_each_file, ignore_index=True) # concat all dataframes together

In [5]:
mgx.head()

Unnamed: 0,taxa,sampleid,abundance
0,Agathobaculum,C0047-7E-1A,0.001349
1,Alistipes,C0047-7E-1A,0.036088
2,Anaerostipes,C0047-7E-1A,0.010047
3,Asaccharobacter,C0047-7E-1A,1.2e-05
4,Bacteroides,C0047-7E-1A,0.65244


In [6]:
# Long -> wide: one row per sample, one column per genus. pivot_table aggregates
# with its default (mean) if a sampleid/taxa pair occurs more than once, and
# combinations absent from `mgx` come out as NaN.
mgx_reshaped = mgx.pivot_table(index = "sampleid", values="abundance", columns = "taxa") # pivot

In [7]:
mgx_reshaped.columns.name = None

In [8]:
# Clear the name of the column axis and turn the sampleid index back into a regular column.
mgx_reshaped = mgx_reshaped.rename_axis(None, axis=1).reset_index()

In [9]:
mgx_reshaped = mgx_reshaped.fillna(0)  # fill in missing values with 0

In [10]:
mgx_reshaped

Unnamed: 0,sampleid,Absiella,Acetobacter,Acidaminococcus,Acinetobacter,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,Aeromonas,...,Stenotrophomonas,Streptococcus,Sutterella,Terrisporobacter,Turicibacter,Turicimonas,Tyzzerella,Varibaculum,Veillonella,Victivallis
0,C0047-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000156,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,C0052-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.001675,0.000000,0.0,0.002813,0.000000,0.000000,0.000000,0.000000,0.000000
2,C0053-8E-1A,0.0,0.0,0.0,0.000000,0.001754,0.000000,0.000623,0.000000,0.0,...,0.0,0.008166,0.000000,0.0,0.000670,0.000062,0.001062,0.000000,0.000000,0.000000
3,C0058-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.003549,0.000000,0.0,0.002141,0.000000,0.000000,0.000000,0.001234,0.000000
4,C0059-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.005600,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,C0106-6E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000396,0.000000,0.0,...,0.0,0.035666,0.000000,0.0,0.002481,0.000000,0.000029,0.000000,0.000235,0.000000
6,C0107-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000386,0.000000,0.0,...,0.0,0.008849,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,C0123-3E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000380,0.000000,0.0,...,0.0,0.002299,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,C0126-9E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000268,0.000000,0.0,...,0.0,0.010257,0.000000,0.0,0.000000,0.000103,0.000000,0.000000,0.000000,0.000000
9,C0127-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000288,0.000000,0.0,...,0.0,0.006092,0.000000,0.0,0.000000,0.000218,0.000088,0.000000,0.000000,0.000000


In [11]:
# adding age metadata
age = pd.read_csv("/Users/danielle/Documents/thesis/paper-abundance-tables/metadata/joined.csv")

In [12]:
age.head()

Unnamed: 0,sample,subject,timepoint,batch,childGender,correctedAgeDays,mother_HHS,childBMI,hasScan,mcDespotProcessed,...,rt_Pallidum,rt_Hippocampus,rt_Amygdala,rt_Insula,rt_Operculum,hires_total,cerebellar,neocortical,limbic,subcortical
0,C0052_5F_1A,52,5,1,Male,2824.0,6.0,16.728348,1.0,1.0,...,1315.875122,2491.843506,1107.90625,536.9375,570.96875,264479.536024,89286.65625,127223.941542,34246.781738,47968.938232
1,C0126_8F_1A,126,8,1,Male,2261.0,7.0,,1.0,0.0,...,1486.03125,2737.625,1210.000122,680.625,616.34375,320037.446868,100407.316406,163357.565277,41079.502685,56272.565185
2,C0126_8F_1B,126,8,1,Male,2261.0,7.0,,1.0,0.0,...,1486.03125,2737.625,1210.000122,680.625,616.34375,320037.446868,100407.316406,163357.565277,41079.502685,56272.565185
3,C0127_6F_1B,127,6,1,Female,3367.0,7.0,,1.0,1.0,...,1338.5625,2230.9375,1089.0,654.156311,567.1875,270177.880217,86435.597656,135330.938994,34012.343781,48411.343567
4,C0286_5F_1A,286,5,1,Male,1926.0,6.0,,1.0,0.0,...,1603.25,2457.8125,1070.09375,616.34375,548.28125,284418.066527,88447.21875,143566.50244,36700.813782,52404.345337


In [13]:
# change characters in sample names 
age["sample"] = age["sample"].str.replace("_",'-')

In [14]:
# Age in months, using a flat 30 days per month.
age["AgeMonths"] = age["correctedAgeDays"].div(30.0)
# Use the same key column name as the abundance tables.
age = age.rename(columns={'sample': 'sampleid'})

In [15]:
# make age dictionary: sample id -> age in months
# (if a sample id occurs more than once, the last row wins)
agedict = dict(zip(age["sampleid"], age["AgeMonths"]))

In [16]:
mgx_reshaped["AgeMonths"]= mgx_reshaped["sampleid"].map(agedict)

In [17]:
def assign_dev_stage(x):
    """Return the developmental-stage label for an age given in months.

    Below 15 -> "less than 15 months"; 15 through 30 inclusive ->
    "15 to 30 months"; above 30 -> "older than 30 months".
    A value for which every comparison is false (e.g. NaN) yields None.
    """
    if x < 15:
        return "less than 15 months"
    if x <= 30:
        return "15 to 30 months"
    if x > 30:
        return "older than 30 months"
    return None

In [18]:
mgx_reshaped['dev_stage'] = mgx_reshaped['AgeMonths'].apply(assign_dev_stage)

In [19]:
# re-order columns

# Metadata columns first, then the taxa columns in their existing order.
cols_to_order = ['sampleid', 'AgeMonths', 'dev_stage']
new_columns = cols_to_order + [col for col in mgx_reshaped.columns if col not in cols_to_order]
mgx_reshaped = mgx_reshaped[new_columns]

In [20]:
mgx_reshaped

Unnamed: 0,sampleid,AgeMonths,dev_stage,Absiella,Acetobacter,Acidaminococcus,Acinetobacter,Actinomyces,Actinotignum,Adlercreutzia,...,Stenotrophomonas,Streptococcus,Sutterella,Terrisporobacter,Turicibacter,Turicimonas,Tyzzerella,Varibaculum,Veillonella,Victivallis
0,C0047-7E-1A,97.233333,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000156,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,C0052-7E-1A,127.333333,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.001675,0.000000,0.0,0.002813,0.000000,0.000000,0.000000,0.000000,0.000000
2,C0053-8E-1A,105.700000,older than 30 months,0.0,0.0,0.0,0.000000,0.001754,0.000000,0.000623,...,0.0,0.008166,0.000000,0.0,0.000670,0.000062,0.001062,0.000000,0.000000,0.000000
3,C0058-4E-1A,137.766667,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.003549,0.000000,0.0,0.002141,0.000000,0.000000,0.000000,0.001234,0.000000
4,C0059-4E-1A,137.766667,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.005600,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,C0106-6E-1A,115.466667,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000396,...,0.0,0.035666,0.000000,0.0,0.002481,0.000000,0.000029,0.000000,0.000235,0.000000
6,C0107-4E-1A,115.466667,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000386,...,0.0,0.008849,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,C0123-3E-1A,93.066667,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000380,...,0.0,0.002299,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,C0126-9E-1A,88.466667,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000268,...,0.0,0.010257,0.000000,0.0,0.000000,0.000103,0.000000,0.000000,0.000000,0.000000
9,C0127-7E-1A,125.333333,older than 30 months,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000288,...,0.0,0.006092,0.000000,0.0,0.000000,0.000218,0.000088,0.000000,0.000000,0.000000


In [21]:
mgx_reshaped.to_csv('mgx_abund_df.csv', index=False)

In [22]:
# read in 16S data
amplicon = pd.read_csv("/Users/danielle/Documents/thesis/paper-abundance-tables/dada2_output_exported/feature-table.txt",
                      skiprows=[0], sep='\t')

In [23]:
# Discard the feature-id column and name the taxonomy column 'taxa',
# matching the column name used for the metagenomic table.
amplicon = amplicon.drop(columns=['#OTU ID']).rename(columns={"taxonomy": "taxa"})

In [24]:
# Move the last column to the front; every other column keeps its position.
cols = amplicon.columns.tolist()
cols.insert(0, cols.pop())
amplicon = amplicon[cols]

In [25]:
# clean taxonomic names
# keep only genera: take the text that follows the "D_5__" rank prefix
# (presumably the genus level of a SILVA-style lineage string -- TODO confirm).
# Rows without that prefix have no second split piece and become missing values.
amplicon["taxa"] = amplicon['taxa'].str.split("D_5__", expand=True)[1]

In [26]:
amplicon["taxa"] = amplicon['taxa'].str.split(";", expand=True)[0]

In [27]:
amplicon["taxa"] = amplicon['taxa'].str.split("_", expand=True)[0]

In [28]:
amplicon["taxa"] = amplicon['taxa'].str.split("-", expand=True)[0]

In [29]:
# Strip surrounding square brackets. astype(str) also turns missing names into
# literal strings (e.g. "nan"/"None"); the name filters applied after melting
# are relied on to drop those.
amplicon["taxa"] = amplicon["taxa"].str.strip('[]').astype(str)

In [30]:
amplicon.head()

Unnamed: 0,taxa,C0047-7E-1A,C0052-7E-1A,C0053-8E-1A,C0058-4E-1A,C0059-4E-1A,C0106-6E-1A,C0107-4E-1A,C0123-3E-1A,C0126-9E-1A,...,C1102-1F-1A,C1109-4F-1A,C1115-4F-1A,C1135-4F-1A,C1160-3F-1A,C1177-4F-1A,C1219-3F-1A,C2002-3E-1A,C2014-2E-1A,C2018-3E-1A
0,Bacteroides,0.0,1163.0,0.0,0.0,0.0,0.0,0.0,214.0,1476.0,...,369.0,0.0,0.0,0.0,9.0,336.0,0.0,577.0,0.0,376.0
1,Escherichia,0.0,5.0,42.0,0.0,0.0,7.0,0.0,0.0,0.0,...,4.0,0.0,637.0,593.0,1559.0,0.0,14633.0,18.0,1574.0,13.0
2,Prevotella,0.0,1595.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,826.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8823.0
3,Bacteroides,0.0,114.0,0.0,0.0,0.0,0.0,0.0,79.0,533.0,...,420.0,19.0,0.0,32.0,0.0,0.0,1801.0,889.0,0.0,0.0
4,Bacteroides,1289.0,0.0,0.0,2493.0,5670.0,4056.0,1143.0,0.0,0.0,...,831.0,0.0,0.0,0.0,0.0,13995.0,0.0,198.0,0.0,0.0


In [31]:
# combine together taxa of the same genera
amplicon = amplicon.groupby(['taxa']).sum().reset_index()

In [32]:
amplicon.sort_values(by = "taxa", ascending=True, inplace=True)

In [33]:
# calculate relative abundances
# Total only the per-sample count columns (everything after `taxa`): summing the
# whole frame also "sums" the text column, giving a non-numeric divisor entry
# that does not correspond to any count column.
sample_cols = amplicon.columns[1:]
bug_sum = amplicon[sample_cols].sum(axis=0, skipna=True)
# Divide each sample column by its own total (aligned on column labels).
amplicon[sample_cols] = amplicon[sample_cols].div(bug_sum, axis="columns")

In [34]:
amplicon_melt = pd.melt(amplicon, id_vars=["taxa"], var_name = "sampleid", value_name = "amplicon_abund")

In [35]:
# Names shorter than 5 characters are treated as non-genus labels.
# NOTE(review): this would also drop any real genus name with fewer than 5 letters.
remove = list(set([name for name in amplicon_melt["taxa"] if len(name)<5]))

In [36]:
# Names containing anything other than letters (digits, spaces, punctuation).
remove2 = list(set([name for name in amplicon_melt["taxa"] if not name.isalpha()]))

In [37]:
remove3 = list(set([name for name in amplicon_melt["taxa"] if "Candidatus" in name]))

In [38]:
remove_list = remove + remove2 + remove3 + ["uncultured", "Family", "None", "Clade", "Subgroup"]

In [39]:
amplicon_melt = amplicon_melt[~amplicon_melt.taxa.isin(remove_list)]

In [40]:
amplicon_reshaped = amplicon_melt.pivot_table(index = "sampleid", values="amplicon_abund", columns = "taxa") # pivot

In [41]:
amplicon_reshaped.columns.name = None

In [42]:
amplicon_reshaped = amplicon_reshaped.rename_axis(None, axis=1).reset_index()

In [43]:
amplicon_reshaped = amplicon_reshaped.fillna(0)  # fill in missing values with 0

In [44]:
# Drop any column whose name contains a digit. The explicit .copy() makes the
# result an independent frame, so columns added to it later do not trigger
# pandas' SettingWithCopyWarning.
amplicon_reshaped = amplicon_reshaped.loc[:, ~amplicon_reshaped.columns.str.contains(r'[0-9]')].copy()

In [45]:
amplicon_reshaped

Unnamed: 0,sampleid,Acetanaerobacterium,Achromobacter,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Aeromonas,Agathobacter,...,Sulfitobacter,Sutterella,Terrisporobacter,Turicibacter,Tyzzerella,Ulvibacter,Varibaculum,Veillonella,Victivallis,Virgulinella
0,C0047-7E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.022783,...,0.000000,0.018552,0.000000,0.000271,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,C0052-7E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.013665,...,0.000000,0.006443,0.005048,0.008125,0.000000,0.000000,0.000000,0.000657,0.0,0.0
2,C0053-8E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000678,0.000000,0.0,0.005422,...,0.000000,0.000000,0.000000,0.001355,0.030497,0.000000,0.000000,0.000387,0.0,0.0
3,C0058-4E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000273,0.0,0.011691,...,0.000000,0.000000,0.002065,0.006937,0.000000,0.000000,0.000000,0.002650,0.0,0.0
4,C0059-4E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000403,0.0,0.101117,...,0.000000,0.000000,0.004730,0.002290,0.000000,0.000000,0.000000,0.001359,0.0,0.0
5,C0106-6E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.107593,...,0.000000,0.004440,0.013568,0.005602,0.003693,0.000000,0.000000,0.000871,0.0,0.0
6,C0107-4E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000668,0.0,0.022135,...,0.000000,0.009809,0.000000,0.000000,0.000103,0.000000,0.000000,0.000000,0.0,0.0
7,C0123-3E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.007975,0.000000,0.0,0.011656,...,0.000000,0.000000,0.000000,0.000000,0.003067,0.000000,0.000000,0.025153,0.0,0.0
8,C0126-9E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.012654,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000148,0.0,0.0
9,C0127-7E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000181,0.0,0.012624,...,0.000302,0.000000,0.000000,0.000785,0.003141,0.000000,0.000000,0.000000,0.0,0.0


In [46]:
# index=False: the row index carries no information here and would otherwise be
# written as an unnamed first column, unlike the other CSV exports in this notebook.
amplicon_reshaped.to_csv('16S_abundance.csv', index=False)

### looking for the intersection of taxonomy

In [47]:
amplicon_taxa_list = set(amplicon_reshaped.columns.values)
mgx_taxa_list = set(mgx_reshaped.columns.values)

In [48]:
exclude = ["sampleid", "dev_stage", "AgeMonths"]

In [49]:
intersection = [taxa for taxa in amplicon_taxa_list.intersection(mgx_taxa_list) if taxa not in exclude]
union = [taxa for taxa in amplicon_taxa_list.union(mgx_taxa_list) if taxa not in exclude]

In [50]:
# Genera detected by one method only. Sorted because set iteration order is
# arbitrary (and can change between interpreter runs for strings), which would
# otherwise make the lists written to disk differ from run to run.
amp_only = sorted(taxa for taxa in amplicon_taxa_list.difference(intersection) if taxa not in exclude)
mgx_only = sorted(taxa for taxa in mgx_taxa_list.difference(intersection) if taxa not in exclude)

In [51]:
len(intersection) # genera found in both
len(amp_only) # genera found in amplicon only
len(mgx_only) # genera found in mgx only

105

63

34

In [52]:
with open("unique_amplicon.txt", "w") as output:
    output.write(str(amp_only))

1068

In [53]:
with open("unique_mgx.txt", "w") as output:
    output.write(str(mgx_only))

557

In [84]:
amp = pd.DataFrame({"amp": amp_only})
mgx = pd.DataFrame({"mgx" : mgx_only})

In [85]:
unique = pd.concat([amp, mgx], axis=1) 

In [88]:
# pd.DataFrame(unique).to_csv('unique_taxa.csv', index=False)

In [54]:
union.sort()

In [55]:
union_df = pd.DataFrame(union)

In [56]:
method_list = []
color_list = []

In [57]:
# Colour used for each detection category.
method_colors = {"both": "#9ebcda", "amp": "#fa9fb5", "mgx": "#7fcdbb"}

# Sets give constant-time membership tests (the source lists are scanned linearly).
shared_taxa = set(intersection)
amp_only_taxa = set(amp_only)
mgx_only_taxa = set(mgx_only)

# Start from empty lists so that re-running this cell cannot append a second
# copy of every entry and break the length match with `union`.
method_list = []
color_list = []
for taxa in union:
    if taxa in shared_taxa:
        method = "both"
    elif taxa in amp_only_taxa:
        method = "amp"
    elif taxa in mgx_only_taxa:
        method = "mgx"
    else:
        # Not in any category: nothing is recorded for this taxon.
        continue
    method_list.append(method)
    color_list.append(method_colors[method])

In [58]:
pd.DataFrame(union).to_csv('/Users/danielle/Documents/thesis/paper-phylogeny/all_taxa.txt', index=False, header=False)

In [78]:
pd.DataFrame(union).to_csv('all_taxa.txt', index=False, header=False)

In [59]:
# union_df["method"] = method_list
union_df["label"] = "label"
union_df["color"] = color_list

In [60]:
# union_df.to_csv('/Users/danielle/Documents/thesis/paper-phylogeny/all_taxa_methods.txt', 
#                 sep='\t', index = False, header=False)

### combine two dataframes, long form


In [61]:
mgx = mgx.fillna(0)  # fill in missing values with 0

In [62]:
# Outer merge keeps every (sampleid, taxa) pair present in either table; values
# missing on one side are then set to 0 by fillna(0).
merged_taxa = pd.merge(amplicon_melt, mgx, on = ["sampleid", "taxa"], how = "outer").fillna(0)
# Copy under a method-specific name; the original `abundance` column is kept as well.
merged_taxa["mgx_abund"] = merged_taxa["abundance"]

In [63]:
# Signed difference (amplicon minus metagenomic) and its magnitude.
signed_diff = merged_taxa["amplicon_abund"].sub(merged_taxa["mgx_abund"])
merged_taxa["abs_diff"] = signed_diff.abs()
merged_taxa["tot_diff"] = signed_diff

In [64]:
merged_taxa["AgeMonths"]= merged_taxa["sampleid"].map(agedict)
merged_taxa['dev_stage'] = merged_taxa['AgeMonths'].apply(assign_dev_stage)

In [65]:
# re-order columns

# Metadata columns first, then the remaining columns in their existing order.
cols_to_order = ['sampleid', 'AgeMonths', 'dev_stage']
new_columns = cols_to_order + [col for col in merged_taxa.columns if col not in cols_to_order]
merged_taxa = merged_taxa[new_columns]

In [66]:
merged_taxa.to_csv('paper_abund_df.csv', index=False)

## making giant dataframe of abundances

In [67]:
# Tag each wide table with the method that produced it. assign() returns a new
# frame rather than writing into one that pandas may regard as a slice of another
# frame, which is what raises SettingWithCopyWarning at this point.
amplicon_reshaped = amplicon_reshaped.assign(method="amp")
mgx_reshaped = mgx_reshaped.assign(method="mgx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [68]:
amplicon_reshaped["AgeMonths"]= amplicon_reshaped["sampleid"].map(agedict)
amplicon_reshaped['dev_stage'] = amplicon_reshaped['AgeMonths'].apply(assign_dev_stage)

In [69]:
amplicon_reshaped

Unnamed: 0,sampleid,Acetanaerobacterium,Achromobacter,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Aeromonas,Agathobacter,...,Turicibacter,Tyzzerella,Ulvibacter,Varibaculum,Veillonella,Victivallis,Virgulinella,method,AgeMonths,dev_stage
0,C0047-7E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.022783,...,0.000271,0.000000,0.000000,0.000000,0.000000,0.0,0.0,amp,97.233333,older than 30 months
1,C0052-7E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.013665,...,0.008125,0.000000,0.000000,0.000000,0.000657,0.0,0.0,amp,127.333333,older than 30 months
2,C0053-8E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000678,0.000000,0.0,0.005422,...,0.001355,0.030497,0.000000,0.000000,0.000387,0.0,0.0,amp,105.700000,older than 30 months
3,C0058-4E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000273,0.0,0.011691,...,0.006937,0.000000,0.000000,0.000000,0.002650,0.0,0.0,amp,137.766667,older than 30 months
4,C0059-4E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000403,0.0,0.101117,...,0.002290,0.000000,0.000000,0.000000,0.001359,0.0,0.0,amp,137.766667,older than 30 months
5,C0106-6E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.107593,...,0.005602,0.003693,0.000000,0.000000,0.000871,0.0,0.0,amp,115.466667,older than 30 months
6,C0107-4E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000668,0.0,0.022135,...,0.000000,0.000103,0.000000,0.000000,0.000000,0.0,0.0,amp,115.466667,older than 30 months
7,C0123-3E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.007975,0.000000,0.0,0.011656,...,0.000000,0.003067,0.000000,0.000000,0.025153,0.0,0.0,amp,93.066667,older than 30 months
8,C0126-9E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.012654,...,0.000000,0.000000,0.000000,0.000000,0.000148,0.0,0.0,amp,88.466667,older than 30 months
9,C0127-7E-1A,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000181,0.0,0.012624,...,0.000785,0.003141,0.000000,0.000000,0.000000,0.0,0.0,amp,125.333333,older than 30 months


In [70]:
# Stack both wide tables; sort=True orders the union of their columns.
concat_df = pd.concat([mgx_reshaped, amplicon_reshaped], sort=True).reset_index(drop = True)
# A genus absent from one method's table is a true zero, so fill only the taxa
# columns. A blanket fillna(0) would also turn an unknown age into 0 months and
# a missing dev_stage into 0.
metadata_cols = ['sampleid', 'AgeMonths', 'dev_stage', 'method']
taxa_cols = concat_df.columns.difference(metadata_cols)
concat_df[taxa_cols] = concat_df[taxa_cols].fillna(0)

In [71]:
# re-order columns

# Metadata columns first, then the taxa columns in their existing order.
cols_to_order = ['sampleid', 'AgeMonths', 'dev_stage', "method"]
new_columns = cols_to_order + [col for col in concat_df.columns if col not in cols_to_order]
concat_df = concat_df[new_columns]

In [72]:
concat_df.sample(15)

Unnamed: 0,sampleid,AgeMonths,dev_stage,method,Absiella,Acetanaerobacterium,Acetobacter,Achromobacter,Acidaminococcus,Acinetobacter,...,Sutterella,Terrisporobacter,Turicibacter,Turicimonas,Tyzzerella,Ulvibacter,Varibaculum,Veillonella,Victivallis,Virgulinella
22,C0451-2F-1A,102.933333,older than 30 months,mgx,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.6e-05,0.0,0.0,0.0,0.0,0.0,0.0
31,C0623-1E-1A,70.266667,older than 30 months,mgx,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.7e-05,0.0,0.0
196,C0754-2F-1A,2.333333,less than 15 months,amp,0.0,0.0,0.0,0.0,0.0,0.00035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067082,0.0,0.0
66,C0754-2F-1A,2.333333,less than 15 months,mgx,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038594,0.0,0.0
238,C1034-1F-1A,12.4,less than 15 months,amp,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009239,0.0,0.0,0.0,0.000415,0.0,0.0,0.000943,0.0,0.0
10,C0175-2F-1A,131.1,older than 30 months,mgx,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000453,0.000501,0.0,0.0,0.0,0.0,0.0,0.0
225,C0932-3F-1A,3.0,less than 15 months,amp,0.0,0.0,0.0,0.0,0.023551,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002474,0.001744,0.0,0.0
175,C0682-3F-1A,6.233333,less than 15 months,amp,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003342,0.0,0.0
30,C0603-4F-1A,11.433333,less than 15 months,mgx,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.00407,0.0,0.0,0.0,0.0,0.00064,0.0,0.0
119,C1099-3F-1A,2.866667,less than 15 months,mgx,0.0,0.0,0.0,0.0,0.0,0.000493,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018479,0.0,0.0


In [73]:
concat_df.to_csv('transposed_mgxamp_df.csv',index=False)