In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import os
import glob

In [2]:
# read in original data

path = '/Users/danielle/Documents/thesis/paper-abundance-tables/profiles/*'                     
all_files = [name for name in glob.glob(path)]  

In [3]:
f = all_files[0]

In [4]:
f

'/Users/danielle/Documents/thesis/paper-abundance-tables/profiles/C0047-7E-1A_S66_profile.tsv'

In [5]:
df = pd.read_csv(f, sep = '\t', skiprows=[0,1,2], usecols = ['#clade_name','relative_abundance'])

id_1 = f.split('profiles/')[1] # add sample id from filename 
id_2 = id_1.split('_S')[0]
df["sampleid"] = id_2

df.rename(columns = {'#clade_name':'taxa', 'relative_abundance':'abundance'}, inplace = True) 

df = df[df['taxa'].str.contains("\|s__")] # only keep genera
df = df[~df['taxa'].str.contains("\|t__")] 
df["taxa"] = df['taxa'].str.split("\|g__").str[-1]
df["taxa"] = df['taxa'].str.split("\|s__").str[0]

# remove unclassified
df = df[~df.taxa.str.contains("_unclassified")]
df = df[~df.taxa.str.contains("_noname")]

In [6]:
df.head()

Unnamed: 0,taxa,abundance,sampleid
65,Bacteroides,41.38195,C0047-7E-1A
66,Bacteroides,6.56324,C0047-7E-1A
67,Faecalibacterium,5.03872,C0047-7E-1A
69,Megamonas,4.64487,C0047-7E-1A
70,Bacteroides,4.27382,C0047-7E-1A


In [7]:
df.columns.values

array(['taxa', 'abundance', 'sampleid'], dtype=object)

In [8]:
df = df.groupby(['taxa', 'sampleid'])['abundance'].sum().reset_index()

In [9]:
df["abundance"] = df["abundance"]/100.0

In [10]:
df.head()

Unnamed: 0,taxa,sampleid,abundance
0,Agathobaculum,C0047-7E-1A,0.001349
1,Alistipes,C0047-7E-1A,0.036088
2,Anaerostipes,C0047-7E-1A,0.010047
3,Asaccharobacter,C0047-7E-1A,1.2e-05
4,Bacteroides,C0047-7E-1A,0.65244


In [11]:
df_from_each_file = []
for f in all_files:
    df = pd.read_csv(f, sep = '\t', skiprows=[0,1,2], usecols = ['#clade_name','relative_abundance']) # read in dataframe 
    
    id_1 = f.split('profiles/')[1] # add sample id from filename 
    id_2 = id_1.split('_S')[0]
    df["sampleid"] = id_2
    
    df.rename(columns = {'#clade_name':'taxa', 'relative_abundance':'abundance'}, inplace = True) 
    
    df = df[df['taxa'].str.contains("\|s__")] # only keep genera
    df = df[~df['taxa'].str.contains("\|t__")] 
    df["taxa"] = df['taxa'].str.split("\|g__").str[-1]
    df["taxa"] = df['taxa'].str.split("\|s__").str[0]
    
    # remove unclassified
    df = df[~df.taxa.str.contains("_unclassified")]
    df = df[~df.taxa.str.contains("_noname")]
    
    # combine together taxa of the same genera
    df = df.groupby(['taxa', 'sampleid'])['abundance'].sum().reset_index()    
    
    # convert to relative abundance
    df["abundance"] = df["abundance"]/100.0
    
    df_from_each_file.append(df)

In [12]:
original = pd.concat(df_from_each_file, ignore_index=True) # concat all dataframes together

In [13]:
original.head()


Unnamed: 0,taxa,sampleid,abundance
0,Agathobaculum,C0047-7E-1A,0.001349
1,Alistipes,C0047-7E-1A,0.036088
2,Anaerostipes,C0047-7E-1A,0.010047
3,Asaccharobacter,C0047-7E-1A,1.2e-05
4,Bacteroides,C0047-7E-1A,0.65244


In [14]:
original_reshaped = original.pivot_table(index = "sampleid", values="abundance", columns = "taxa") # pivot

In [15]:
original_reshaped.columns.name = None

In [16]:
original_reshaped = original_reshaped.rename_axis(None, axis=1).reset_index()

In [17]:
original_reshaped = original_reshaped.fillna(0)  # fill in missing values with 0

In [18]:
original_reshaped

Unnamed: 0,sampleid,Absiella,Acetobacter,Acidaminococcus,Acinetobacter,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,Aeromonas,...,Stenotrophomonas,Streptococcus,Sutterella,Terrisporobacter,Turicibacter,Turicimonas,Tyzzerella,Varibaculum,Veillonella,Victivallis
0,C0047-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000156,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,C0052-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.001675,0.000000,0.0,0.002813,0.000000,0.000000,0.000000,0.000000,0.000000
2,C0053-8E-1A,0.0,0.0,0.0,0.000000,0.001754,0.000000,0.000623,0.000000,0.0,...,0.0,0.008166,0.000000,0.0,0.000670,0.000062,0.001062,0.000000,0.000000,0.000000
3,C0058-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.003549,0.000000,0.0,0.002141,0.000000,0.000000,0.000000,0.001234,0.000000
4,C0059-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.005600,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,C0106-6E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000396,0.000000,0.0,...,0.0,0.035666,0.000000,0.0,0.002481,0.000000,0.000029,0.000000,0.000235,0.000000
6,C0107-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000386,0.000000,0.0,...,0.0,0.008849,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,C0123-3E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000380,0.000000,0.0,...,0.0,0.002299,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,C0126-9E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000268,0.000000,0.0,...,0.0,0.010257,0.000000,0.0,0.000000,0.000103,0.000000,0.000000,0.000000,0.000000
9,C0127-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000288,0.000000,0.0,...,0.0,0.006092,0.000000,0.0,0.000000,0.000218,0.000088,0.000000,0.000000,0.000000


In [19]:
# adding age metadata
age = pd.read_csv("/Users/danielle/Documents/thesis/paper-abundance-tables/metadata/joined.csv")

In [20]:
age.head()

Unnamed: 0,sample,subject,timepoint,batch,childGender,correctedAgeDays,mother_HHS,childBMI,hasScan,mcDespotProcessed,...,rt_Pallidum,rt_Hippocampus,rt_Amygdala,rt_Insula,rt_Operculum,hires_total,cerebellar,neocortical,limbic,subcortical
0,C0052_5F_1A,52,5,1,Male,2824.0,6.0,16.728348,1.0,1.0,...,1315.875122,2491.843506,1107.90625,536.9375,570.96875,264479.536024,89286.65625,127223.941542,34246.781738,47968.938232
1,C0126_8F_1A,126,8,1,Male,2261.0,7.0,,1.0,0.0,...,1486.03125,2737.625,1210.000122,680.625,616.34375,320037.446868,100407.316406,163357.565277,41079.502685,56272.565185
2,C0126_8F_1B,126,8,1,Male,2261.0,7.0,,1.0,0.0,...,1486.03125,2737.625,1210.000122,680.625,616.34375,320037.446868,100407.316406,163357.565277,41079.502685,56272.565185
3,C0127_6F_1B,127,6,1,Female,3367.0,7.0,,1.0,1.0,...,1338.5625,2230.9375,1089.0,654.156311,567.1875,270177.880217,86435.597656,135330.938994,34012.343781,48411.343567
4,C0286_5F_1A,286,5,1,Male,1926.0,6.0,,1.0,0.0,...,1603.25,2457.8125,1070.09375,616.34375,548.28125,284418.066527,88447.21875,143566.50244,36700.813782,52404.345337


In [21]:
# remove shannon column, change characters in sample names 

age["sample"] = age["sample"].str.replace("_",'-')

In [22]:
age["AgeMonths"] = age["correctedAgeDays"]/30.0
age.rename(columns = {'sample':'sampleid'}, inplace = True) 

In [23]:
# make age dictionary
agedict = {str(s): {} for s in age["sampleid"]}
for index, row in age.iterrows():
    age_months = row["AgeMonths"]
    agedict[row["sampleid"]]= age_months

In [24]:
original_reshaped["AgeMonths"]= original_reshaped["sampleid"].map(agedict)

In [25]:
def assign_dev_stage(x):
    if x<15:
        return "less than 15 months"
    elif 15 <=x<= 30:
        return "between 15 and 30 months"
    elif x>30:
        return "older than 30 months"

In [26]:
original_reshaped['dev_stage'] = original_reshaped['AgeMonths'].apply(assign_dev_stage)

In [27]:
original_reshaped

Unnamed: 0,sampleid,Absiella,Acetobacter,Acidaminococcus,Acinetobacter,Actinomyces,Actinotignum,Adlercreutzia,Aeriscardovia,Aeromonas,...,Sutterella,Terrisporobacter,Turicibacter,Turicimonas,Tyzzerella,Varibaculum,Veillonella,Victivallis,AgeMonths,dev_stage
0,C0047-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000156,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,97.233333,older than 30 months
1,C0052-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.002813,0.000000,0.000000,0.000000,0.000000,0.000000,127.333333,older than 30 months
2,C0053-8E-1A,0.0,0.0,0.0,0.000000,0.001754,0.000000,0.000623,0.000000,0.0,...,0.000000,0.0,0.000670,0.000062,0.001062,0.000000,0.000000,0.000000,105.700000,older than 30 months
3,C0058-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.002141,0.000000,0.000000,0.000000,0.001234,0.000000,137.766667,older than 30 months
4,C0059-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,137.766667,older than 30 months
5,C0106-6E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000396,0.000000,0.0,...,0.000000,0.0,0.002481,0.000000,0.000029,0.000000,0.000235,0.000000,115.466667,older than 30 months
6,C0107-4E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000386,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,115.466667,older than 30 months
7,C0123-3E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000380,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,93.066667,older than 30 months
8,C0126-9E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000268,0.000000,0.0,...,0.000000,0.0,0.000000,0.000103,0.000000,0.000000,0.000000,0.000000,88.466667,older than 30 months
9,C0127-7E-1A,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000288,0.000000,0.0,...,0.000000,0.0,0.000000,0.000218,0.000088,0.000000,0.000000,0.000000,125.333333,older than 30 months


In [28]:
# read in 16S data

In [29]:
# re-order columns

cols_to_order = ['sampleid','AgeMonths', 'dev_stage']
new_columns = cols_to_order + (concat_df.columns.drop(cols_to_order).tolist())
concat_df = concat_df[new_columns]

NameError: name 'concat_df' is not defined

In [None]:
original_reshaped.to_csv('paper_abund_df.csv')