In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import csv
import janitor
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob

In [2]:
# read in metaphlan files data

path = '/Users/danielle/Documents/thesis/theoretical/profiles/*'                     
all_files = [name for name in glob.glob(path)]   

In [3]:
df_from_each_file = []

for f in all_files:
    df = pd.read_csv(f, sep = '\t') # read in dataframe 
    
    f = f.replace("_",'-')
    
    id_1 = f.split('profiles/')[1] # add sample id from filename 
    id_2 = id_1.split('-S')[0]
    df["sampleid"] = id_2
    
    df.rename(columns = {'#SampleID':'taxa', 'Metaphlan2_Analysis':'abundance'}, inplace = True) 
    
    df = df[df['taxa'].str.contains("\|s__")] # only keep genera
    df = df[~df['taxa'].str.contains("\|t__")] 
    df["taxa"] = df['taxa'].str.split("\|g__").str[-1]
    df["taxa"] = df['taxa'].str.split("\|s__").str[0]
    
    # remove unclassified
    df = df[~df.taxa.str.contains("_unclassified")]
    df = df[~df.taxa.str.contains("_noname")]
    
    # convert to relative abundance
    df["abundance"] = df["abundance"]/100.0
    
    df_from_each_file.append(df)

In [4]:
f

'/Users/danielle/Documents/thesis/theoretical/profiles/M1312-1F-1A-S39-profile.tsv'

In [5]:
original = pd.concat(df_from_each_file, ignore_index=True) # concat all dataframes together

In [6]:
original.head()

Unnamed: 0,taxa,abundance,sampleid
0,Bacteroides,0.146484,C0005-3F-1A
1,Bacteroides,0.084487,C0005-3F-1A
2,Blautia,0.072325,C0005-3F-1A
3,Eubacterium,0.068314,C0005-3F-1A
4,Bacteroides,0.067606,C0005-3F-1A


In [7]:
original_reshaped = original.pivot_table(index = "sampleid", values="abundance", columns = "taxa") # pivot
original_reshaped.columns.name = None
original_reshaped = original_reshaped.rename_axis(None, axis=1).reset_index()
original_reshaped = original_reshaped.fillna(0)  # fill in missing values with 0

In [8]:
original_reshaped.head(15)

Unnamed: 0,sampleid,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Aeromonas,Aggregatibacter,Akkermansia,Alistipes,...,Succinatimonas,Sutterella,T4likevirus,Tannerella,Treponema,Turicibacter,Ureaplasma,Varibaculum,Veillonella,Weissella
0,C0005-3F-1A,0.0,0.0,0.0,0.0,0.000395,0.0,0.0,0.0,0.011947,...,0.0,0.007346,0.0,0.0,0.0,0.000521,0.0,0.0,0.0,0.0
1,C0016-3F-1A,0.0,0.0,0.0,0.0,0.000187,0.0,0.0,0.0025,0.00995,...,0.0,0.008949,0.0,0.0,0.0,0.0,0.0,0.0,0.00043,0.0
2,C0016-4F-1A,0.0,0.0,0.0,0.0,0.00041,0.0,0.0,0.006435,0.013809,...,0.0,0.020971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C0017-2F-1A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024144,0.015922,...,0.0,0.019241,0.0,0.0,0.0,0.0,0.0,0.0,0.000475,0.0
4,C0017-3F-1A,0.0,0.0,0.0,0.0,0.001333,0.0,0.0,0.028821,0.011807,...,0.0,0.017428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,C0032-9F-1A,0.0,0.0,0.0,0.0,0.002953,0.0,0.0,0.0,0.017328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,C0043-7F-1A,0.0,0.0,0.0,0.0,0.000359,0.0,0.0,0.003434,0.009745,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000952,0.0
7,C0043-8F-1A,0.0,0.0,0.0,0.0,0.003696,0.0,0.0,0.001576,0.035491,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000505,0.0
8,C0047-7F-1A,0.0,0.0,0.0,0.0,0.000337,0.0,0.0,0.001211,0.00708,...,0.0,0.010724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,C0052-5F-1A,0.0,0.0,0.0,0.0,0.00045,0.0,0.0,7.6e-05,0.006235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00021,0.0


In [9]:
# read in meta data
meta = pd.read_csv("~/Documents/thesis/theoretical/sorted_babies.csv", index_col = 0)

In [10]:
meta.head()

Unnamed: 0,sample,subject,timepoint,correctedAgeDays,shannon,reads,AgeMonths,dev_stage,color
1,C0005_3F_1A,5,3,4505.0,3.379592,6193602.0,150.166667,older than 30 months,yellow
2,C0016_3F_1A,16,3,2987.0,3.162439,7626286.0,99.566667,older than 30 months,yellow
3,C0016_4F_1A,16,4,3398.0,3.415994,5497174.0,113.266667,older than 30 months,yellow
4,C0017_2F_1A,17,2,3836.0,3.102341,6240254.0,127.866667,older than 30 months,yellow
5,C0017_3F_1A,17,3,4247.0,3.460197,7764432.0,141.566667,older than 30 months,yellow


In [11]:
meta["sample"] = meta["sample"].str.replace("_",'-')
meta.rename(columns = {'sample':'sampleid', "reads":"read_depth"}, inplace = True) 

In [12]:
meta.head()

Unnamed: 0,sampleid,subject,timepoint,correctedAgeDays,shannon,read_depth,AgeMonths,dev_stage,color
1,C0005-3F-1A,5,3,4505.0,3.379592,6193602.0,150.166667,older than 30 months,yellow
2,C0016-3F-1A,16,3,2987.0,3.162439,7626286.0,99.566667,older than 30 months,yellow
3,C0016-4F-1A,16,4,3398.0,3.415994,5497174.0,113.266667,older than 30 months,yellow
4,C0017-2F-1A,17,2,3836.0,3.102341,6240254.0,127.866667,older than 30 months,yellow
5,C0017-3F-1A,17,3,4247.0,3.460197,7764432.0,141.566667,older than 30 months,yellow


In [13]:
# make age dictionary
agedict = {str(s): {} for s in meta["sampleid"]}
for index, row in meta.iterrows():
    age_months = row["AgeMonths"]
    agedict[row["sampleid"]]= age_months

In [14]:
# read_depth dictionary
readdict = {str(s): {} for s in meta["sampleid"]}
for index, row in meta.iterrows():
    reads = row["read_depth"]
    readdict[row["sampleid"]]= reads

In [15]:
# dev_stage dictionary
dev_stage_dict = {str(s): {} for s in meta["dev_stage"]}
for index, row in meta.iterrows():
    stage = row["dev_stage"]
    dev_stage_dict[row["sampleid"]] = stage

In [16]:
original_reshaped["AgeMonths"]= original_reshaped["sampleid"].map(agedict)
original_reshaped["read_depth"]= original_reshaped["sampleid"].map(readdict)
original_reshaped["dev_stage"]= original_reshaped["sampleid"].map(dev_stage_dict)

In [17]:
original_reshaped.head()

Unnamed: 0,sampleid,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Aeromonas,Aggregatibacter,Akkermansia,Alistipes,...,Tannerella,Treponema,Turicibacter,Ureaplasma,Varibaculum,Veillonella,Weissella,AgeMonths,read_depth,dev_stage
0,C0005-3F-1A,0.0,0.0,0.0,0.0,0.000395,0.0,0.0,0.0,0.011947,...,0.0,0.0,0.000521,0.0,0.0,0.0,0.0,150.166667,6193602.0,older than 30 months
1,C0016-3F-1A,0.0,0.0,0.0,0.0,0.000187,0.0,0.0,0.0025,0.00995,...,0.0,0.0,0.0,0.0,0.0,0.00043,0.0,99.566667,7626286.0,older than 30 months
2,C0016-4F-1A,0.0,0.0,0.0,0.0,0.00041,0.0,0.0,0.006435,0.013809,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.266667,5497174.0,older than 30 months
3,C0017-2F-1A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024144,0.015922,...,0.0,0.0,0.0,0.0,0.0,0.000475,0.0,127.866667,6240254.0,older than 30 months
4,C0017-3F-1A,0.0,0.0,0.0,0.0,0.001333,0.0,0.0,0.028821,0.011807,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,141.566667,7764432.0,older than 30 months


In [20]:
cols_to_order = ['sampleid', 'read_depth', 'AgeMonths', 'dev_stage']
new_columns = cols_to_order + (original_reshaped.columns.drop(cols_to_order).tolist())
original_reshaped = original_reshaped[new_columns]

In [23]:
# remove mothers
original_reshaped = original_reshaped[~original_reshaped.sampleid.str.startswith("M")]

In [24]:
original_reshaped.sample(15)

Unnamed: 0,sampleid,read_depth,AgeMonths,dev_stage,Acidaminococcus,Acinetobacter,Actinobacillus,Actinomyces,Adlercreutzia,Aeromonas,...,Succinatimonas,Sutterella,T4likevirus,Tannerella,Treponema,Turicibacter,Ureaplasma,Varibaculum,Veillonella,Weissella
345,C1219-3F-1A,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033725,0.0
54,C0194-4F-1A,7805224.0,97.9,older than 30 months,0.0,0.0,0.0,0.0,0.000389,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001026,0.0
151,C0547-2F-1A,8658314.0,78.4,older than 30 months,0.0,0.0,0.0,0.0,0.000455,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000352,0.0
326,C0936-1F-1A,,19.866667,15 to 30 months,0.0,0.0,0.0,0.0,0.000461,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000221,0.0
187,C0603-1F-1A,5907614.0,1.133333,less than 15 months,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001111,0.02723,0.0
205,C0635-1F-1A,9139286.0,46.2,older than 30 months,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01767,0.0,0.0,0.0,0.0,0.0,0.0,0.000265,0.0
240,C0692-3F-1A,7265908.0,2.533333,less than 15 months,0.003803,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202,C0633-2F-1A,12313318.0,19.8,15 to 30 months,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000417,0.0
132,C0516-3F-1A,,,,0.0,0.0,0.0,0.0,0.00048,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000178,0.0
123,C0507-2F-1A,7127138.0,96.333333,older than 30 months,0.0,0.0,0.0,0.0,0.000269,0.0,...,0.0,0.004237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
original_reshaped.to_csv('theoretical_babies_df.csv')